From a7720235ba33cdc5eb7bf0f23bcc822d6d3c44ea Mon Sep 17 00:00:00 2001
From: Glenn Proctor <gp1@sanger.ac.uk>
Date: Wed, 27 May 2009 12:49:00 +0000
Subject: [PATCH] Use the translations of the _canonical_ transcripts of the
 source and target genes as the source of GO terms to project, rather than
 simply the longest translation in each, which was a bit arbitrary and didn't
 always work.

Also make the print option for GO term projection a bit more verbose - aids debugging.
---
 .../xref_projection/project_display_xrefs.pl  | 33 ++++++++-----------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/misc-scripts/xref_projection/project_display_xrefs.pl b/misc-scripts/xref_projection/project_display_xrefs.pl
index 40b9aa847e..4e05ad3c98 100644
--- a/misc-scripts/xref_projection/project_display_xrefs.pl
+++ b/misc-scripts/xref_projection/project_display_xrefs.pl
@@ -346,12 +346,11 @@ sub project_go_terms {
   my ($to_ga, $to_dbea, $ma, $from_gene, $to_gene) = @_;
 
   # GO xrefs are linked to translations, not genes
-  # For historical reasons we only project GO terms between the longest translations of each gene
-  # TODO - consider projecting *all* GO terms from *all* source translations to one translation of target?
-  # TODO - getting the translation's length seem to involve lots of database accesses - some way to do
-  # this quicker? Via SQL?
-  my $from_translation = get_longest_translation($from_gene);
-  my $to_translation   = get_longest_translation($to_gene);
+  # Project GO terms between the translations of the canonical transcripts of each gene
+  my $from_translation = get_canonical_translation($from_gene);
+  my $to_translation   = get_canonical_translation($to_gene);
+
+  return if (!$from_translation || !$to_translation);
 
   my $from_latin_species = ucfirst(Bio::EnsEMBL::Registry->get_alias($from_species));
 
@@ -384,7 +383,7 @@ sub project_go_terms {
 
     $to_translation->add_DBEntry($dbEntry);
 
-    print $to_translation->stable_id() . " --> " . $dbEntry->display_id() . "\n" if ($print);
+    print $from_gene->stable_id() . " " . $from_translation->stable_id() . " " .  $dbEntry->display_id() . " --> " . $to_gene->stable_id() . " " . $to_translation->stable_id() . "\n" if ($print);
 
     $to_dbea->store($dbEntry, $to_translation->dbID(), 'Translation', 1) if (!$print);
 
@@ -735,26 +734,20 @@ sub homology_type_allowed {
 }
 
 # ----------------------------------------------------------------------
+# Return the translation of the gene's canonical transcript, or undef (after a warning) if the gene has no canonical transcript
 
-sub get_longest_translation {
+sub get_canonical_translation {
 
   my $gene = shift;
 
-  my $longest_translation;
-  my $max_length = -1;
-
-  foreach my $transcript (@{$gene->get_all_Transcripts()}) {
-
-    my $translation = $transcript->translation();
-    if ($translation && $translation->length() > $max_length) {
-      $longest_translation = $translation;
-    }
+  my $canonical_transcript = $gene->canonical_transcript();
 
+  if (!$canonical_transcript) {
+    warn("Can't get canonical transcript for " . $gene->stable_id() . ", skipping this homology");
+    return;
   }
 
-  warn("Can't find longest translation for " . $gene->stable_id()) if (!$longest_translation);
-
-  return $longest_translation;
+  return $canonical_transcript->translation();
 
 }
 
-- 
GitLab