moved routines related to storing of final genes into Gene package, added...

moved routines related to storing of final genes into Gene package, added splitting of v. long introns, clustering of transcripts into genes, removal of duplicate transcripts and brinnging across xrefs

moved routines related to storing of final genes into Gene package, added...
moved routines related to storing of final genes into Gene package, added splitting of v. long introns, clustering of transcripts into genes, removal of duplicate transcripts and brinnging across xrefs
d49074d9 · Graham McVicker · d49c7a3c · d49074d9 · d49074d9 · d49074d9
Commit d49074d9 authored 21 years ago by Graham McVicker
--- a/misc-scripts/chimp/Gene.pm
+++ b/misc-scripts/chimp/Gene.pm
+use strict;
+use warnings;
+
+
+
+#
+# Utility methods for the storing of chimp genes
+#
+
+package Gene;
+
+
+use Bio::EnsEMBL::Utils::Exception qw(info throw);
+
+
+use constant NEAR => 1.5e6;
+
+our %KEEP_XREF = ('SWISSPROT' => 1,
+                  'SPTREMBL'  => 1,
+                  'HUGO'      => 1);
+
+###############################################################################
+# store gene
+#
+# Builds Ensembl genes from the generated chimp transcripts and stores them
+# in the database.
+#
+###############################################################################
+
+sub store_gene {
+  my $db = shift;
+  my $hum_gene = shift; # human gene
+  my $ctranscripts = shift; # chimp transcripts
+
+  my $MIN_AA_LEN = 15;
+  my $MIN_NT_LEN = 600;
+
+  my $analysis = $db->get_AnalysisAdaptor->fetch_by_logic_name('ensembl');
+
+  # Look at the translations and convert any transcripts with stop codons
+  # into pseudogenes
+  foreach my $ct (@$ctranscripts) {
+    if($ct->translation && $ct->translate->seq() =~ /\*/) {
+      $ct->translation(undef);
+    }
+  }
+
+  # transfer xrefs from human transcripts/translations
+  transfer_xrefs($hum_gene, $ctranscripts);
+
+  # compact duplicate transcripts into the same transcripts
+  $ctranscripts = compact_transcripts($ctranscripts);
+
+  # group all close together transcripts on the same strand into clusters
+  my $clusters = cluster_transcripts($ctranscripts);
+
+  my $gene_adaptor = $db->get_GeneAdaptor();
+
+  foreach my $cluster (@$clusters) {
+
+    # keep genes only if there is a minimum amount of nucleotide
+    # OR amino acid sequence in transcripts in the same region
+    if($cluster->{'nt_len'} < $MIN_NT_LEN &&
+       $cluster->{'aa_len'} < $MIN_AA_LEN) {
+      next;
+    }
+
+    # one gene for each cluster
+    my $cgene = Bio::EnsEMBL::Gene->new();
+
+    generate_stable_id($cgene);
+
+    # rename transcripts and add to gene
+    foreach my $ctrans (@{$cluster->{'transcripts'}}) {
+      generate_stable_id($ctrans);
+
+      # rename translation
+      if($ctrans->translation) {
+        generate_stable_id($ctrans->translation);
+      }
+
+      $cgene->add_Transcript($ctrans);
+    }
+
+    # rename all of the exons
+    # but watch out because duplicate exons will be merged and we do not
+    # want to generate multiple names
+    my %ex_stable_ids;
+    foreach my $ex (@{$cgene->get_all_Exons()}) {
+      if($ex_stable_ids{$ex->hashkey()}) {
+        $ex->stable_id($ex_stable_ids{$ex->hashkey()});
+      } else {
+        generate_stable_id($ex);
+        $ex_stable_ids{$ex->hashkey()} = $ex->stable_id();
+      }
+    }
+
+
+    foreach my $gx (@{$hum_gene->get_all_DBEntries}) {
+      $cgene->add_DBEntry($gx) if($KEEP_XREF{uc($gx->dbname())});
+    }
+
+    if($hum_gene->display_xref &&
+       $KEEP_XREF{uc($hum_gene->display_xref->dbname)}){
+      $cgene->display_xref($hum_gene->display_xref);
+    }
+
+    # set the analysis on the gene object
+    $cgene->analysis($analysis);
+
+    my $name = $cgene->stable_id();
+
+    $name .= '/'.$cgene->display_xref->display_id() if($cgene->display_xref());
+
+    $cgene->type('ensembl');
+
+    # store the bloody thing
+    print STDERR "Storing gene: $name\n";
+    $gene_adaptor->store($cgene);
+  }
+
+  return;
+}
+
+
+
+###############################################################################
+# generate_stable_id
+#
+# Generates a stable_id for a gene, transcript, translation or exon and sets
+# it on the object.
+#
+###############################################################################
+
+
+my ($TRANSCRIPT_NUM, $GENE_NUM, $EXON_NUM, $TRANSLATION_NUM);
+
+
+sub generate_stable_id {
+  my $object = shift;
+
+  my $SPECIES_PREFIX = 'PTR';
+  my $PAD            = 18;
+
+  my $type_prefix;
+  my $num;
+
+  if($object->isa('Bio::EnsEMBL::Exon')) {
+    $type_prefix = 'E';
+    $EXON_NUM       ||= 0;
+    $num = ++$EXON_NUM;
+  } elsif($object->isa('Bio::EnsEMBL::Transcript')) {
+    $type_prefix = 'T';
+    $TRANSCRIPT_NUM ||= 0;
+    $num = ++$TRANSCRIPT_NUM;
+  } elsif($object->isa('Bio::EnsEMBL::Gene')) {
+    $type_prefix = 'G';
+    $GENE_NUM       ||= 0;
+    $num = ++$GENE_NUM;
+  } elsif($object->isa('Bio::EnsEMBL::Translation')) {
+    $type_prefix = 'P';
+    $TRANSLATION_NUM ||= 0;
+    $num = ++$TRANSLATION_NUM;
+  } else {
+    throw('Unknown object type '.ref($object).'. Cannot create stable_id.');
+  }
+
+  my $prefix = "ENS${SPECIES_PREFIX}${type_prefix}";
+
+  my $pad = $PAD - length($prefix) - length($num);
+
+  $object->version(1);
+  $object->stable_id($prefix . ('0'x$pad) . $num);
+}
+
+
+
+
+
+###############################################################################
+# compact_transcripts
+#
+# Given a set of transcripts this function removes duplicate transcripts and
+# returns the unique set
+#
+###############################################################################
+
+sub compact_transcripts {
+  my $transcripts = shift;
+
+  my %unique_hash;
+  my @unique_list;
+
+  info("Compacting transcripts");
+
+  foreach my $transcript (@$transcripts) {
+    my $hashkey = '';
+
+    foreach my $exon (@{$transcript->get_all_Exons}) {
+      $hashkey .= '('.$exon->hashkey.')';
+    }
+
+    $unique_hash{$hashkey} ||= [];
+    push @{$unique_hash{$hashkey}}, $transcript;
+  }
+
+  # merge xrefs from duplicated transcripts
+
+  foreach my $key (keys %unique_hash) {
+    # choose one of the duplicates arbitrarily
+    my $duplicates = $unique_hash{$key};
+    my $transcript = pop(@$duplicates);
+
+    # merge all of the xrefs and add them to the one transcript that will
+    # be kept
+    merge_xrefs($transcript, $duplicates);
+    push @unique_list, $transcript;
+  }
+
+  return \@unique_list;
+}
+
+
+
+sub merge_xrefs {
+  my $kept_transcript = shift;
+  my $duplicates      = shift;
+
+  return if(!@$duplicates);
+
+  info('Merging xrefs from duplicate transcripts');
+
+  # construct a set of xrefs the transcript to keep already has
+
+  my %existing_tl_xrefs;
+  my %existing_tr_xrefs;
+  foreach my $xref (@{$kept_transcript->get_all_DBEntries()}) {
+    $existing_tr_xrefs{$xref->dbname().':'.$xref->primary_id()} = 1;
+  }
+
+  if($kept_transcript->translation()) {
+    foreach my $xref (@{$kept_transcript->translation->get_all_DBEntries()}) {
+      $existing_tl_xrefs{$xref->dbname().':'.$xref->primary_id()} = 1;
+    }
+  }
+
+  # collect a list of additional xrefs which should be kept
+
+  my %tl_xrefs;
+  my %tr_xrefs;
+  foreach my $dup (@$duplicates) {
+    foreach my $xref (@{$dup->get_all_DBEntries()}) {
+      my $key = $xref->dbname().':'.$xref->primary_id();
+      $tr_xrefs{$key} = $xref if(!$existing_tr_xrefs{$key});
+    }
+
+    if($dup->translation()) {
+      foreach my $xref (@{$dup->translation->get_all_DBEntries()}) {
+        my $key = $xref->dbname().':'.$xref->primary_id();
+        $tl_xrefs{$key} = $xref if(!$existing_tl_xrefs{$key});
+      }
+    }
+  }
+
+  # add any new xrefs which were found to the kept translation
+
+  foreach my $xref (values %tr_xrefs) {
+    $kept_transcript->add_DBEntry($xref);
+  }
+
+  my @new_tl_xrefs = values(%tl_xrefs);
+  my $tl = $kept_transcript->translation();
+
+  if(@new_tl_xrefs && !$tl) {
+    throw("Some duplicate transcripts have translations, and others do not?");
+  }
+
+  foreach my $xref (@new_tl_xrefs) {
+    $tl->add_DBEntry($xref);
+  }
+
+  return;
+}
+
+
+###############################################################################
+# cluster_transcripts
+#
+# Given a set of transcripts this function will cluster them into groups
+# which are near to each other. Note this may be more efficient to use
+# a sorted list, but for the small size of the clusters it should not matter.
+#
+##############################################################################
+
+sub cluster_transcripts {
+  my $transcripts = shift;
+
+  my @clusters;
+
+  info("Clustering transcripts");
+
+  foreach my $tr (@$transcripts) {
+    my $cl = undef;
+
+    foreach my $c (@clusters) {
+      if(is_near($tr->start(),$tr->end(),$tr->strand(),
+                 $c->{'start'}, $c->{'end'}, $c->{'strand'})) {
+
+        $cl = $c;
+      }
+    }
+
+    if($cl) {
+      push @{$cl->{'transcripts'}}, $tr;
+
+      $cl->{'end'} =
+        ( $tr->end > $cl->{'end'} )     ? $tr->{'end'}   : $cl->{'end'};
+      $cl->{'start'} =
+        ( $tr->start < $cl->{'start'} ) ? $tr->{'start'} : $cl->{'start'};
+
+    } else {
+      $cl = {'start'  => $tr->start(),
+             'end'    => $tr->end(),
+             'strand' => $tr->strand(),
+             'transcripts' => [$tr]};
+
+      push @clusters, $cl;
+    }
+  }
+
+  # now cluster clusters
+
+  for(my $i = 0; $i < @clusters; $i++) {
+    for(my $j = $i+1; $j < @clusters; $j++) {
+      my $c1 = $clusters[$i];
+      my $c2 = $clusters[$j];
+      if(is_near($c1->{'start'}, $c1->{'end'}, $c1->{'strand'},
+                 $c2->{'start'}, $c2->{'end'}, $c2->{'strand'})) {
+        # merge clusters and take one of them out of the list
+        splice(@clusters, $j, 1);
+
+        $c1->{'start'} =
+          ($c1->{'start'} < $c2->{'start'}) ? $c1->{'start'} : $c2->{'start'};
+
+        $c1->{'end'} =
+          ($c1->{'end'} > $c2->{'end'}) ? $c1->{'end'} : $c2->{'end'};
+
+        push @{$c1->{'transcripts'}}, @{$c2->{'transcripts'}};
+
+        # start over again
+        $i = -1;
+        $j = -1;
+      }
+    }
+  }
+
+  # sum the nucleotide length and amino acid lengths of each of the
+  # transcripts in a given cluster
+  foreach my $cl (@clusters) {
+    $cl->{'nt_len'} = 0;
+    $cl->{'aa_len'} = 0;
+
+    foreach my $tr (@{$cl->{'transcripts'}}) {
+      $cl->{'nt_len'} += length($tr->spliced_seq());
+      if($tr->translation) {
+        $cl->{'aa_len'} += length($tr->translate->seq());
+      }
+    }
+  }
+
+
+  return \@clusters;
+}
+
+
+
+sub is_near {
+  my ($start1,$end1,$strand1, $start2, $end2, $strand2) = @_;
+
+  # cannot be clustered if not on same strand
+  if($strand1 != $strand2) {
+    return 0;
+  }
+
+  # check if the regions overlap
+  if($end1 >= $start2 && $start1 <= $end2) {
+    return 1;
+  }
+
+  if($start1 > $end2) {
+    return (($start1 - $end2) < NEAR) ? 1 : 0;
+  }
+
+  return (($start2 - $end1) < NEAR) ? 1 : 0;
+}
+
+
+
+
+
+
+sub transfer_xrefs {
+  my $hum_gene = shift;
+  my $chimp_transcripts = shift;
+
+  my %chimp_transcripts;
+  my %chimp_translations;
+
+  foreach my $tr (@$chimp_transcripts) {
+    $chimp_transcripts{$tr->stable_id()} ||= [];
+    push @{$chimp_transcripts{$tr->stable_id()}}, $tr;
+
+    my $tl = $tr->translation();
+
+    if($tl) {
+      $chimp_translations{$tl->stable_id()} ||= [];
+      push @{$chimp_translations{$tl->stable_id()}}, $tl;
+    }
+  }
+
+  foreach my $tr (@{$hum_gene->get_all_Transcripts()}) {
+    foreach my $chimp_tr (@{$chimp_transcripts{$tr->stable_id}}) {
+      foreach my $xref (@{$tr->get_all_DBEntries}) {
+        $chimp_tr->add_DBEntry($xref) if($KEEP_XREF{uc($xref->dbname())});
+      }
+
+      if($tr->display_xref() && $KEEP_XREF{uc($tr->display_xref->dbname)}) {
+        $chimp_tr->display_xref($tr->display_xref);
+      }
+    }
+
+    my $tl = $tr->translation();
+    if($tl) {
+      foreach my $xref (@{$tl->get_all_DBEntries}) {
+        foreach my $chimp_tl (@{$chimp_translations{$tl->stable_id()}}) {
+          $chimp_tl->add_DBEntry($xref) if($KEEP_XREF{uc($xref->dbname())});
+        }
+      }
+    }
+  }
+
+  return;
+}
+
+1;
--- a/misc-scripts/chimp/Transcript.pm
+++ b/misc-scripts/chimp/Transcript.pm
@@ -18,6 +18,8 @@ use Bio::EnsEMBL::Exon;
 use Bio::EnsEMBL::Utils::Exception qw(throw info);


+use constant MAX_INTRON_LEN => 2e6;
+

 #
 # sanity checks the interim exons, and splits this
@@ -115,9 +117,6 @@ sub check_iexons {
      return check_iexons($itranscript, $itranscript_array);
    }

-    $prev_end = $iexon->end();
-    $prev_start = $iexon->start();
-
    if (!defined($transcript_strand)) {
      $transcript_strand = $iexon->strand();
    } elsif ($transcript_strand != $iexon->strand()) {
@@ -134,6 +133,31 @@ sub check_iexons {
      }
      return check_iexons($itranscript, $itranscript_array);
    }
+
+    # watch out for extremely long introns
+    my $intron_len = 0;
+
+    if(defined($prev_start)) {
+      if($iexon->strand() == 1) {
+        $intron_len = $iexon->start - $prev_end + 1;
+      } else {
+        $intron_len = $prev_start - $iexon->end + 1;
+      }
+    }
+
+    if($intron_len > MAX_INTRON_LEN) {
+      info("  very long intron, splitting transcripts");
+      my $keep_exon = 1;
+      my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);
+
+      if($first_transcript) {
+        push @$itranscript_array, $first_transcript;
+      }
+      return check_iexons($itranscript, $itranscript_array);
+    }
+
+    $prev_end = $iexon->end();
+    $prev_start = $iexon->start();
  }

  $itranscript_array ||= [];

--- a/misc-scripts/chimp/Utils.pm
+++ b/misc-scripts/chimp/Utils.pm
@@ -10,7 +10,7 @@ use vars qw(@ISA @EXPORT_OK);

 @ISA = qw(Exporter);

-@EXPORT_OK = qw(print_exon print_coords print_translation);
+@EXPORT_OK = qw(print_exon print_coords print_translation print_three_phase_translation);



@@ -109,4 +109,24 @@ sub print_translation {
 }


+sub print_three_phase_translation {
+  my $transcript = shift;
+
+  return if(!$transcript->translation());
+
+  my $orig_phase = $transcript->start_Exon->phase();
+
+  foreach my $phase (0,1,2) {
+    info("======== Phase $phase translation: ");
+    $transcript->start_Exon->phase($phase);
+    info("Peptide: " . $transcript->translate->seq() . "\n\n===============");
+  }
+
+  $transcript->start_Exon->phase($orig_phase);
+
+  return;
+}
+
+
+
 1;
--- a/misc-scripts/chimp/human2chimp.pl
+++ b/misc-scripts/chimp/human2chimp.pl
@@ -13,6 +13,7 @@ use StatMsg;
 use Deletion;
 use Insertion;
 use Transcript;
+use Gene;
 use StatLogger;
 use StatMsg;

@@ -152,6 +153,8 @@ use Bio::EnsEMBL::Utils::Exception qw(throw info verbose warning);
      info("Gene: ".$gene->stable_id);
      my $transcripts = $gene->get_all_Transcripts();

+      my @finished;
+
      foreach my $transcript (@$transcripts) {
        next if(!$transcript->translation); #skip pseudo genes

@@ -162,98 +165,102 @@ use Bio::EnsEMBL::Utils::Exception qw(throw info verbose warning);
        my $finished_transcripts =
          create_transcripts($interim_transcript, $slice_adaptor);

-        my $transcript_count = @$finished_transcripts;
-        my $translation_count = 0;
-        my $stop_codons_count = 0;

-        if($transcript_count > 1) {
-          StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::SPLIT);
-        }
-        elsif($transcript_count== 0) {
-          StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::NO_SEQUENCE_LEFT);
+        # set the translation stable identifier on the finished transcripts
+        foreach my $tr (@$finished_transcripts) {
+          if($tr->translation() && $transcript->translation()) {
+            $tr->translation->stable_id($transcript->translation->stable_id);
+          }
        }

-        foreach my $ftrans (@$finished_transcripts) {
-          if($ftrans->translation()) {
-            $translation_count++;
-            my $pep = $ftrans->translate->seq();
+        # This method call is optional, just for statistics gathering
+        # and extra sanity checking / debug output:
+        gen_transcript_stats($finished_transcripts);

-            print STDERR "\n\n$pep\n\n";
+        push @finished, @$finished_transcripts;
+      }

-            if($pep =~ /\*/) {
-              $stop_codons_count++;
-            }
+      if($store) {
+        Gene::store_gene($dest_db, $gene, \@finished);
+      }
+    }
+  }
+}

-            # sanity check, if translation is defined we expect a peptide
-            if(!$pep) {
-              print_translation($ftrans->translation());
-              throw("Unexpected Translation but no peptide");
-            }
-          } else {
-            print STDERR "NO TRANSLATION LEFT\n";
-          }
-        }

-        # If there were stop codons in one of the split transcripts
-        # report it. Report it as 'entire' if all split transcripts had
-        # stops.
-        if($stop_codons_count) {
-          my $code = StatMsg::TRANSCRIPT | StatMsg::DOESNT_TRANSLATE;
-          if($stop_codons_count == $translation_count) {
-            $code |= StatMsg::ENTIRE;
-          } else {
-            $code |= StatMsg::PARTIAL;
-          }
-          StatMsg->new($code);
-        }

-        if(!$translation_count) {
-          StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::NO_CDS_LEFT);
-        }
+#
+# method which does some sanity checking of generated transcripts,
+# gathers some stats about the completed transcripts and prints the
+# created translations to STDERR
+#
+sub gen_transcript_stats {
+  my $finished_transcripts = shift;

-        if($translation_count) {
-          if($stop_codons_count) {
-            if($translation_count > $stop_codons_count) {
-              StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::TRANSLATES |
-                          StatMsg::PARTIAL);
-            }
-          } else {
-            StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::TRANSLATES |
-                         StatMsg::ENTIRE);
-          }
-        }
+  my $transcript_count = @$finished_transcripts;
+  my $translation_count = 0;
+  my $stop_codons_count = 0;

-#         foreach my $ftr (@$finished_transcripts) {
-#           print_three_phase_translation($ftr);
-#         }
+  if($transcript_count > 1) {
+    StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::SPLIT);
+  }
+  elsif($transcript_count== 0) {
+    StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::NO_SEQUENCE_LEFT);
+  }

-        if($store) {
-          store_gene($dest_db, $gene, $finished_transcripts);
-        }
+  foreach my $ftrans (@$finished_transcripts) {
+    if($ftrans->translation()) {
+      $translation_count++;
+      my $pep = $ftrans->translate->seq();
+
+      print STDERR "\n\n$pep\n\n";
+
+      if($pep =~ /\*/) {
+        $stop_codons_count++;
+      }

+      # sanity check, if translation is defined we expect a peptide
+      if(!$pep) {
+        print_translation($ftrans->translation());
+        throw("Unexpected Translation but no peptide");
      }
+    } else {
+      print STDERR "NO TRANSLATION LEFT\n";
    }
  }
-}
-
-
-sub print_three_phase_translation {
-  my $transcript = shift;

-  return if(!$transcript->translation());
+  # If there were stop codons in one of the split transcripts
+  # report it. Report it as 'entire' if all split transcripts had
+  # stops.
+  if($stop_codons_count) {
+    my $code = StatMsg::TRANSCRIPT | StatMsg::DOESNT_TRANSLATE;
+    if($stop_codons_count == $translation_count) {
+      $code |= StatMsg::ENTIRE;
+    } else {
+      $code |= StatMsg::PARTIAL;
+    }
+    StatMsg->new($code);
+  }

-  my $orig_phase = $transcript->start_Exon->phase();
+  if(!$translation_count) {
+    StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::NO_CDS_LEFT);
+  }

-  foreach my $phase (0,1,2) {
-    info("======== Phase $phase translation: ");
-    $transcript->start_Exon->phase($phase);
-    info("Peptide: " . $transcript->translate->seq() . "\n\n===============");
+  if($translation_count) {
+    if($stop_codons_count) {
+      if($translation_count > $stop_codons_count) {
+        StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::TRANSLATES |
+                     StatMsg::PARTIAL);
+      }
+    } else {
+      StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::TRANSLATES |
+                   StatMsg::ENTIRE);
+    }
  }
+}
+

-  $transcript->start_Exon->phase($orig_phase);

-  return;
-}



@@ -272,11 +279,6 @@ sub transfer_transcript {

  my $human_exons = $transcript->get_all_Exons();

-  if (!$transcript->translation()) { # watch out for pseudogenes
-    info("pseudogene - discarding");
-    return;
-  }
-
  my $chimp_cdna_pos = 0;
  my $cdna_exon_start = 1;

@@ -403,25 +405,20 @@ sub transfer_transcript {
                # insert in chimp, deletion in human
                #

-                #info("before insert, CDNA_POS= $chimp_cdna_pos");
-
                Insertion::process_insert(\$chimp_cdna_pos, $insert_len,
                                          $chimp_exon, $chimp_transcript);

                $chimp_cdna_pos += $insert_len;
-                #info("after insert, CDNA_POS= $chimp_cdna_pos");
              }
            }

            $chimp_cdna_pos += $c->length();
-            #info("after match (" . $c->length(). ") CDNA_POS= $chimp_cdna_pos");
          }
        }  # foreach coord
      }
    }

    $cdna_exon_start = $chimp_cdna_pos + 1;
-    # info("after exon, CDNA_POS= $chimp_cdna_pos");

    $chimp_transcript->add_Exon($chimp_exon);
  } # foreach exon
@@ -555,177 +552,6 @@ sub create_transcripts {



-###############################################################################
-# store gene
-#
-# Builds Ensembl genes from the generated chimp transcripts and stores them
-# in the database.
-#
-###############################################################################
-
-
-sub store_gene {
-  my $db = shift;
-  my $hum_gene = shift; # human gene
-  my $ctranscripts = shift; # chimp transcripts
-
-  my $MIN_AA_LEN = 15;
-  my $MIN_NT_LEN = 600;
-
-  my $analysis = $db->get_AnalysisAdaptor->fetch_by_logic_name('ensembl');
-
-  # Look at the translations and convert any transcripts with stop codons
-  # into pseudogenes
-  foreach my $ct (@$ctranscripts) {
-    if($ct->translation && $ct->translate->seq() =~ /\*/) {
-      $ct->translation(undef);
-    }
-  }
-
-
-  # Group transcripts by their strand and scaffold.  We
-  # cannot really build genes that spand scaffolds or strands
-  my (%ctrans_hash, %nt_lens, %aa_lens);
-
-  foreach my $ct (@$ctranscripts) {
-    my $region = $ct->slice->seq_region_name() . ':' . $ct->strand();
-
-    $ctrans_hash{$region} ||= [];
-    push @{$ctrans_hash{$region}}, $ct;
-
-    # keep track of how many nucleotides and amino acids are in the
-    # transcripts from this gene that made it to this area.  If there
-    # are not many, the transcript should probably be rejected.
-    my $nt_len = length($ct->spliced_seq()) || 0;
-    my $aa_len = ($ct->translation()) ? length($ct->translate->seq()) : 0;
-
-    $nt_lens{$region} ||= 0;
-    $nt_lens{$region} += $nt_len;
-
-    $aa_lens{$region} ||= 0;
-    $aa_lens{$region} += $aa_len;
-  }
-
-  my %chimp_genes;
-
-  my $gene_adaptor = $db->get_GeneAdaptor();
-
-  foreach my $region (keys %ctrans_hash) {
-    # keep transcripts if there is a minimum amount of nucleotide
-    # OR amino acid sequence in transcripts in the same region
-    next if($nt_lens{$region}<$MIN_NT_LEN && $aa_lens{$region}<$MIN_AA_LEN);
-
-    # one gene for each region
-    my $cgene = $chimp_genes{$region} ||= Bio::EnsEMBL::Gene->new();
-
-    generate_stable_id($cgene);
-
-    # rename transcripts and add to gene
-    foreach my $ctrans (@{$ctrans_hash{$region}}) {
-      generate_stable_id($ctrans);
-
-      # rename translation
-      if($ctrans->translation) {
-        generate_stable_id($ctrans->translation);
-      }
-
-      $cgene->add_Transcript($ctrans);
-    }
-
-    # rename all of the exons
-    # but watch out because duplicate exons will be merged and we do not
-    # want to generate multiple names
-    my %ex_stable_ids;
-    foreach my $ex (@{$cgene->get_all_Exons()}) {
-      if($ex_stable_ids{$ex->hashkey()}) {
-        $ex->stable_id($ex_stable_ids{$ex->hashkey()});
-      } else {
-        generate_stable_id($ex);
-        $ex_stable_ids{$ex->hashkey()} = $ex->stable_id();
-      }
-    }
-
-    # set the analysis on the gene object
-    $cgene->analysis($analysis);
-
-
-    # for now just grab all HUGO xrefs, and take last one as display xref;
-    my $display_xref;
-    foreach my $gx (@{$hum_gene->get_all_DBLinks()}) {
-      if(uc($gx->dbname()) eq 'HUGO') {
-        $cgene->add_DBEntry($gx);
-        $display_xref = $gx;
-      }
-    }
-
-    $cgene->display_xref($display_xref) if($display_xref);
-
-    my $name = $cgene->stable_id();
-
-    $name .= '/'.$display_xref->display_id() if($display_xref);
-
-    $cgene->type('ensembl');
-
-    # store the bloody thing
-    print STDERR "Storing gene: $name\n";
-    $gene_adaptor->store($cgene);
-  }
-
-  return;
-}
-
-
-
-###############################################################################
-# generate_stable_id
-#
-# Generates a stable_id for a gene, transcript, translation or exon and sets
-# it on the object.
-#
-###############################################################################
-
-
-my ($TRANSCRIPT_NUM, $GENE_NUM, $EXON_NUM, $TRANSLATION_NUM);
-
-
-sub generate_stable_id {
-  my $object = shift;
-
-  my $SPECIES_PREFIX = 'PTR';
-  my $PAD            = 18;
-
-  my $type_prefix;
-  my $num;
-
-  if($object->isa('Bio::EnsEMBL::Exon')) {
-    $type_prefix = 'E';
-    $EXON_NUM       ||= 0;
-    $num = ++$EXON_NUM;
-  } elsif($object->isa('Bio::EnsEMBL::Transcript')) {
-    $type_prefix = 'T';
-    $TRANSCRIPT_NUM ||= 0;
-    $num = ++$TRANSCRIPT_NUM;
-  } elsif($object->isa('Bio::EnsEMBL::Gene')) {
-    $type_prefix = 'G';
-    $GENE_NUM       ||= 0;
-    $num = ++$GENE_NUM;
-  } elsif($object->isa('Bio::EnsEMBL::Translation')) {
-    $type_prefix = 'P';
-    $TRANSLATION_NUM ||= 0;
-    $num = ++$TRANSLATION_NUM;
-  } else {
-    throw('Unknown object type '.ref($object).'. Cannot create stable_id.');
-  }
-
-  my $prefix = "ENS${SPECIES_PREFIX}${type_prefix}";
-
-  my $pad = $PAD - length($prefix) - length($num);
-
-  $object->version(1);
-  $object->stable_id($prefix . ('0'x$pad) . $num);
-}
-
-