*** empty log message ***

* empty log message *
c9f32319 · Graham McVicker · f6753cbf · c9f32319
Commit c9f32319 authored 21 years ago by Graham McVicker
--- a/misc-scripts/chimp/human2chimp.pl
+++ b/misc-scripts/chimp/human2chimp.pl
@@ -4,6 +4,8 @@ use warnings;
 use Getopt::Long;

 use Bio::EnsEMBL::DBSQL::DBAdaptor;
+use Bio::EnsEMBL::Transcript;
+use Bio::EnsEMBL::Exon;


 my $MAX_CODING_INDEL     = 12; #max allowed coding indel in exon
@@ -75,7 +77,7 @@ foreach my $slice (@$slices) {

  my $genes = $gene_adaptor->fetch_all_by_Slice($slice);

-  foreach my $gene (@$genes) {
+  foreach my $gene (reverse @$genes) {
    my $transcripts = $gene->get_all_Transcripts();

    foreach my $transcript (@$transcripts) {
@@ -105,11 +107,24 @@ sub transfer_transcript {
  my $slice_adaptor = $chimp_db->get_SliceAdaptor();

  my $exons = $transcript->get_all_Exons();
+  my $translation = $transcript->translation();

-  my @chimp_exons;
+  my @chimp_transcripts;
+
+  my $chimp_transcript = Bio::EnsEMBL::Transcript->new();
+  my $chimp_translation;
+
+  if($translation) { #watch out for pseudogenes
+    $chimp_translation = Bio::EnsEMBL::Translation->new();
+    $chimp_translation->start($translation->start());
+    $chimp_translation->end($translation->end());
+  }
+
+  my $chimp_exon = undef;

 EXON:
  foreach my $exon (@$exons) {
+
    my @coords = $mapper->map($exon->seq_region_name, $exon->seq_region_start,
                              $exon->seq_region_end, $exon->seq_region_strand,
                              $human_cs);
@@ -127,13 +142,13 @@ sub transfer_transcript {
           $cds_coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) {
          debug('entire utr exon deletion - ok');

-          # TBD
+          # we can simply throw out this exon, it was all UTR

          next EXON;
        } else {
          ### TBD: can do more sensible checking and maybe split transcript
          debug('entire coding exon deletion - discarding transcript');
-          return ();
+          return @chimp_transcripts;
        }
      } else {
        #exon mapped completely
@@ -146,22 +161,50 @@ sub transfer_transcript {
                                                    undef,undef,undef,
                                                    $chimp_cs->version);

-        push @chimp_exons, Bio::EnsEMBL::Exon->new
-          (-START     => $c->start,
-           -END       => $c->end,
-           -PHASE     => $exon->phase,
-           -END_PHASE => $exon->end_phase,
-           -ANALYSIS  => $exon->analysis,
-           -SLICE     => $slice);
+
+        ### TBD handle problems with seq_region/strand jumping exons...
+
+        $chimp_transcript->add_Exon( Bio::EnsEMBL::Exon->new
+                                     ( -START     => $c->start,
+                                       -END       => $c->end,
+                                       -STRAND    => $c->strand,
+                                       -PHASE     => $exon->phase,
+                                       -END_PHASE => $exon->end_phase,
+                                       -ANALYSIS  => $exon->analysis,
+                                       -SLICE     => $slice) );
      }
    } else {
      my $num = scalar(@coords);

      my $exon_position = 0;

+      my $chimp_exon = Bio::EnsEMBL::Exon->new();
+      my $min_start = undef;
+      my $max_end   = undef;
+      my $seq_region = undef;
+
+      my $is_start_exon = 0;
+      my $is_end_exon   = 0;
+
+      # Fix translation start/end if it needs adjusting here
+      if($translation) {
+
+        if($translation->start_Exon()->hashkey() eq $exon->hashkey()) {
+          $is_start_exon = 1;
+          $chimp_translation->start_Exon($chimp_exon);
+        }
+
+        if($translation->end_Exon()->hashkey eq $exon->hashkey()) {
+          $is_end_exon = 1;
+          $chimp_translation->end_Exon($chimp_exon);
+        }
+      }
+
+
    COORD:
      for(my $i=0; $i < $num; $i++) {
        my $c = $coords[$i];
+
        $exon_position += $c->length();

        if($c->isa('Bio::EnsEMBL::Mapper::Gap')) {
@@ -169,81 +212,250 @@ sub transfer_transcript {
          # this is a DELETION
          #

-          # convert to CDS coords
+          # convert to CDS coords and categorize them
+          # gaps are UTR deletions, coords are CDS deletions
+          #
          my @cds_coords = $trans_mapper->genomic2cds($c->start(),$c->end(),
                                                      $exon->strand());

-          @cds_coords = 
-            grep {!$_->isa('Bio::EnsEMBL::Mapper::Gap')} @cds_coords;
+          my $cds_delete;
+          my @utr_deletes;
+          foreach my $cds_coord (@cds_coords) {
+            if($cds_coord->isa('Bio::EnsEMBL::Mapper::Gap')) {
+              push @utr_deletes, $cds_coord;
+            } else {
+              #sanity check:
+              if($cds_delete) {
+                throw("Unexpected: multiple cds deletes for single exon.");
+              }
+              $cds_delete = $cds_coord;
+            }
+          }
+
+
+          if(@utr_deletes > 2) {
+            throw("Unexpected: >2 UTR deletes for single exon.");
+          }

-          if(@cds_coords == 0) {
-            #this deletion was entirely in the UTR, accept it

+          foreach my $utr_delete (@utr_deletes) {
+            # the UTR deletions may require adjustments to the translation
            debug("utr deletion - ok");

-            # TBD fix translation start/end if it needs adjusting here
+            # Fix translation start/end if it needs adjusting here
+            if($is_start_exon) {

-            next COORD;
-          }
+              #need to adjust the start of translation if deletion was
+              #in 5prime UTR
+              if($exon_position < $translation->start()) {
+                $chimp_translation->start($chimp_translation->start() -
+                                          $utr_delete->length());
+
+                #also need to adjust end translation if this is also the
+                #end exon
+                if($is_end_exon) {
+                  $chimp_translation->end($chimp_translation->end() -
+                                          $utr_delete->length());

-          if(@cds_coords != 1) {
-            # it should not be possible to get more than one cds coord
-            # back since we are dealing with a single exon
-            throw("Unexpected, got multiple cds coords for single exon");
+                }
+              }
+            }
          }

-          my $cds_coord = $cds_coords[0];
+          if($cds_delete) {
+            # if this is a long deletion of coding sequence, then we should
+            # throw out this transcript

-          # if this is a long deletion of coding sequence, then we should
-          # throw out this transcript
+            my $del_len = $cds_delete->length();
+            my $cds_len = get_cds_len($transcript);

-          my $del_len = $cds_coord->length();
+            if($cds_len == $del_len) {
+              debug("entire cds deletion - discarding transcript");
+              return @chimp_transcripts;
+            }

-          if($del_len > $MAX_CODING_INDEL) {
-            debug("long cds deletion ($del_len) - discarding transcript");
+            if($del_len > $MAX_CODING_INDEL) {
+              debug("long cds deletion ($del_len) - discarding transcript");

-            # TBD split transcript in two parts and keep both halves
+              # TBD split transcript in two parts and keep both halves

-            return ();
-          }
+              return @chimp_transcripts;
+            }

-          # if this deletion is short and %3 == 0, then we do not need to
-          # do much, but may have to adjust coding start/end of translation
-          if($del_len % 3 == 0) {
-            debug("short ($del_len) in frame cds deletion - keeping exon");
+            #adjust the translation end accordingly
+            if($is_end_exon) {
+              #adjust translation end accordingly
+              $chimp_translation->end($chimp_translation->end - $del_len);
+            }

-            #TBD
+            my $frame_shift = $del_len %3;

-            next COORD;
-          }
+            # if this deletion is short and %3 == 0, then we do not need to
+            # do much, but may have to adjust coding end of translation
+            if($frame_shift == 0) {
+              debug("short ($del_len) in frame cds deletion - ok");

-          if($del_len > $MAX_FRAMESHIFT_INDEL) {
-            debug("medium ($del_len) frameshifting cds deletion - " .
-                  "discarding transcript");
+              if($is_end_exon) {
+                $chimp_translation->end($chimp_translation->end() - $del_len);
+              }

-            #TBD split transcript in two parts and keep both halves
+              next COORD;
+            }

-            return ();
-          }
+            if($del_len > $MAX_FRAMESHIFT_INDEL) {
+              debug("medium ($del_len) frameshifting cds deletion - " .
+                    "discarding transcript");
+
+              #TBD split transcript in two parts and keep both halves
+
+              return @chimp_transcripts;
+            }
+
+            # this is a short frameshifting deletion
+
+            # check if deletion is at beginning of CDS
+            if($cds_delete->start() == 1) {
+              # we need to adjust the start of the exon now to
+              # put everything back into frame
+
+              #sanity check:
+              if(!$is_start_exon) {
+                throw("Unexpected: start of cds is not in start exon");
+              }

-          # this is a short frameshifting deletion
-          # create a 'frameshift intron' and move on
-          debug("short ($del_len) frameshifting cds deletion - " .
-                "creating fake intron");
+              my $ex_start = $chimp_exon->start();

-          # if this exon the first coding exon, then we need to adjust the
-          # coding start of the translation, and possibly the exon start
-          # as well
+              #if there is already UTR, just shift it
+              if($chimp_translation->start > 1) {
+                debug("frameshift deletion at start of cds - extending utr");
+                $chimp_translation->start($chimp_translation->start +
+                                          3 - $frame_shift);
+              } else {
+                $ex_start = $chimp_exon->start();

-          # if this exon is the last coding exon, then we need to adjust the
-          # coding end
+                #there was no UTR so adjust exon start, leave w/ no UTR
+                if(!defined($ex_start)) {
+                  #sanity check:
+                  if(@coords < $i+2) {
+                    throw("Unexpected: no exon start defined and " .
+                          "no more coords available");
+                  }

+                  $ex_start = $coords[$i+1]->start(); #peek at next coord
+                }
+
+                debug("frameshift deletion at start of cds - " .
+                      "moving exon start");
+
+                $chimp_exon->start($ex_start + 3 - $frame_shift);
+              }
+
+              next COORD;
+            }
+
+            # check if deletion is at end of CDS
+            if($cds_coord->end() == get_cds_len($transcript)) {
+              # sanity check:
+              if(!$is_end_exon) {
+                throw("Unexpected: end of cds is not in end exon");
+              }
+
+              # if there was already a UTR just shift it
+              if($translation->end() < $exon->length()) {
+                $chimp_translation->end($chimp_translation->end() 
+                                        - 3 + $frame_shift);
+              } else {
+                # no utr and at end of CDS, so must be end of exon
+                # sanity checks:
+                if(!defined($chimp_exon->end())) {
+                  throw("Unexpected: at end of exon and no end defined");
+                }
+
+                debug("frameshift deletion at end of cds - " .
+                      "moving exon end and translation end");
+                $chimp_exon->end($chimp_exon->end() - 3 + $frame_shift);
+                $chimp_translation->end($chimp_exon->length());
+              }
+
+              next COORD;
+            }
+
+            # the deletion was not at either end of the cds, but could still
+            # be at the beginning or end of the exon
+
+            # check if this is a deletion at the exon start
+            if($i == 0) {
+              # sanity check:
+              if($is_start_exon) {
+                throw("Unexpected: delete at start of start exon is " .
+                      "not at cds start");
+              }
+
+              debug("frameshift deletion at start of exon - " .
+                    "adjusting start of exon");
+              $chimp_exon->start($chimp_exon->start + 3 - $frameshift);
+              if($is_end_exon) {
+                debug("adjusting translation end");
+                $chimp_exon->translation_end($chimp_exon_translation->end -
+                                             3 + $frameshift);
+              }
+
+              next COORD;
+            }
+
+            #check if this is a deletion at the exon end
+            if(@coords == $i+1) {
+              # sanity check:
+              if($is_end_exon) {
+                throw("Unexpected delete at end of end exon is " .
+                      "not at cds start");
+              }
+
+              debug("frameshift deletion at end of exon - " .
+                    "adjusting end of exon");
+              $chimp_exon->end($chimp_exon->end - 3 + $frameshift);
+
+              next COORD;
+            }
+
+
+            # now we know that this is a deletion in the middle of the
+            # cds
+
+            # for this to be possible there should already be an exon start/end
+            # because we should have already seen some exon coords
+
+            # sanity check
+            if(!defined($chimp_exon->start())) {
+              throw("Unexpected: mid cds delete and no exon start defined");
+            }
+            if(!defined($chimp_exon->end())) {
+              throw("Unexpected: mid cds delete and no exon end defined");
+            }
+
+            # split this exon in two parts and make a 'frameshift' intron
+            # in the middle
+
+            debug("short ($del_len) frameshifting mid cds deletion - " .
+                  "creating fake intron");
+
+            my $ex_len = $chimp_exon->length();
+
+            $chimp_exon->end($chimp_exon->end() - 3 + $frameshift)
+
+            push @chimp_exons, $chimp_exon;
+
+            $chimp_exon = Bio::EnsEMBL::Exon->new
+              (-START  => $chimp_exon->end() + 4 - $frame_shift,
+               -STRAND => $chimp_exon->strand());
+
+          }

        } else {
          #if first is coord no need to do anything
          next COORD if($i == 0);

-          my $prev_c = $coords[$i];
+          my $prev_c = $coords[$i-1];

          next if($prev_c->isa('Bio::EnsEMBL::Mapper::Gap'));

@@ -256,7 +468,7 @@ sub transfer_transcript {

            #TBD it may not be necessary to discard transcript if this is UTR

-            return ();
+            return @chimp_transcripts;
          }

          if($c->strand() != $prev_c->strand()) {
@@ -264,7 +476,7 @@ sub transfer_transcript {

            #TBD it may not be necessary to discard transcript if this is UTR

-            return ();
+            return @chimp_transcripts;
          }


@@ -294,6 +506,14 @@ sub transfer_transcript {
              $trans_mapper->genomic2cds($left_coord, $right_coord, -1);
          }

+          if($insert_len < 1) {
+            debug("inverted exon - discarding transcript");
+
+            #TBD - is there something sensible that can be done here?
+
+            return @chimp_transcripts;
+          }
+

          if(@cds_coords == 1 && 
             $cds_coords[0]->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
@@ -305,7 +525,7 @@ sub transfer_transcript {

              # TBD may be ok to keep transcript and split in two?

-              return ();
+              return @chimp_transcripts;
            }

            if($insert_len % 3 == 0) {
@@ -323,7 +543,7 @@ sub transfer_transcript {

              #TBD may be ok to keep transcript and split in two

-              return ();
+              return @chimp_transcripts;
            }

            debug("short ($insert_len) frameshifting cds insert " .
@@ -358,11 +578,24 @@ sub transfer_transcript {
  #  * exons on different scaffolds
  #  * exons on different strands
  #  * exons in which the splice order changed
+  #  * transcripts which lost all exons
+  # maybe phases should be set here

 }



+sub get_cds_len {
+  my $transcript = shift;
+
+  my $len = 0;
+
+  foreach my $exon (@{$transcript->get_all_translateable_Exons()}) {
+    $len += $exon->length();
+  }
+
+  return $len;
+}

 sub debug {
  my $msg  = shift;