now correctly replaces all 1-2bp introns with frameshifts

05219a6a · Graham McVicker · 8152d12a · 05219a6a
Commit 05219a6a authored 20 years ago by Graham McVicker
--- a/misc-scripts/surgery/shortintrons2frameshifts.pl
+++ b/misc-scripts/surgery/shortintrons2frameshifts.pl
@@ -6,13 +6,14 @@ use Bio::EnsEMBL::DBSQL::DBAdaptor;
 use Getopt::Long;
 use POSIX qw(ceil);

-my ($host, $port, $user, $pass, $dbname);
+my ($host, $port, $user, $pass, $dbname, $testid);

 GetOptions('host=s' => \$host,
           'user=s' => \$user,
           'port=i' => \$port,
           'pass=s' => \$pass,
-           'dbname=s' => \$dbname);
+           'dbname=s' => \$dbname,
+           'testid=i' => \$testid);

 $port ||= 3306;

@@ -51,30 +52,45 @@ my $ins_et_sth = $db->dbc->prepare(qq{INSERT INTO exon_transcript
                                      transcript_id = ?,
                                      rank = ?});

+my $upd_tl_sth = $db->dbc->prepare(qq{UPDATE translation
+                                      SET start_exon_id = ?,
+                                          end_exon_id   = ?,
+                                          seq_start    = ?,
+                                          seq_end      = ?
+                                      WHERE translation_id = ?});
+
+
 my $sth = $db->dbc->prepare(qq{SELECT max(stable_id) FROM exon_stable_id});
 $sth->execute();
 my $ex_stable_id  = $sth->fetchall_arrayref->[0]->[0];
 $ex_stable_id++;
 $sth->finish();

-$sth = $db->dbc()->prepare
-  (qq{SELECT t.gene_id,
-             MIN(IF(e1.seq_region_strand = 1,
-                e2.seq_region_start - e1.seq_region_end - 1,
-                e1.seq_region_start - e2.seq_region_end - 1)) AS intron_len
-      FROM exon e1, exon e2, exon_transcript et1, exon_transcript et2,
-           transcript t
-      WHERE et1.exon_id = e1.exon_id
-      AND et2.exon_id = e2.exon_id
-      AND et1.transcript_id = et2.transcript_id
-      AND et1.rank = et2.rank - 1
-      AND et1.transcript_id = t.transcript_id
-      GROUP BY t.gene_id
-      HAVING intron_len < 3});
+if(!$testid) {
+
+  $sth = $db->dbc()->prepare
+    (qq{SELECT t.gene_id,
+               MIN(IF(e1.seq_region_strand = 1,
+                    e2.seq_region_start - e1.seq_region_end - 1,
+                    e1.seq_region_start - e2.seq_region_end - 1)) AS intron_len
+        FROM exon e1, exon e2, exon_transcript et1, exon_transcript et2,
+             transcript t
+        WHERE et1.exon_id = e1.exon_id
+        AND et2.exon_id = e2.exon_id
+        AND et1.transcript_id = et2.transcript_id
+        AND et1.rank = et2.rank - 1
+        AND et1.transcript_id = t.transcript_id
+        GROUP BY t.gene_id
+        HAVING intron_len < 3});
+
+  $sth->execute();

-$sth->execute();
+}
+
+my @gene_ids =
+  ($testid) ? ($testid) : map{$_->[0]} @{$sth->fetchall_arrayref()};

-my $total_rows = $sth->rows();
+my $total_rows = ($testid) ?  1 : $sth->rows();
 my $cur_row = 0;
 my $last_percent = undef;

@@ -83,7 +99,7 @@ my $ea = $db->get_ExonAdaptor();
 my $aa = $db->get_AttributeAdaptor();

 print STDERR "Merging exons and storing frameshifts\n";
-while(my $array = $sth->fetchrow_arrayref()) {
+foreach my $gene_id (@gene_ids) {

  my $percent = ceil(($cur_row++ / $total_rows) * 100);
  if(($percent % 5) == 0 && 
@@ -92,90 +108,100 @@ while(my $array = $sth->fetchrow_arrayref()) {
    print STDERR "$percent% complete\n";
  }

-  my $g = $ga->fetch_by_dbID($array->[0]);
+  my $g = $ga->fetch_by_dbID($gene_id);

  my %old_exons = map {$_->hashkey() => $_} @{$g->get_all_Exons()};
  my %new_exons = ();

+  my %tl_cdna_starts;
+  my %tl_cdna_ends;
+
  foreach my $tr (@{$g->get_all_Transcripts}) {
+    # keep track of translation cdna coordinates before messing with transcript
+    my ($cdna_coding_start, $cdna_coding_end);
+    if($tr->translation()) {
+      $cdna_coding_start = $tr->cdna_coding_start();
+      $cdna_coding_end   = $tr->cdna_coding_end();
+    }
+
    my @frameshifts;

+    # collect list of frameshifts
    foreach my $intron (@{$tr->get_all_Introns()}) {
      push(@frameshifts, $intron) if($intron->length() < 3);
    }

+    my %seen_evidence;
+    my @exons;
    my $fs = shift(@frameshifts);
-
-    my $merging_exon;
    my $cdna_start = 1;
-    my @exons;
-    my %seen_evidence;
-
-    my @old_exons = @{$tr->get_all_Exons()};
-
-    while(@old_exons) {
-      my $ex = shift(@old_exons);
-      $cdna_start += $ex->length();

-      if($fs && $fs->prev_Exon->stable_id() eq $ex->stable_id()) {
-        $merging_exon = undef;
+    # merge together exons which are seperated by frameshift introns
+    foreach my $ex (@{$tr->get_all_Exons()}) {

-        while($fs && $fs->prev_Exon->stable_id() eq $ex->stable_id()) {
-          # this exon has a frameshift, so merge it with next exon
-          # and any subsequent exons serperated only by frameshifts
+      # was the previous exon seperated from this one by a frameshift intron?
+      if($fs && $fs->next_Exon()->stable_id() eq $ex->stable_id()) {

-          if(!$merging_exon) {
-            $merging_exon = {};
-            %{$merging_exon} = %$ex;
-            bless $merging_exon, ref($ex);
-            push @exons, $merging_exon;
+        # merge this exon with the previous one
+        if($ex->strand() == 1) {
+          $exons[$#exons]->end($ex->end());
+        } else {
+          $exons[$#exons]->start($ex->start());
+        }

-            %seen_evidence = ();
+        # merge supporting evidence
+        foreach my $sf (@{$ex->get_all_supporting_features()}) {
+          if(!$seen_evidence{ref($sf) . ':' . $sf->dbID()}) {
+            $ex->add_supporting_feature($sf);
+            $seen_evidence{ref($sf . ':' . $sf->dbID())} = 1;
          }
+        }

-          if($merging_exon->strand() == 1) {
-            $merging_exon->end($fs->next_Exon()->end());
-          } else {
-            $merging_exon->start($fs->next_Exon()->start());
+        # store frameshift as transcript attrib
+        my $seqed = Bio::EnsEMBL::SeqEdit->new
+          (-CODE    => '_rna_edit',
+           -NAME    => 'RNA Edit',
+           -DESC    => 'Post transcriptional RNA edit',
+           -START   => $cdna_start,
+           -END     => $cdna_start + $fs->length() - 1,
+           -ALT_SEQ => '');
+        $aa->store_on_Transcript($tr, [$seqed->get_Attribute]);
+
+        # adjust cdna coordinates for frameshifted basepairs
+        if($tr->translation) {
+          if($cdna_coding_start >= $cdna_start) {
+            $cdna_coding_start += $fs->length();
          }
-
-          # merge supporting evidence
-          foreach my $ev (@{$ex->get_all_supporting_features()}) {
-            if(!$seen_evidence{$ev->dbID()}) {
-              $merging_exon->add_supporting_features($ev);
-              $seen_evidence{$ev->dbID()} = 1;
-            }
+          if($cdna_coding_end >= $cdna_start) {
+            $cdna_coding_end += $fs->length();
          }
-
-          # store frameshift as transcript attrib
-          my $seqed = Bio::EnsEMBL::SeqEdit->new
-            (-CODE    => '_rna_edit',
-             -NAME    => 'RNA Edit',
-             -DESC    => 'Post transcriptional RNA edit',
-             -START   => $cdna_start,
-             -END     => $cdna_start + $fs->length() - 1,
-             -ALT_SEQ => '');
-          $aa->store_on_Transcript($tr, [$seqed->get_Attribute]);
-
-          # take this frameshift and the next exon
-          # (which was merged into the current exon) off the list
-          $fs = shift(@frameshifts);
-          $ex = shift(@old_exons);
-          $cdna_start += $ex->length();
        }
-      } else {
-        # this exon does not have a frameshift so just add it to the list
+        $cdna_start += $fs->length();

+        # look at the next frameshift
+        $fs = shift(@frameshifts);
+      } else {
        push @exons, $ex;
+
+        %seen_evidence = ();
+        foreach my $sf (@{$ex->get_all_supporting_features()}) {
+          $seen_evidence{ref($_) . ':' . $_->dbID()} = 1;
+        }
      }
+
+      $cdna_start += $ex->length();
    }

+    # rebuild transcript from new set of exons
    $tr->flush_Exons();

    foreach my $ex (@exons) {
      $new_exons{$ex->hashkey} = $ex;
      $tr->add_Exon($ex);
    }
+
+    $tl_cdna_starts{$tr->dbID()} = $cdna_coding_start;
+    $tl_cdna_ends{$tr->dbID()} = $cdna_coding_end;
  }

  # determine which exons can be deleted
@@ -200,6 +226,7 @@ while(my $array = $sth->fetchrow_arrayref()) {
    }
  }

+  # update the transcript composition by updating the exon transcript table
  foreach my $tr (@{$g->get_all_Transcripts()}) {
    $del_et_sth->execute($tr->dbID());

@@ -218,11 +245,57 @@ while(my $array = $sth->fetchrow_arrayref()) {
      $rank++;
    }
  }
+
+  # update the translations to use the new exons
+  foreach my $tr (@{$g->get_all_Transcripts()}) {
+    my $tl = $tr->translation();
+
+    next if(!$tl);
+
+    my $tl_start = $tl_cdna_starts{$tr->dbID()};
+    my $tl_end   = $tl_cdna_ends{$tr->dbID()};
+
+    foreach my $ex (@{$tr->get_all_Exons()}) {
+      if($tl_start > 0 && $tl_start <= $ex->length()) {
+        $tl->start_Exon($ex);
+        $tl->start($tl_start);
+      }
+
+      if($tl_end > 0 && $tl_end <= $ex->length()) {
+        $tl->end_Exon($ex);
+        $tl->end($tl_end);
+      }
+
+      $tl_start -= $ex->length();
+      $tl_end   -= $ex->length();
+    }
+
+    my ($start_ex_id, $end_ex_id);
+
+    # use consolidated exon ids. Exons may have been duplicated across
+    # multiple transcripts but only one has been stored and had id set.
+    if($old_exons{$tl->start_Exon->hashkey()}) {
+      $start_ex_id = $old_exons{$tl->start_Exon->hashkey()}->dbID();
+    } else {
+      $start_ex_id = $new_exons{$tl->start_Exon->hashkey()}->dbID();
+    }
+
+    if($old_exons{$tl->end_Exon->hashkey()}) {
+      $end_ex_id = $old_exons{$tl->end_Exon->hashkey()}->dbID();
+    } else {
+      $end_ex_id = $new_exons{$tl->end_Exon->hashkey()}->dbID();
+    }
+
+
+    $upd_tl_sth->execute($start_ex_id, $end_ex_id,
+                         $tl->start(), $tl->end(), $tl->dbID());
+  }
 }

 $del_et_sth->finish();
 $ins_et_sth->finish();
 $sth->finish();
+$upd_tl_sth->finish();

 print STDERR "\nAll Done.\n";