Skip to content
Snippets Groups Projects
Commit 05219a6a authored by Graham McVicker's avatar Graham McVicker
Browse files

now correctly replaces all 1-2bp introns with frameshifts

parent 8152d12a
No related branches found
No related tags found
No related merge requests found
......@@ -6,13 +6,14 @@ use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Getopt::Long;
use POSIX qw(ceil);
my ($host, $port, $user, $pass, $dbname);
my ($host, $port, $user, $pass, $dbname, $testid);
GetOptions('host=s' => \$host,
'user=s' => \$user,
'port=i' => \$port,
'pass=s' => \$pass,
'dbname=s' => \$dbname);
'dbname=s' => \$dbname,
'testid=i' => \$testid);
$port ||= 3306;
......@@ -51,30 +52,45 @@ my $ins_et_sth = $db->dbc->prepare(qq{INSERT INTO exon_transcript
transcript_id = ?,
rank = ?});
my $upd_tl_sth = $db->dbc->prepare(qq{UPDATE translation
SET start_exon_id = ?,
end_exon_id = ?,
seq_start = ?,
seq_end = ?
WHERE translation_id = ?});
my $sth = $db->dbc->prepare(qq{SELECT max(stable_id) FROM exon_stable_id});
$sth->execute();
my $ex_stable_id = $sth->fetchall_arrayref->[0]->[0];
$ex_stable_id++;
$sth->finish();
$sth = $db->dbc()->prepare
(qq{SELECT t.gene_id,
MIN(IF(e1.seq_region_strand = 1,
e2.seq_region_start - e1.seq_region_end - 1,
e1.seq_region_start - e2.seq_region_end - 1)) AS intron_len
FROM exon e1, exon e2, exon_transcript et1, exon_transcript et2,
transcript t
WHERE et1.exon_id = e1.exon_id
AND et2.exon_id = e2.exon_id
AND et1.transcript_id = et2.transcript_id
AND et1.rank = et2.rank - 1
AND et1.transcript_id = t.transcript_id
GROUP BY t.gene_id
HAVING intron_len < 3});
if(!$testid) {
$sth = $db->dbc()->prepare
(qq{SELECT t.gene_id,
MIN(IF(e1.seq_region_strand = 1,
e2.seq_region_start - e1.seq_region_end - 1,
e1.seq_region_start - e2.seq_region_end - 1)) AS intron_len
FROM exon e1, exon e2, exon_transcript et1, exon_transcript et2,
transcript t
WHERE et1.exon_id = e1.exon_id
AND et2.exon_id = e2.exon_id
AND et1.transcript_id = et2.transcript_id
AND et1.rank = et2.rank - 1
AND et1.transcript_id = t.transcript_id
GROUP BY t.gene_id
HAVING intron_len < 3});
$sth->execute();
$sth->execute();
}
my @gene_ids =
($testid) ? ($testid) : map{$_->[0]} @{$sth->fetchall_arrayref()};
my $total_rows = $sth->rows();
my $total_rows = ($testid) ? 1 : $sth->rows();
my $cur_row = 0;
my $last_percent = undef;
......@@ -83,7 +99,7 @@ my $ea = $db->get_ExonAdaptor();
my $aa = $db->get_AttributeAdaptor();
print STDERR "Merging exons and storing frameshifts\n";
while(my $array = $sth->fetchrow_arrayref()) {
foreach my $gene_id (@gene_ids) {
my $percent = ceil(($cur_row++ / $total_rows) * 100);
if(($percent % 5) == 0 &&
......@@ -92,90 +108,100 @@ while(my $array = $sth->fetchrow_arrayref()) {
print STDERR "$percent% complete\n";
}
my $g = $ga->fetch_by_dbID($array->[0]);
my $g = $ga->fetch_by_dbID($gene_id);
my %old_exons = map {$_->hashkey() => $_} @{$g->get_all_Exons()};
my %new_exons = ();
my %tl_cdna_starts;
my %tl_cdna_ends;
foreach my $tr (@{$g->get_all_Transcripts}) {
# keep track of translation cdna coordinates before messing with transcript
my ($cdna_coding_start, $cdna_coding_end);
if($tr->translation()) {
$cdna_coding_start = $tr->cdna_coding_start();
$cdna_coding_end = $tr->cdna_coding_end();
}
my @frameshifts;
# collect list of frameshifts
foreach my $intron (@{$tr->get_all_Introns()}) {
push(@frameshifts, $intron) if($intron->length() < 3);
}
my %seen_evidence;
my @exons;
my $fs = shift(@frameshifts);
my $merging_exon;
my $cdna_start = 1;
my @exons;
my %seen_evidence;
my @old_exons = @{$tr->get_all_Exons()};
while(@old_exons) {
my $ex = shift(@old_exons);
$cdna_start += $ex->length();
if($fs && $fs->prev_Exon->stable_id() eq $ex->stable_id()) {
$merging_exon = undef;
# merge together exons which are seperated by frameshift introns
foreach my $ex (@{$tr->get_all_Exons()}) {
while($fs && $fs->prev_Exon->stable_id() eq $ex->stable_id()) {
# this exon has a frameshift, so merge it with next exon
# and any subsequent exons serperated only by frameshifts
# was the previous exon seperated from this one by a frameshift intron?
if($fs && $fs->next_Exon()->stable_id() eq $ex->stable_id()) {
if(!$merging_exon) {
$merging_exon = {};
%{$merging_exon} = %$ex;
bless $merging_exon, ref($ex);
push @exons, $merging_exon;
# merge this exon with the previous one
if($ex->strand() == 1) {
$exons[$#exons]->end($ex->end());
} else {
$exons[$#exons]->start($ex->start());
}
%seen_evidence = ();
# merge supporting evidence
foreach my $sf (@{$ex->get_all_supporting_features()}) {
if(!$seen_evidence{ref($sf) . ':' . $sf->dbID()}) {
$ex->add_supporting_feature($sf);
$seen_evidence{ref($sf . ':' . $sf->dbID())} = 1;
}
}
if($merging_exon->strand() == 1) {
$merging_exon->end($fs->next_Exon()->end());
} else {
$merging_exon->start($fs->next_Exon()->start());
# store frameshift as transcript attrib
my $seqed = Bio::EnsEMBL::SeqEdit->new
(-CODE => '_rna_edit',
-NAME => 'RNA Edit',
-DESC => 'Post transcriptional RNA edit',
-START => $cdna_start,
-END => $cdna_start + $fs->length() - 1,
-ALT_SEQ => '');
$aa->store_on_Transcript($tr, [$seqed->get_Attribute]);
# adjust cdna coordinates for frameshifted basepairs
if($tr->translation) {
if($cdna_coding_start >= $cdna_start) {
$cdna_coding_start += $fs->length();
}
# merge supporting evidence
foreach my $ev (@{$ex->get_all_supporting_features()}) {
if(!$seen_evidence{$ev->dbID()}) {
$merging_exon->add_supporting_features($ev);
$seen_evidence{$ev->dbID()} = 1;
}
if($cdna_coding_end >= $cdna_start) {
$cdna_coding_end += $fs->length();
}
# store frameshift as transcript attrib
my $seqed = Bio::EnsEMBL::SeqEdit->new
(-CODE => '_rna_edit',
-NAME => 'RNA Edit',
-DESC => 'Post transcriptional RNA edit',
-START => $cdna_start,
-END => $cdna_start + $fs->length() - 1,
-ALT_SEQ => '');
$aa->store_on_Transcript($tr, [$seqed->get_Attribute]);
# take this frameshift and the next exon
# (which was merged into the current exon) off the list
$fs = shift(@frameshifts);
$ex = shift(@old_exons);
$cdna_start += $ex->length();
}
} else {
# this exon does not have a frameshift so just add it to the list
$cdna_start += $fs->length();
# look at the next frameshift
$fs = shift(@frameshifts);
} else {
push @exons, $ex;
%seen_evidence = ();
foreach my $sf (@{$ex->get_all_supporting_features()}) {
$seen_evidence{ref($_) . ':' . $_->dbID()} = 1;
}
}
$cdna_start += $ex->length();
}
# rebuild transcript from new set of exons
$tr->flush_Exons();
foreach my $ex (@exons) {
$new_exons{$ex->hashkey} = $ex;
$tr->add_Exon($ex);
}
$tl_cdna_starts{$tr->dbID()} = $cdna_coding_start;
$tl_cdna_ends{$tr->dbID()} = $cdna_coding_end;
}
# determine which exons can be deleted
......@@ -200,6 +226,7 @@ while(my $array = $sth->fetchrow_arrayref()) {
}
}
# update the transcript composition by updating the exon transcript table
foreach my $tr (@{$g->get_all_Transcripts()}) {
$del_et_sth->execute($tr->dbID());
......@@ -218,11 +245,57 @@ while(my $array = $sth->fetchrow_arrayref()) {
$rank++;
}
}
# update the translations to use the new exons
foreach my $tr (@{$g->get_all_Transcripts()}) {
my $tl = $tr->translation();
next if(!$tl);
my $tl_start = $tl_cdna_starts{$tr->dbID()};
my $tl_end = $tl_cdna_ends{$tr->dbID()};
foreach my $ex (@{$tr->get_all_Exons()}) {
if($tl_start > 0 && $tl_start <= $ex->length()) {
$tl->start_Exon($ex);
$tl->start($tl_start);
}
if($tl_end > 0 && $tl_end <= $ex->length()) {
$tl->end_Exon($ex);
$tl->end($tl_end);
}
$tl_start -= $ex->length();
$tl_end -= $ex->length();
}
my ($start_ex_id, $end_ex_id);
# use consolidated exon ids. Exons may have been duplicated across
# multiple transcripts but only one has been stored and had id set.
if($old_exons{$tl->start_Exon->hashkey()}) {
$start_ex_id = $old_exons{$tl->start_Exon->hashkey()}->dbID();
} else {
$start_ex_id = $new_exons{$tl->start_Exon->hashkey()}->dbID();
}
if($old_exons{$tl->end_Exon->hashkey()}) {
$end_ex_id = $old_exons{$tl->end_Exon->hashkey()}->dbID();
} else {
$end_ex_id = $new_exons{$tl->end_Exon->hashkey()}->dbID();
}
$upd_tl_sth->execute($start_ex_id, $end_ex_id,
$tl->start(), $tl->end(), $tl->dbID());
}
}
$del_et_sth->finish();
$ins_et_sth->finish();
$sth->finish();
$upd_tl_sth->finish();
print STDERR "\nAll Done.\n";
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment