Skip to content
Snippets Groups Projects
Commit a83d08ac authored by Graham McVicker's avatar Graham McVicker
Browse files

fixed some bugs, more debug info & sanity checks

parent b0a5960a
No related branches found
No related tags found
No related merge requests found
......@@ -247,6 +247,9 @@ sub process_cds_delete {
$transcript->move_cdna_coding_end(-$del_len);
if($frameshift && !$entire_delete) {
print STDERR "BEFORE CDS INSERT:\n";
print_exon($exon, $transcript);
$code |= StatMsg::FRAMESHIFT if($frameshift);
# this is going to require splitting the exon
......@@ -269,6 +272,14 @@ sub process_cds_delete {
# very short exons can be entirely consumed by the intron
if($intron_len == $exon->length()) {
# still adjust this 0 length intron, because its length
# is used in transcript splitting calculations
$exon->cdna_end($exon->cdna_end - $intron_len);
if($exon->strand() == 1) {
$exon->end($exon->end - $intron_len);
} else {
$exon->start($exon->start + $intron_len);
}
$code |= StatMsg::ALL_INTRON;
$exon->fail(1);
}
......@@ -304,6 +315,9 @@ sub process_cds_delete {
$first_exon->end($first_exon->start() + $first_len - 1);
$transcript->add_Exon($first_exon);
print STDERR "FIRST EXON:\n";
print_exon($first_exon, $transcript);
$exon->cdna_start($first_exon->cdna_end() + 1);
$exon->flush_StatMsgs();
}
......@@ -321,6 +335,9 @@ sub process_cds_delete {
$first_exon->start($exon->end() - $first_len + 1);
$transcript->add_Exon($first_exon);
print STDERR "FIRST EXON:\n";
print_exon($first_exon, $transcript);
$exon->cdna_start($first_exon->cdna_end() + 1);
$exon->flush_StatMsgs();
}
......@@ -330,6 +347,9 @@ sub process_cds_delete {
$exon->cdna_end($exon->cdna_end - $intron_len);
}
}
print STDERR "AFTER CDS INSERT:\n";
print_exon($exon, $transcript);
}
}
......@@ -348,18 +368,30 @@ sub process_cds_delete {
}
sub print_exon {
my $exon = shift;
my $tr = shift;
if (!$exon) {
throw("Exon undefined");
}
print STDERR "EXON:\n";
print STDERR " cdna_start = ". $exon->cdna_start() . "\n";
print STDERR " cdna_end = ". $exon->cdna_end() . "\n";
print STDERR " start = ". $exon->start() . "\n";
print STDERR " end = ". $exon->end() . "\n";
print STDERR " strand = ". $exon->strand() . "\n\n";
if($tr) {
print STDERR "TRANSCRIPT:\n";
print STDERR " cdna_coding_start = ". $tr->cdna_coding_start() . "\n";
print STDERR " cdna_coding_end = ". $tr->cdna_coding_end(). "\n";
}
return;
}
#sub print_exon {
#my $exon = shift;
#
#print "EXON:\n";
#print "cdna_start = ". $exon->cdna_start() . "\n";
#print "cdna_end = ". $exon->cdna_end() . "\n";
#print "start = ". $exon->start() . "\n";
#print "end = ". $exon->end() . "\n";
#print "strand = ". $exon->strand() . "\n\n";
#return;
#}
1;
......@@ -52,8 +52,8 @@ sub process_insert {
$$cdna_ins_pos_ref < $transcript->cdna_coding_end()) {
info("insertion in cds ($insert_len)");
#print "BEFORE CDS INSERT:\n";
#print_exon($exon, $transcript);
print "BEFORE CDS INSERT:\n";
print_exon($exon, $transcript);
$code |= StatMsg::CDS;
......@@ -108,10 +108,12 @@ sub process_insert {
# start the next exon after the frameshift intron
$exon->end($exon->end() - ($first_len + $frameshift));
}
$transcript->add_Exon($first_exon);
}
#print "AFTER CDS INSERT:\n";
#print_exon($exon, $transcript);
print STDERR "AFTER CDS INSERT:\n";
print_exon($exon, $transcript);
}
......
......@@ -4,7 +4,7 @@ use warnings;
package InterimExon;
use Bio::EnsEMBL::Utils::Exception qw(info);
use Bio::EnsEMBL::Utils::Exception qw(info warning);
use StatMsg;
......@@ -143,7 +143,13 @@ sub end_phase {
sub fail {
my $self = shift;
$self->{'fail'} = shift if(@_);
if(@_) {
my $fail = shift;
warning("Setting ".$self->stable_id." to failed.\n") if($fail);
$self->{'fail'} = $fail;
}
return $self->{'fail'};
}
......
......@@ -3,6 +3,10 @@ use warnings;
package InterimTranscript;
use Bio::EnsEMBL::Utils::Exception qw(warning);
sub new {
my $class = shift;
......@@ -33,7 +37,9 @@ sub last_StatMsg {
sub add_Exon {
my $self = shift;
push @{$self->{'exons'}}, shift;
my $exon = shift;
push @{$self->{'exons'}}, $exon;
}
sub get_all_Exons {
......
......@@ -8,6 +8,7 @@ use warnings;
package Length;
use StatMsg;
use Bio::EnsEMBL::Utils::Exception qw(throw);
use constant MEDIUM => 9; # 8 or less is short
use constant LONG => 19; # 9-18 is medium, >18 is long
......@@ -44,7 +45,9 @@ sub length2code {
my $length = shift;
return StatMsg::SHORT if(is_short($length));
return StatMsg::MEDIUM if(is_medium($length));
return StatMsg::LONG if(is_medium($length));
return StatMsg::LONG if(is_long($length));
throw("Could not resolve length code for length=$length");
}
......
......@@ -120,12 +120,16 @@ sub check_iexons {
debug("checking exons for : " . $itranscript->stable_id());
my $first = 1;
foreach my $iexon (@{$itranscript->get_all_Exons}) {
#
# TBD being 'fatal' can have some knockon effects
# such as the next exon being part of the error (when split due to
# frameshifting). Do something about this...
# Maybe we should throw out all exons with same human stable identifier
# when one of them is fatal?
#
if ($iexon->fail() || $iexon->is_fatal()) {
debug(" failed/fatal exon, splitting transcript");
......@@ -141,6 +145,13 @@ sub check_iexons {
return check_iexons($itranscript, $itranscript_array);
}
#sanity check: expect first exon to have cdna_start = 1
if($first && $iexon->cdna_start != 1) {
print_exon($iexon);
throw("Unexpected: first exon does not have cdna_start = 1");
}
$first = 0;
# sanity check: start must be less than or equal to end
if ($iexon->end() < $iexon->start()) {
throw("Unexpected: exon start less than end:\n" .
......@@ -163,7 +174,8 @@ sub check_iexons {
my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT|StatMsg::SCAFFOLD_SPAN);
$itranscript->add_StatMsg($stat_msg);
my $first_transcript = split_itrans($itranscript, $iexon);
my $keep_exon = 1;
my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);
if ($first_transcript) {
$itranscript_array ||= [];
......@@ -182,7 +194,8 @@ sub check_iexons {
my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::INVERT);
$itranscript->add_StatMsg($stat_msg);
my $first_transcript = split_itrans($itranscript, $iexon);
my $keep_exon = 1;
my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);
if ($first_transcript) {
$itranscript_array ||= [];
......@@ -198,7 +211,8 @@ sub check_iexons {
my $stat_msg = StatMsg->new(StatMsg::TRANSCRIPT | StatMsg::STRAND_FLIP);
$itranscript->add_StatMsg($stat_msg);
my $first_transcript = split_itrans($itranscript, $iexon);
my $keep_exon = 1;
my $first_transcript = split_itrans($itranscript, $iexon, $keep_exon);
if ($first_transcript) {
$itranscript_array ||= [];
......@@ -226,41 +240,57 @@ sub check_iexons {
#
# splits an interim transcript into two parts by discarding
# Splits an interim transcript into two parts by discarding
# a 'bad exon' in the middle.
#
# If the 'bad exon' was the first exon then undef is returned, otherwise
# the first transcript is returned.
# the passed in transcript is adjusted to become the second transcript
#
# The passed in transcript is adjusted to become the second transcript
#
# The keep_exon argument is a flag. If true the 'bad exon' is kept as part
# of the second transcript, otherwise it is discarded.
#
sub split_itrans {
my $itrans = shift;
my $bad_exon = shift; #fulcrum exon to split on
my $keep_exon = shift;
my @all_exons = @{$itrans->get_all_Exons()};
my @remaining_exons = @{$itrans->get_all_Exons()};
my @first_exons;
my @second_exons;
my $first_trans = InterimTranscript->new();
$first_trans->stable_id($itrans->stable_id());
my $cur_exon = shift(@all_exons);
my $cur_exon = shift(@remaining_exons);
# all the exons until the 'bad exon' are loaded into the first transcript
debug("==FIRST TRANSCRIPT:\n");
while($cur_exon && $cur_exon != $bad_exon) {
print_exon($cur_exon);
push @first_exons, $cur_exon;
$first_trans->add_Exon($cur_exon);
$cur_exon = shift(@all_exons);
$cur_exon = shift(@remaining_exons);
}
if(!$cur_exon) {
throw("unexpected: could not find bad exon in transcript");
}
debug("==BAD EXON: (keep = $keep_exon)\n");
print_exon($bad_exon);
# keep the 'bad exon' if the flag was set
if($keep_exon) {
unshift @remaining_exons, $bad_exon;
}
# the remaining exons are reloaded into the second (original) transcript
$itrans->flush_Exons();
while(my $ex = shift(@all_exons)) {
push @second_exons, $ex;
$itrans->add_Exon($ex);
debug("==SECOND TRANSCRIPT:\n");
foreach my $exon (@remaining_exons) {
print_exon($exon);
$itrans->add_Exon($exon);
}
#
......@@ -271,7 +301,7 @@ sub split_itrans {
my $last_ex = $first_exons[$#first_exons];
# set the coding start and coding end to same as original transcript,
# then adjust for ommitted
# then adjust for ommitted exons
$first_trans->cdna_coding_start($itrans->cdna_coding_start());
$first_trans->cdna_coding_end($itrans->cdna_coding_end());
......@@ -282,7 +312,7 @@ sub split_itrans {
$first_trans->cdna_coding_end(0);
}
elsif($last_ex->cdna_end() > $first_trans->cdna_coding_start() &&
$last_ex->cdna_end() < $first_trans->cdna_coding_end()) {
$last_ex->cdna_end() < $first_trans->cdna_coding_end()) {
# coding sequence is cut by coding end
$first_trans->cdna_coding_end($last_ex->cdna_end());
}
......@@ -291,22 +321,31 @@ sub split_itrans {
#
# adjust the coding end/start of the second transcript
#
if(@second_exons) {
my $first_ex = $second_exons[0];
my $last_ex = $second_exons[$#second_exons];
if(@remaining_exons) {
my $first_ex = $remaining_exons[0];
my $last_ex = $remaining_exons[$#remaining_exons];
# all of the cdna coodinates need to be shifted up by the
# amount of bases used by the first transcript & bad exon
my $cdna_shift = 0;
foreach my $ex (@first_exons) {
$cdna_shift += $ex->length();
# debug("cdna_shift = $cdna_shift\n");
}
if(!$bad_exon->fail()) {
# if we threw away the 'bad' exon we want to take into account
# how much sequence it had,
if(!$keep_exon &&
defined($bad_exon->cdna_start()) &&
defined($bad_exon->cdna_end())) {
$cdna_shift += $bad_exon->length();
}
# debug("Shifting CDNA of second transcript by $cdna_shift\n");
# debug("First exon (2nd trans) cdna_start = ".$remaining_exons[0]->cdna_start()."\n");
if($cdna_shift) {
foreach my $ex (@second_exons) {
foreach my $ex (@remaining_exons) {
if(!$ex->fail()) {
$ex->cdna_start($ex->cdna_start() - $cdna_shift);
$ex->cdna_end($ex->cdna_end() - $cdna_shift);
......@@ -326,9 +365,7 @@ sub split_itrans {
}
}
return $first_trans if(@first_exons);
return undef;
return (@first_exons) ? $first_trans : undef;
}
#
......@@ -342,7 +379,7 @@ sub make_Transcript {
my $transcript = Bio::EnsEMBL::Transcript->new();
$transcript->stable_id($itrans->stable_id);
debug("making final transcript for ". $itrans->stable_id);
# debug("making final transcript for ". $itrans->stable_id);
my $translation;
......@@ -384,12 +421,12 @@ sub make_Transcript {
if ($iexon->cdna_start() <= $itrans->cdna_coding_end() &&
$iexon->cdna_end() >= $itrans->cdna_coding_end()) {
debug("end exon=".$exon->stable_id()."\n".
" ex_coding_start=".$iexon->cdna_start()."\n".
" ex_coding_end=".$iexon->cdna_end()."\n".
" start=".$iexon->start()."\n".
" end=".$iexon->end()."\n".
" transl_end = ".$itrans->cdna_coding_end());
#debug("end exon=".$exon->stable_id()."\n".
# " ex_coding_start=".$iexon->cdna_start()."\n".
# " ex_coding_end=".$iexon->cdna_end()."\n".
# " start=".$iexon->start()."\n".
# " end=".$iexon->end()."\n".
# " transl_end = ".$itrans->cdna_coding_end());
my $translation_end =
$itrans->cdna_coding_end() - $iexon->cdna_start() + 1;
......@@ -399,9 +436,62 @@ sub make_Transcript {
}
}
if($translation && !$translation->start_Exon()) {
print STDERR "Could not find translation start exon in transcript.\n";
print STDERR "FIRST EXON:\n";
print_exon($itrans->get_all_Exons->[0]);
print STDERR "LAST EXON:\n";
print_exon($itrans->get_all_Exons->[-1], $itrans);
}
if($translation && !$translation->end_Exon()) {
print STDERR "Could not find translation end exon in transcript.\n";
print STDERR "FIRST EXON:\n";
print_exon($itrans->get_all_Exons->[0]);
print STDERR "LAST EXON:\n";
print_exon($itrans->get_all_Exons->[-1], $itrans);
}
return $transcript;
}
sub print_exon {
my $exon = shift;
my $tr = shift;
if (!$exon) {
throw("Exon undefined");
}
print STDERR " ",$exon->stable_id, "\n";
print STDERR " cdna_start = ",$exon->cdna_start(),"\n"
if(defined($exon->cdna_start()));
print STDERR " cdna_end = ", $exon->cdna_end(), "\n"
if(defined($exon->cdna_end()));
print STDERR " start = ". $exon->start() . "\n"
if(defined($exon->start()));
print STDERR " end = ". $exon->end() . "\n"
if(defined($exon->end()));
print STDERR " strand = ". $exon->strand() . "\n"
if(defined($exon->strand()));
if($exon->fail) {
print STDERR " FAILED\n";
}
if($tr) {
print STDERR " TRANSCRIPT:\n";
print STDERR " cdna_coding_start = ". $tr->cdna_coding_start() . "\n";
print STDERR " cdna_coding_end = ". $tr->cdna_coding_end(). "\n\n";
}
return;
}
sub debug {
my $msg = shift;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment