From d322d934a653d7724ed3b0d94085fac0c854366d Mon Sep 17 00:00:00 2001 From: Matthew Laird <lairdm@ebi.ac.uk> Date: Wed, 1 Jun 2016 16:35:36 +0100 Subject: [PATCH] Fix for ENSCORESW-1816, bioperl's inconsistent handling of incomplete codons across versions. Trim any partial codons when translating unless told not to. --- modules/Bio/EnsEMBL/PredictionTranscript.pm | 29 ++++++++++++++++----- modules/Bio/EnsEMBL/Transcript.pm | 19 ++++++++++++-- modules/t/transcript.t | 6 +++++ 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/modules/Bio/EnsEMBL/PredictionTranscript.pm b/modules/Bio/EnsEMBL/PredictionTranscript.pm index 9ce0ab4b7d..041247253d 100644 --- a/modules/Bio/EnsEMBL/PredictionTranscript.pm +++ b/modules/Bio/EnsEMBL/PredictionTranscript.pm @@ -331,7 +331,8 @@ sub translation { =head2 translate - Args : none + Arg [1] : Boolean, emulate the behavior of old bioperl versions where + an incomplete final codon of 2 characters is padded and guessed Function : Give a peptide translation of all exons currently in the PT. Gives empty string when none is in. 
Returntype: a Bio::Seq as in transcript->translate() @@ -343,7 +344,7 @@ sub translation { sub translate { - my ($self) = @_; + my ($self, $complete_codon) = @_; my $dna = $self->translateable_seq(); @@ -358,6 +359,11 @@ sub translate { } $codon_table_id ||= 1; #default will be vertebrates + # Remove the final stop codon from the mrna + # sequence produced if it is present, this is so any peptide produced + # won't have a terminal stop codon + # if you want to have a terminal stop codon either comment this line out + # or call translatable seq directly and produce a translation from it if( CORE::length( $dna ) % 3 == 0 ) { # $dna =~ s/TAG$|TGA$|TAA$//i; my $codon_table = Bio::Tools::CodonTable->new( -id => $codon_table_id ); @@ -365,12 +371,21 @@ sub translate { if ( $codon_table->is_ter_codon( substr( $dna, -3, 3 ) ) ) { substr( $dna, -3, 3, '' ); } + } elsif ( CORE::length($dna) % 3 == 2 ) { + # If we have a partial codon of 2 bp we need to decide if we + # trim it or not to fix some bad behaviour in older bioperl + # versions + if ( $complete_codon ) { + # If we want to do the bad behavior of bioperl 1.6.1 and older + # where we guess the last codon if incomplete, pad an N + # to the mrna sequence + $dna .= 'N'; + } else { + # Otherwise trim those last two bp off so the behavior is + # consistent across bioperl versions + substr( $dna, -2, 2, '' ); + } } - # the above line will remove the final stop codon from the mrna - # sequence produced if it is present, this is so any peptide produced - # won't have a terminal stop codon - # if you want to have a terminal stop codon either comment this line out - # or call translatable seq directly and produce a translation from it my $bioseq = new Bio::Seq( -id => $self->display_id, -seq => $dna, diff --git a/modules/Bio/EnsEMBL/Transcript.pm b/modules/Bio/EnsEMBL/Transcript.pm index e41d0ffd81..9da16ac06b 100755 --- a/modules/Bio/EnsEMBL/Transcript.pm +++ b/modules/Bio/EnsEMBL/Transcript.pm @@ -2092,7 +2092,8 @@ sub 
get_all_translateable_Exons { =head2 translate - Args : none + Arg [1] : Boolean, emulate the behavior of old bioperl versions where + an incomplete final codon of 2 characters is padded and guessed Example : none Description: Return the peptide (plus eventual stop codon) for this transcript. Does N-padding of non-phase @@ -2107,7 +2108,7 @@ sub get_all_translateable_Exons { =cut sub translate { - my ($self) = @_; + my ($self, $complete_codon) = @_; if ( !defined( $self->translation() ) ) { return undef } @@ -2152,6 +2153,20 @@ sub translate { if ( $codon_table->is_ter_codon( substr( $mrna, -3, 3 ) ) ) { substr( $mrna, -3, 3, '' ); } + } elsif ( CORE::length($mrna) % 3 == 2 ) { + # If we have a partial codon of 2 bp we need to decide if we + # trim it or not to fix some bad behaviour in older bioperl + # versions + if ( $complete_codon ) { + # If we want to do the bad behavior of bioperl 1.6.1 and older + # where we guess the last codon if incomplete, pad an N + # to the mrna sequence + $mrna .= 'N'; + } else { + # Otherwise trim those last two bp off so the behavior is + # consistent across bioperl versions + substr( $mrna, -2, 2, '' ); + } } if ( CORE::length($mrna) < 1 ) { return undef } diff --git a/modules/t/transcript.t b/modules/t/transcript.t index 77061d6a64..9b5cf356f3 100644 --- a/modules/t/transcript.t +++ b/modules/t/transcript.t @@ -210,6 +210,12 @@ $tr->flush_Exons(); is( scalar( @{$tr->get_all_Exons()} ), 0, 'No exons left after flushing' ); +# Fetch a fresh tr, check incomplete codon behavior +$tr = $ta->fetch_by_stable_id( "ENST00000300425" ); + +# By default the incomplete codon should be dropped +is( $tr->translate()->seq() =~ /P$/, 1, "Incomplete codon is not translated"); +is( $tr->translate(1)->seq() =~ /PL$/, 1, "Incomplete codon is padded then translated"); # get a fresh tr to check the update method $tr = $ta->fetch_by_stable_id( "ENST00000217347" ); -- GitLab