From d322d934a653d7724ed3b0d94085fac0c854366d Mon Sep 17 00:00:00 2001
From: Matthew Laird <lairdm@ebi.ac.uk>
Date: Wed, 1 Jun 2016 16:35:36 +0100
Subject: [PATCH] Fix for ENSCORESW-1816, bioperl's inconsistent handling of
 incomplete codons across versions. Trim any partial codons when translating
 unless told not to.

---
 modules/Bio/EnsEMBL/PredictionTranscript.pm | 29 ++++++++++++++++-----
 modules/Bio/EnsEMBL/Transcript.pm           | 19 ++++++++++++--
 modules/t/transcript.t                      |  6 +++++
 3 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/modules/Bio/EnsEMBL/PredictionTranscript.pm b/modules/Bio/EnsEMBL/PredictionTranscript.pm
index 9ce0ab4b7d..041247253d 100644
--- a/modules/Bio/EnsEMBL/PredictionTranscript.pm
+++ b/modules/Bio/EnsEMBL/PredictionTranscript.pm
@@ -331,7 +331,8 @@ sub translation {
 
 =head2 translate
 
-  Args      : none
+  Arg [1]   : Boolean, emulate the behavior of old bioperl versions where
+              an incomplete final codon of 2 characters is padded and guessed
   Function  : Give a peptide translation of all exons currently in
               the PT. Gives empty string when none is in.
   Returntype: a Bio::Seq as in transcript->translate()
@@ -343,7 +344,7 @@ sub translation {
 
 
 sub translate {
-  my ($self) = @_;
+  my ($self, $complete_codon) = @_;
 
   my $dna = $self->translateable_seq();
 
@@ -358,6 +359,11 @@ sub translate {
   }
   $codon_table_id ||= 1; #default will be vertebrates
 
+  # Remove the final stop codon from the mRNA sequence if one is
+  # present, so that the peptide produced has no terminal stop codon.
+  # If you want to keep the terminal stop codon, either comment out
+  # the block below or call translateable_seq() directly and produce
+  # a translation from it.
   if( CORE::length( $dna ) % 3 == 0 ) {
    # $dna =~ s/TAG$|TGA$|TAA$//i;
       my $codon_table =  Bio::Tools::CodonTable->new( -id => $codon_table_id );
@@ -365,12 +371,21 @@ sub translate {
       if ( $codon_table->is_ter_codon( substr( $dna, -3, 3 ) ) ) {
 	  substr( $dna, -3, 3, '' );
       }
+  } elsif ( CORE::length($dna) % 3 == 2 ) {
+      # If we have a partial codon of 2 bp we need to decide if we
+      # trim it or not to fix some bad behaviour in older bioperl
+      # versions
+      if ( $complete_codon ) {
+	  # If we want to do the bad behavior of bioperl 1.6.1 and older
+	  # where we guess the last codon if incomplete, pad an N
+	  # to the mrna sequence
+	  $dna .= 'N';
+      } else {
+	  # Otherwise trim those last two bp off so the behavior is
+	  # consistent across bioperl versions
+	  substr( $dna, -2, 2, '' );
+      }
   }
-  # the above line will remove the final stop codon from the mrna
-  # sequence produced if it is present, this is so any peptide produced
-  # won't have a terminal stop codon
-  # if you want to have a terminal stop codon either comment this line out
-  # or call translatable seq directly and produce a translation from it
 
   my $bioseq = new Bio::Seq( -id       => $self->display_id,
                              -seq      => $dna,
diff --git a/modules/Bio/EnsEMBL/Transcript.pm b/modules/Bio/EnsEMBL/Transcript.pm
index e41d0ffd81..9da16ac06b 100755
--- a/modules/Bio/EnsEMBL/Transcript.pm
+++ b/modules/Bio/EnsEMBL/Transcript.pm
@@ -2092,7 +2092,8 @@ sub get_all_translateable_Exons {
 
 =head2 translate
 
-  Args       : none
+  Arg [1]    : Boolean, emulate the behavior of old bioperl versions where
+               an incomplete final codon of 2 characters is padded and guessed
   Example    : none
   Description: Return the peptide (plus eventual stop codon) for
                this transcript.  Does N-padding of non-phase
@@ -2107,7 +2108,7 @@ sub get_all_translateable_Exons {
 =cut
 
 sub translate {
-  my ($self) = @_;
+  my ($self, $complete_codon) = @_;
 
   if ( !defined( $self->translation() ) ) { return undef }
 
@@ -2152,6 +2153,20 @@ sub translate {
     if ( $codon_table->is_ter_codon( substr( $mrna, -3, 3 ) ) ) {
       substr( $mrna, -3, 3, '' );
     }
+  } elsif ( CORE::length($mrna) % 3 == 2 ) {
+      # If we have a partial codon of 2 bp we need to decide if we
+      # trim it or not to fix some bad behaviour in older bioperl
+      # versions
+      if ( $complete_codon ) {
+	  # If we want to do the bad behavior of bioperl 1.6.1 and older
+	  # where we guess the last codon if incomplete, pad an N
+	  # to the mrna sequence
+	  $mrna .= 'N';
+      } else {
+	  # Otherwise trim those last two bp off so the behavior is
+	  # consistent across bioperl versions
+	  substr( $mrna, -2, 2, '' );
+      }
   }
 
   if ( CORE::length($mrna) < 1 ) { return undef }
diff --git a/modules/t/transcript.t b/modules/t/transcript.t
index 77061d6a64..9b5cf356f3 100644
--- a/modules/t/transcript.t
+++ b/modules/t/transcript.t
@@ -210,6 +210,12 @@ $tr->flush_Exons();
 
 is( scalar( @{$tr->get_all_Exons()} ), 0, 'No exons left after flushing' );
 
+# Fetch a fresh tr, check incomplete codon behavior
+$tr = $ta->fetch_by_stable_id( "ENST00000300425" );
+
+# By default the incomplete codon should be dropped
+is( $tr->translate()->seq() =~ /P$/, 1, "Incomplete codon is not translated");
+is( $tr->translate(1)->seq() =~ /PL$/, 1, "Incomplete codon is padded then translated");
 
 # get a fresh tr to check the update method
 $tr = $ta->fetch_by_stable_id( "ENST00000217347" );
-- 
GitLab