From 4121009c7b2e2171f0a593009be5c0f535cb02b6 Mon Sep 17 00:00:00 2001
From: Amonida Zadissa <amonida@ebi.ac.uk>
Date: Tue, 24 Aug 2010 10:43:23 +0000
Subject: [PATCH] Use the following order of biotypes when setting canonical
 transcripts:

1. protein_coding
2. nonsense_mediated_decay (if translatable)
3. non coding (may include NMD)

Ue non 'protein_coding' labelled transcript only if there are no
'protein_coding' labelled transcripts present.

Added OPTIONS section to DOC.
---
 .../set_canonical_transcripts.pl              | 102 +++++++++++++++---
 1 file changed, 87 insertions(+), 15 deletions(-)

diff --git a/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl b/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl
index a94b22f4e0..f6fb97cdbf 100644
--- a/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl
+++ b/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl
@@ -1,20 +1,46 @@
 =pod
 
-SYNOPSIS
+=head1 SYNOPSIS
 
   Script to set canonical transcripts for each gene
 
-DESCRIPTION
+=head1 DESCRIPTION
 
   The rule is if the cluster contains translated transcripts take the
   one with the longest cds otherwise take the one with the longest
   cdna
 
-EXAMPLE
+=head1 OPTIONS
+
+=head2 Required arguments
+
+  -dbname       Databsse name
+
+  -dbhost       Database host
+
+  -dbport       Database port
+
+  -dbuser       Database user
+
+  -dbpass       Database password
+
+=head2  Optional arguments
+
+  -coord_system_name    Coordinate system to use
+
+  -include_non_ref      Specify if the non_reference regions should be _excluded_. (default: include) 
+
+  -seq_region_name      Chromosome name if running a single seq_region
+
+  -write                Specify if results should be written to the database
+
+  -verbose              Increase verbosity of output messages
+
+=head1 EXAMPLE
 
   perl set_canonical_transcripts.pl -dbhost host -dbuser user -dbpass *** -dbname dbname -dbport 3306 -coord_system toplevel -write
 
-EXTRA
+=head1 EXTRA
 
   To check the script has run correctly you can run the
   CoreForeignKeys healthcheck:
@@ -26,7 +52,9 @@ EXTRA
 #!/usr/local/ensembl/bin/perl
 
 use strict;
+use warnings;
 use Carp;
+use Data::Dumper;
 use DBI qw(:sql_types);
 use Getopt::Long;
 use Bio::EnsEMBL::DBSQL::DBAdaptor;
@@ -89,7 +117,7 @@ SLICE: foreach my $slice (@$slices) {
   my %canonical;
 
 GENE: foreach my $gene (@$genes) {
-    print "Looking at gene: ", $gene->dbID, "\n";
+    print "Looking at gene: (" . $gene->dbID . ") :: " . $gene->stable_id . "\n";
     my $transcripts = $gene->get_all_Transcripts;
     if ( @$transcripts == 1 ) {
       $canonical{ $gene->dbID } = $transcripts->[0]->dbID;
@@ -107,12 +135,31 @@ GENE: foreach my $gene (@$genes) {
     foreach my $key ( keys(%trans) ) {
       if ( ( $key eq 'protein_coding' ) && ( @{ $trans{$key} } == 1 ) ) {
         my $trans_id = $trans{$key}->[0]->dbID;
-        unless ( $trans{$key}->[0]->translation->seq =~ /\*/ ) {
+        if ( $trans{$key}->[0]->translation->seq !~ /\*/ ) {
           $canonical{ $gene->dbID } = $trans_id;
           next GENE;
         } else {
           carp(   "Transcript $trans_id has internal stop(s) "
-                . "and it shouldn't if it is protein coding\n" );
+                . "and shouldn't if it is protein coding\n" );
+        }
+      }
+      elsif (    !exists( $trans{'protein_coding'} )
+              && $key eq 'nonsense_mediated_decay'
+              && ( @{ $trans{$key} } == 1 ) )
+      {
+        my $trans_id = $trans{$key}->[0]->dbID;
+        if (    ( $trans{$key}->[0]->translation )
+             && ( $trans{$key}->[0]->translation->seq !~ /\*/ ) )
+        {
+          print "The NMD transcript $trans_id will become "
+              . "the canonical transcript because there are no "
+              . "protein coding transcripts.\n";
+
+          $canonical{ $gene->dbID } = $trans_id;
+          next GENE;
+        } else {
+          carp(   "Transcript $trans_id is an NMD with either "
+                . " no translation or internal stop(s), skipping it...\n" );
         }
       }
     }
@@ -124,9 +171,9 @@ GENE: foreach my $gene (@$genes) {
 
     foreach my $transcript ( @{$transcripts} ) {
 
-      if (    ( $gene->biotype ne 'pseudogene' )
+      if (    ( $gene->biotype       ne 'pseudogene' )
            && ( $gene->biotype       ne 'processed_transcript' )
-           && ( $transcript->biotype ne 'nonsense_mediated_decay' )
+           && ( $transcript->biotype ne 'polymorphic_pseudogene' )
            && ( $transcript->biotype ne 'processed_transcript' )
            && $transcript->translation
            && ( $transcript->translation->seq !~ /\*/ ) )
@@ -140,15 +187,39 @@ GENE: foreach my $gene (@$genes) {
 
     my @sorted;
     if (@with_translation) {
-      my @len_and_trans;
+      my ( @prot_coding_len_and_trans, @len_and_trans );
       foreach my $trans (@with_translation) {
-        my $h = { trans => $trans, len => $trans->translate->length };
-        push @len_and_trans, $h;
+
+        if ($verbose) {
+          print $trans->dbID . " :: " . $trans->biotype . " :: " .  $trans->translate->length . "\n";
+        }
+
+        if ( $trans->biotype eq 'protein_coding') {
+          my $h = { trans => $trans, len => $trans->translate->length };
+          push @prot_coding_len_and_trans, $h;
+          next;
+        } else {
+          my $h = { trans => $trans, len => $trans->translate->length };
+          push @len_and_trans, $h;
+        }
+      }
+
+      my @tmp_sorted;
+      if (@prot_coding_len_and_trans) {
+        @tmp_sorted = sort { $b->{len} <=> $a->{len} } @prot_coding_len_and_trans;
+      } else {
+        if ($verbose) {
+          print "There are no 'protein_coding' labelled transcripts, "
+              . "will take the longest of the other coding transcripts\n";
+        }
+        @tmp_sorted = sort { $b->{len} <=> $a->{len} } @len_and_trans;
       }
-      my @tmp_sorted = sort { $b->{len} <=> $a->{len} } @len_and_trans;
 
       foreach my $h (@tmp_sorted) {
-        #print "Adding to sorted " . $h->{trans}->dbID . "\n";
+
+        if ($verbose) {
+          print "Adding to sorted " . $h->{trans}->dbID . " :: " . $h->{trans}->biotype . "\n";
+        }
         push @sorted, $h->{trans};
       }
     } else {
@@ -159,7 +230,8 @@ GENE: foreach my $gene (@$genes) {
 
   foreach my $gene_id ( keys(%canonical) ) {
     my $transcript_id = $canonical{$gene_id};
-    print "Updating gene $gene_id\n";
+    print "Updating gene $gene_id "
+        . "with canonical transcript: $transcript_id.\n";
     $gene_sth->execute( $transcript_id, $gene_id ) if ($write);
   }
 } ## end foreach my $slice (@$slices)
-- 
GitLab