diff --git a/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl b/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl index 49d74da433ec89dc4b8d9902091012cc8b17de62..ca7e41dd86b3de4da6c821065d058a1ceb119055 100644 --- a/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl +++ b/misc-scripts/canonical_transcripts/set_canonical_transcripts.pl @@ -1,20 +1,33 @@ +=pod -#!/usr/local/ensembl/bin/perl +SYNOPSIS + + Script to set canonical transcripts for each gene + +DESCRIPTION + + The rule is if the cluster contains translated transcripts take the + one with the longest cds otherwise take the one with the longest + cdna + +EXAMPLE + + perl set_canonical_transcripts.pl -dbhost host -dbuser user -dbpass *** -dbname dbname -dbport 3306 -coord_system toplevel -write + +EXTRA -#script to set canonical transcripts for each gene -#the rule is if the cluster contains translated transcripts take the one with the -#longest cds otherwise take the one with the longest cdna + To check the script has run correctly you can run the + CoreForeignKeys healthcheck: -#an example commandline is -# -# perl set_canonical_transcripts.pl -dbhost host -dbuser user -dbpass *** -dbname -# dbname -dbport 3306 -coord_system toplevel -write -# + ./run-healthcheck.sh -d dbname -output problem CoreForeignKeys -#To check the script has run correctly you can run the CoreForeignKeys healthcheck -# e.g ./run-healthcheck.sh -d dbname -output problem CoreForeignKeys +=cut + +#!/usr/local/ensembl/bin/perl use strict; +use Carp; +use DBI qw(:sql_types); use Getopt::Long; use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::Utils::Exception qw(throw); @@ -30,20 +43,20 @@ my $write = 0; my $include_non_ref = 1; my $verbose = 0; -&GetOptions( 'dbhost:s' => \$host, - 'dbport:n' => \$port, - 'dbname:s' => \$dbname, - 'dbuser:s' => \$user, - 'dbpass:s' => \$pass, - 'coord_system_name:s' => \$coord_system, - 'seq_region_name:s' => \$seq_region_name, - 'write!' => \$write, - 'include_non_ref!' => \$include_non_ref, - 'verbose!' => \$verbose, ); +GetOptions( 'dbhost:s' => \$host, + 'dbport:n' => \$port, + 'dbname:s' => \$dbname, + 'dbuser:s' => \$user, + 'dbpass:s' => \$pass, + 'coord_system_name:s' => \$coord_system, + 'seq_region_name:s' => \$seq_region_name, + 'write!' => \$write, + 'include_non_ref!' => \$include_non_ref, + 'verbose!' => \$verbose, ); unless ($write) { - print -" you've not used the -write option so results will not be written into the database\n"; + print "You have not used the -write option " + . "so results will not be written into the database\n"; } my $db = @@ -51,52 +64,91 @@ my $db = -user => $user, -port => $port, -dbname => $dbname, - -pass => $pass, ); + -pass => $pass ); my $sa = $db->get_SliceAdaptor; my $slices; +# Only update the canonical transcripts in this region. if ($seq_region_name) { + print "Only updating genes on chromosome $seq_region_name\n"; my $slice = $sa->fetch_by_region( $coord_system, $seq_region_name, $include_non_ref ); push( @$slices, $slice ); + + my $update_to_null = q( + UPDATE gene + SET canonical_transcript_id = NULL, + canonical_annotation = NULL + WHERE seq_region_id = ?); + my $seq_region_id = $slice->get_seq_region_id; + + my $sth = $db->dbc->prepare($update_to_null); + $sth->bind_param(1, $seq_region_id, SQL_INTEGER); + $sth->execute(); + } else { $slices = $sa->fetch_all( $coord_system, '', $include_non_ref ); -} -my $update_to_null = "update gene set canonical_transcript_id = NULL, canonical_annotation = NULL"; -$db->dbc->do($update_to_null); -# get $db->dbc->db_handle->do($update_to_null) instead if above not working + my $update_to_null = q(update gene set canonical_transcript_id = NULL, canonical_annotation = NULL); + $db->dbc->do($update_to_null); + # get $db->dbc->db_handle->do($update_to_null) instead if above not working +} my $gene_update_sql = "update gene set canonical_transcript_id = ? where gene_id = ?"; -my $sth = $db->dbc->prepare($gene_update_sql); +my $gene_sth = $db->dbc->prepare($gene_update_sql); + SLICE: foreach my $slice (@$slices) { print "Getting genes for " . $slice->name . "\n" if ($verbose); my $genes = $slice->get_all_Genes( undef, undef, 1 ); my %canonical; + GENE: foreach my $gene (@$genes) { - print "Updating gene: ", $gene->dbID, "\n"; + print "Looking at gene: ", $gene->dbID, "\n"; my $transcripts = $gene->get_all_Transcripts; if ( @$transcripts == 1 ) { $canonical{ $gene->dbID } = $transcripts->[0]->dbID; next GENE; } + + # Keep track of the transcript biotypes + my %trans; + foreach my $transcript ( @{$transcripts} ) { + push @{ $trans{ $transcript->biotype } }, $transcript; + } + + # If there is a single protein_coding transcript, make it the + # canonical transcript, if it has a functional translation. + foreach my $key ( keys(%trans) ) { + if ( ( $key eq 'protein_coding' ) && ( @{ $trans{$key} } == 1 ) ) { + my $trans_id = $trans{$key}->[0]->dbID; + unless ( $trans{$key}->[0]->translation->seq =~ /\*/ ) { + $canonical{ $gene->dbID } = $trans_id; + next GENE; + } else { + carp( "Transcript $trans_id has internal stop(s) " + . "and it shouldn't if it is protein coding\n" ); + } + } + } + my $has_translation = 0; my $count = 0; my @with_translation; my @no_translation; - foreach my $transcript (@$transcripts) { - if ( $transcript->translation + foreach my $transcript ( @{$transcripts} ) { + + if ( ( $gene->biotype ne 'pseudogene' ) && ( $gene->biotype ne 'processed_transcript' ) - && ( $gene->biotype ne 'pseudogene' ) && ( $transcript->biotype ne 'nonsense_mediated_decay' ) - && ( $transcript->biotype ne 'processed_transcript' ) ) + && ( $transcript->biotype ne 'processed_transcript' ) + && $transcript->translation + && ( $transcript->translation->seq !~ /\*/ ) ) + { - unless ( $transcript->translation->seq =~ /\*/ ) { - push( @with_translation, $transcript ); - } + push( @with_translation, $transcript ); } else { push( @no_translation, $transcript ); } @@ -120,9 +172,11 @@ GENE: foreach my $gene (@$genes) { } $canonical{ $gene->dbID } = $sorted[0]->dbID; } ## end foreach my $gene (@$genes) + foreach my $gene_id ( keys(%canonical) ) { my $transcript_id = $canonical{$gene_id}; - $sth->execute( $transcript_id, $gene_id ) if ($write); + print "Updating gene $gene_id\n"; + $gene_sth->execute( $transcript_id, $gene_id ) if ($write); } } ## end foreach my $slice (@$slices)