Skip to content
Snippets Groups Projects
Commit 23d93792 authored by Amonida Zadissa's avatar Amonida Zadissa
Browse files

Modifications to the code:

* If there is only one protein_coding transcript in a gene, use it as
  the canonical transcript but make sure it doesn't have internal stops.

* If a seq_region_name is given, zero set the canonical_transcript_id
  and canonical_annotation, otherwise zero set all
  canonical_transcript_ids.

Added minimal doc.
parent 963b1ce3
No related branches found
No related tags found
No related merge requests found
=pod
#!/usr/local/ensembl/bin/perl
SYNOPSIS
Script to set canonical transcripts for each gene
DESCRIPTION
The rule is if the cluster contains translated transcripts take the
one with the longest cds otherwise take the one with the longest
cdna
EXAMPLE
perl set_canonical_transcripts.pl -dbhost host -dbuser user -dbpass *** -dbname dbname -dbport 3306 -coord_system toplevel -write
EXTRA
#script to set canonical transcripts for each gene
#the rule is if the cluster contains translated transcripts take the one with the
#longest cds otherwise take the one with the longest cdna
To check the script has run correctly you can run the
CoreForeignKeys healthcheck:
#an example commandline is
#
# perl set_canonical_transcripts.pl -dbhost host -dbuser user -dbpass *** -dbname
# dbname -dbport 3306 -coord_system toplevel -write
#
./run-healthcheck.sh -d dbname -output problem CoreForeignKeys
#To check the script has run correctly you can run the CoreForeignKeys healthcheck
# e.g ./run-healthcheck.sh -d dbname -output problem CoreForeignKeys
=cut
#!/usr/local/ensembl/bin/perl
use strict;
use Carp;
use DBI qw(:sql_types);
use Getopt::Long;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw);
......@@ -30,20 +43,20 @@ my $write = 0;
my $include_non_ref = 1;
my $verbose = 0;
&GetOptions( 'dbhost:s' => \$host,
'dbport:n' => \$port,
'dbname:s' => \$dbname,
'dbuser:s' => \$user,
'dbpass:s' => \$pass,
'coord_system_name:s' => \$coord_system,
'seq_region_name:s' => \$seq_region_name,
'write!' => \$write,
'include_non_ref!' => \$include_non_ref,
'verbose!' => \$verbose, );
GetOptions( 'dbhost:s' => \$host,
'dbport:n' => \$port,
'dbname:s' => \$dbname,
'dbuser:s' => \$user,
'dbpass:s' => \$pass,
'coord_system_name:s' => \$coord_system,
'seq_region_name:s' => \$seq_region_name,
'write!' => \$write,
'include_non_ref!' => \$include_non_ref,
'verbose!' => \$verbose, );
unless ($write) {
print
" you've not used the -write option so results will not be written into the database\n";
print "You have not used the -write option "
. "so results will not be written into the database\n";
}
my $db =
......@@ -51,52 +64,91 @@ my $db =
-user => $user,
-port => $port,
-dbname => $dbname,
-pass => $pass, );
-pass => $pass );
my $sa = $db->get_SliceAdaptor;
my $slices;
# Only update the canonical transcripts in this region.
if ($seq_region_name) {
print "Only updating genes on chromosome $seq_region_name\n";
my $slice =
$sa->fetch_by_region( $coord_system, $seq_region_name, $include_non_ref );
push( @$slices, $slice );
my $update_to_null = q(
UPDATE gene
SET canonical_transcript_id = NULL,
canonical_annotation = NULL
WHERE seq_region_id = ?);
my $seq_region_id = $slice->get_seq_region_id;
my $sth = $db->dbc->prepare($update_to_null);
$sth->bind_param(1, $seq_region_id, SQL_INTEGER);
$sth->execute();
} else {
$slices = $sa->fetch_all( $coord_system, '', $include_non_ref );
}
my $update_to_null = "update gene set canonical_transcript_id = NULL, canonical_annotation = NULL";
$db->dbc->do($update_to_null);
# get $db->dbc->db_handle->do($update_to_null) instead if above not working
my $update_to_null = q(update gene set canonical_transcript_id = NULL, canonical_annotation = NULL);
$db->dbc->do($update_to_null);
# get $db->dbc->db_handle->do($update_to_null) instead if above not working
}
my $gene_update_sql = "update gene set canonical_transcript_id = ? where gene_id = ?";
my $sth = $db->dbc->prepare($gene_update_sql);
my $gene_sth = $db->dbc->prepare($gene_update_sql);
SLICE: foreach my $slice (@$slices) {
print "Getting genes for " . $slice->name . "\n" if ($verbose);
my $genes = $slice->get_all_Genes( undef, undef, 1 );
my %canonical;
GENE: foreach my $gene (@$genes) {
print "Updating gene: ", $gene->dbID, "\n";
print "Looking at gene: ", $gene->dbID, "\n";
my $transcripts = $gene->get_all_Transcripts;
if ( @$transcripts == 1 ) {
$canonical{ $gene->dbID } = $transcripts->[0]->dbID;
next GENE;
}
# Keep track of the transcript biotypes
my %trans;
foreach my $transcript ( @{$transcripts} ) {
push @{ $trans{ $transcript->biotype } }, $transcript;
}
# If there is a single protein_coding transcript, make it the
# canonical transcript, if it has a functional translation.
foreach my $key ( keys(%trans) ) {
if ( ( $key eq 'protein_coding' ) && ( @{ $trans{$key} } == 1 ) ) {
my $trans_id = $trans{$key}->[0]->dbID;
unless ( $trans{$key}->[0]->translation->seq =~ /\*/ ) {
$canonical{ $gene->dbID } = $trans_id;
next GENE;
} else {
carp( "Transcript $trans_id has internal stop(s) "
. "and it shouldn't if it is protein coding\n" );
}
}
}
my $has_translation = 0;
my $count = 0;
my @with_translation;
my @no_translation;
foreach my $transcript (@$transcripts) {
if ( $transcript->translation
foreach my $transcript ( @{$transcripts} ) {
if ( ( $gene->biotype ne 'pseudogene' )
&& ( $gene->biotype ne 'processed_transcript' )
&& ( $gene->biotype ne 'pseudogene' )
&& ( $transcript->biotype ne 'nonsense_mediated_decay' )
&& ( $transcript->biotype ne 'processed_transcript' ) )
&& ( $transcript->biotype ne 'processed_transcript' )
&& $transcript->translation
&& ( $transcript->translation->seq !~ /\*/ ) )
{
unless ( $transcript->translation->seq =~ /\*/ ) {
push( @with_translation, $transcript );
}
push( @with_translation, $transcript );
} else {
push( @no_translation, $transcript );
}
......@@ -120,9 +172,11 @@ GENE: foreach my $gene (@$genes) {
}
$canonical{ $gene->dbID } = $sorted[0]->dbID;
} ## end foreach my $gene (@$genes)
foreach my $gene_id ( keys(%canonical) ) {
my $transcript_id = $canonical{$gene_id};
$sth->execute( $transcript_id, $gene_id ) if ($write);
print "Updating gene $gene_id\n";
$gene_sth->execute( $transcript_id, $gene_id ) if ($write);
}
} ## end foreach my $slice (@$slices)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment