select_canonical_transcripts.pl 10 KB
Newer Older
1
#!/usr/bin/env perl
Magali Ruffier's avatar
Magali Ruffier committed
2
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Tiago Grego's avatar
Tiago Grego committed
3
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
4 5 6 7 8 9 10 11 12 13 14 15 16
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#      http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Dan Bolser's avatar
Dan Bolser committed
17 18
# Script that selects the best candidate for canonical transcripts on
# each gene.
19

20 21 22 23 24 25
# For usage instructions, run ./select_canonical_transcripts.pl -help


use strict;
use warnings;

26
use Getopt::Long;
Dan Bolser's avatar
Dan Bolser committed
27
use IO::File;
28

29 30
use Bio::EnsEMBL::Utils::Exception qw(throw);
use Bio::EnsEMBL::Utils::TranscriptSelector;
31
use Bio::EnsEMBL::DBSQL::DBAdaptor;
32
use Bio::EnsEMBL::Registry;
33

Dan Bolser's avatar
Dan Bolser committed
34
my ($host, $port, $dbname, $user, $pass);
35 36
my ($dnahost, $dnaport, $dnadbname, $dnauser, $dnapass);
my ($ccds_host, $ccds_dbname, $ccds_user, $ccds_port, $ccds_pass);
Dan Bolser's avatar
Dan Bolser committed
37
my ($log_path, $help);
38

39 40
Bio::EnsEMBL::Registry->no_cache_warnings(1);

41 42
my $coord_system_name;
my $seq_region_name;
Dan Bolser's avatar
Dan Bolser committed
43 44 45 46

# keep as undefined unless you only want to run on a specific analysis
my $logic_name;

47 48 49 50 51 52 53 54 55 56
my $write = 0;
my $include_non_ref = 1;
my $include_duplicates;
my $verbose = 0;

GetOptions( 'dbhost:s'            => \$host,
            'dbport:n'            => \$port,
            'dbname:s'            => \$dbname,
            'dbuser:s'            => \$user,
            'dbpass:s'            => \$pass,
57
            'dnadbhost:s'         => \$dnahost,
58
            'dnadbname:s'         => \$dnadbname,
59 60 61
            'dnadbuser:s'         => \$dnauser,
            'dnadbpass:s'         => \$dnapass,
            'dnadbport:s'         => \$dnaport,
62 63 64
            'ccdshost:s'          => \$ccds_host,
            'ccdsdbname:s'        => \$ccds_dbname,
            'ccdsuser:s'          => \$ccds_user,
65
            'ccdsport:s'          => \$ccds_port,
66
            'ccdspass:s'          => \$ccds_pass,
Dan Bolser's avatar
Dan Bolser committed
67

68 69
            'coord_system_name:s' => \$coord_system_name,
            'seq_region_name:s'   => \$seq_region_name,
Dan Bolser's avatar
Dan Bolser committed
70

71 72 73 74 75
            'logic_name:s'        => \$logic_name,
            'write!'              => \$write,
            'include_non_ref!'    => \$include_non_ref,
            'include_duplicates'  => \$include_duplicates,
            'verbose!'            => \$verbose, 
Dan Bolser's avatar
Dan Bolser committed
76 77 78

            # log file used for analysing choices in bulk
            'log:s'               => \$log_path,
79
            'help!'               => \$help,
Dan Bolser's avatar
Dan Bolser committed
80 81 82 83
            'h!'                  => \$help
    ) or die "check options\n";

if ($help) { &usage; exit; }
84 85

unless ($write) {
86
  print "You have not used the -write option "
87
      . "so results will not be written into the database\n";
88 89
}

90
my $dba =
Dan Bolser's avatar
Dan Bolser committed
91 92 93 94 95 96 97 98 99 100
  new Bio::EnsEMBL::DBSQL::DBAdaptor(
      -host   => $host,
      -user   => $user,
      -port   => $port,
      -dbname => $dbname,
      -pass   => $pass,
      -species => 'default',
      -no_cache => 1,
    );

101 102
if($dnadbname) {
  if(!$dnauser || !$dnahost) {
Dan Bolser's avatar
Dan Bolser committed
103 104
    throw ("You must provide user, host and dbname details ".
           "to connect to DNA DB!");
105
  }
Dan Bolser's avatar
Dan Bolser committed
106 107 108 109 110 111 112 113 114
  my $dna_db =
      new Bio::EnsEMBL::DBSQL::DBAdaptor(
          -host   => $dnahost,
          -user       => $dnauser,
          -port       => $dnaport,
          -dbname     => $dnadbname,
          -pass       => $dnapass,
          -species    => 'dna_'.$dba->species(),
      );
115 116 117 118 119
  $dba->dnadb($dna_db);
}
else {
  my $dna = check_if_DB_contains_DNA($dba);
  if(!$dna) {
Dan Bolser's avatar
Dan Bolser committed
120 121
    throw ("Your gene DB contains no DNA. ".
           "You must provide a DNA_DB to connect to");
122 123 124
  }
}

125 126 127 128
my $ccds_dba;

if ($ccds_dbname) {
  if (!$ccds_user || !$ccds_host) {
Dan Bolser's avatar
Dan Bolser committed
129 130
    throw ("You must provide user, host and dbname details ".
           "to connect to CCDS DB!");
131
  }
Dan Bolser's avatar
Dan Bolser committed
132 133 134 135 136 137 138 139 140 141
  $ccds_dba =
  new Bio::EnsEMBL::DBSQL::DBAdaptor(
      -host   => $ccds_host,
      -user   => $ccds_user,
      -port   => $ccds_port,
      -pass   => $ccds_pass,
      -dbname => $ccds_dbname,
      -species => 'ccds_'.$dba->species(),
      -no_cache => 1,
  );
142
}
143 144 145

my $log_fh;
if ($log_path) {
Dan Bolser's avatar
Dan Bolser committed
146 147 148
    $log_fh = IO::File->new($log_path,"w")
        or throw ("Could not open logging file.");
}
149

Dan Bolser's avatar
Dan Bolser committed
150 151
my $transcript_selector =
    Bio::EnsEMBL::Utils::TranscriptSelector->new($ccds_dba);
152 153 154

my $slice_adaptor = $dba->get_SliceAdaptor;
my $slices;
Dan Bolser's avatar
Dan Bolser committed
155

156
if ($seq_region_name) {
Dan Bolser's avatar
Dan Bolser committed
157 158 159 160 161 162 163 164 165
    $slices = [
        $slice_adaptor->
          fetch_by_region(
              $coord_system_name,
              $seq_region_name,
              $include_non_ref,
              $include_duplicates
          )
        ];
166
}
Dan Bolser's avatar
Dan Bolser committed
167 168 169 170 171 172 173 174 175 176 177 178
else {
    if (!$coord_system_name) {
        throw 'Requires a coordinate system name to function in this mode';
    }
    $slices = $slice_adaptor->
        fetch_all($coord_system_name,
                  '',
                  $include_non_ref,
                  $include_duplicates
        );
}

179 180
my $canonical_changes = 0;
my $total_genes = 0;
181

182 183 184
my @change_list;

foreach my $slice (@$slices) {
Dan Bolser's avatar
Dan Bolser committed
185 186 187
    my $genes = $slice->
        get_all_Genes($logic_name, undef, 1);

188 189
    while (my $gene = shift @$genes) {
        $total_genes++;
Dan Bolser's avatar
Dan Bolser committed
190 191 192
        my $new_canonical = $transcript_selector->
            select_canonical_transcript_for_Gene($gene);

193
        my $old_canonical = $gene->canonical_transcript;
Dan Bolser's avatar
Dan Bolser committed
194 195

        if ( !defined $old_canonical ) {
196 197
            # Original canonical transcript is now absent, or never set.
            if ($log_fh) {
198
                print $log_fh "//\n";
199
                print $log_fh "Old=[undef,undef,undef,undef,undef,undef,undef]\n";
Dan Bolser's avatar
Dan Bolser committed
200 201
                printf $log_fh "New=[%s,%s,%s,%s,%s,%s,'%s']\n",
                  @{ $transcript_selector->encode_transcript($new_canonical) };
202
            }
Dan Bolser's avatar
Dan Bolser committed
203
            push @change_list, [ $gene->dbID, $new_canonical->dbID ];
204
            $canonical_changes++;
Dan Bolser's avatar
Dan Bolser committed
205 206 207
        }

        elsif ($new_canonical->dbID != $old_canonical->dbID) {
208
            no warnings 'uninitialized';
Dan Bolser's avatar
Dan Bolser committed
209 210 211 212 213 214 215 216 217 218
            printf
                "%s (%s) changed transcript from %s (%s) to %s (%s)\n",
                $gene->stable_id,
                $gene->dbID,
                $old_canonical->stable_id,
                $old_canonical->dbID,
                $new_canonical->stable_id,
                $new_canonical->dbID;

            push @change_list, [ $gene->dbID, $new_canonical->dbID ];
219
            $canonical_changes++;
Dan Bolser's avatar
Dan Bolser committed
220

221 222 223 224 225 226
            if ($verbose) {
                printf "Old transcript: [%s,%s,%s,%s,%s,%s,%s]\n",
                    @{ $transcript_selector->encode_transcript($old_canonical) };
                printf "New transcript: [%s,%s,%s,%s,%s,%s,%s]\n",
                    @{ $transcript_selector->encode_transcript($new_canonical) };
            }
Dan Bolser's avatar
Dan Bolser committed
227

228
            if ($log_fh) {
Dan Bolser's avatar
Dan Bolser committed
229 230 231 232 233
                print $log_fh "//\n";
                printf $log_fh "Old=[%s,%s,%s,%s,%s,%s,'%s']\n",
                  @{ $transcript_selector->encode_transcript($old_canonical) };
                printf $log_fh "New=[%s,%s,%s,%s,%s,%s,'%s']\n",
                  @{ $transcript_selector->encode_transcript($new_canonical) };
234 235 236
            }
        }
    }
237
}
238

Dan Bolser's avatar
Dan Bolser committed
239 240 241 242
print
    "Canonical transcript alterations: $canonical_changes ".
    "from $total_genes genes\n";
if ($log_fh) {$log_fh->close}
243 244 245



Dan Bolser's avatar
Dan Bolser committed
246
## Change database entries.
247 248 249
if ($write) {
    my $gene_update_sql = "UPDATE gene SET canonical_transcript_id = ? where gene_id = ?";
    my $gene_sth = $dba->dbc->prepare($gene_update_sql);
Dan Bolser's avatar
Dan Bolser committed
250

251 252
    print "Updating database with new canonical transcripts...\n";
    foreach my $change (@change_list) {
Dan Bolser's avatar
Dan Bolser committed
253
        print "Changin' ". $change->[1]. " on ". $change->[0]."\n" if $verbose;
254 255 256
        $gene_sth->execute( $change->[1], $change->[0]);
    }
    print "Done\n";
257 258
}

Dan Bolser's avatar
Dan Bolser committed
259 260 261 262 263
print "Done\n";



## Subroutines
264

265
sub check_if_DB_contains_DNA {
Dan Bolser's avatar
Dan Bolser committed
266 267 268 269
  my $dba = shift;
  my $sql_command = "SELECT COUNT(*) FROM dna";

  my $sth = $dba->dbc->prepare($sql_command);
270
  $sth->execute();
Dan Bolser's avatar
Dan Bolser committed
271

272
  my @dna_array = $sth->fetchrow_array;
Dan Bolser's avatar
Dan Bolser committed
273

274
  if ( $dna_array[0] > 0 ) {
Dan Bolser's avatar
Dan Bolser committed
275 276 277 278
    print
      "Your DB ". $dba->dbc->dbname. " contains DNA sequences. ".
      "No need to attach a DNA_DB to it.\n"
        if $verbose;
279
    return 1;
Dan Bolser's avatar
Dan Bolser committed
280 281 282 283 284
  }
  else {
    print
      "Your DB " . $dba->dbc->dbname . " does not contain DNA sequences.\n"
        if $verbose;
285 286 287 288
    return 0;
  }
}

289

Dan Bolser's avatar
Dan Bolser committed
290
## POD anyone?
291

292
sub usage {
Dan Bolser's avatar
Dan Bolser committed
293 294
print <<EOS

295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
Example usage: perl set_canonical_transcripts.pl -dbhost host -dbuser user 
     -dbpass *** -dbname dbname -dbport 3306 -coord_system toplevel -write

Script options:

    -dbname       Database name

    -dbhost       Database host

    -dbport       Database port

    -dbuser       Database user

    -dbpass       Database password

Optional DB connection arguments:

    -dnadbname    DNA Database name

    -dnadbhost    DNA Database host

    -dnadbuser    DNA Database user
317 318 319 320
    
    -dnadbport    DNA Database port
    
    -dnadbpass    DNA Database pass
321 322 323 324 325 326

    -ccdsdbname  CCDS database name

    -ccdshost    CCDS database host

    -ccdsuser    CCDS database user
327
    
328 329
    -ccdspass    CCDS database pass
    
330
    -ccdsport    CCDS database port
331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347

Other optional arguments:

    -coord_system_name    Coordinate system to use

    -include_non_ref      Specify if the non_reference regions should 
                          be _excluded_. (default: include) 

    -include_duplicates   Specify if the duplicate regions should be 
                          _included_. eg. Human PAR on Y (default: exclude) 

    -seq_region_name      Chromosome name if running a single seq_region

    -write                Specify if results should be written to the database

    -verbose              Increase verbosity of output messages

348
    -log                  Dump decision matrices into a log file for analysis
349 350 351 352 353

To check the script has run correctly you can run the
CoreForeignKeys healthcheck:

./run-healthcheck.sh -d dbname -output problem CoreForeignKeys
354

Dan Bolser's avatar
Dan Bolser committed
355 356 357 358 359 360
A warning about not using CCDS is perfectly acceptible when not
running on Human, Mouse and Zebrafish.

EOS
;

361
}