TranscriptScoreBuilder.pm 15.8 KB
Newer Older
1
=head1 LICENSE
2

3
  Copyright (c) 1999-2013 The European Bioinformatics Institute and
4
  Genome Research Limited.  All rights reserved.
5

6 7
  This software is distributed under a modified Apache license.
  For license details, please see
8

9
    http://www.ensembl.org/info/about/code_licence.html
10

11
=head1 CONTACT
12

13
  Please email comments or questions to the public Ensembl
14
  developers list at <dev@ensembl.org>.
15

16 17
  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.
18

19
=cut
20

21
=head1 NAME
22 23


24
=head1 SYNOPSIS
25 26


27
=head1 DESCRIPTION
28

29 30
Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from
Java application.
31

32
=head1 METHODS
33 34 35

=cut

36
package Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder;
37 38 39 40 41 42 43 44 45

use strict;
use warnings;
no warnings 'uninitialized';

use Bio::EnsEMBL::IdMapping::ScoreBuilder;
our @ISA = qw(Bio::EnsEMBL::IdMapping::ScoreBuilder);

use Bio::EnsEMBL::Utils::Exception qw(throw warning);
46
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
47 48 49 50 51 52 53 54 55 56 57 58
use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;


sub score_transcripts {
  my $self = shift;
  my $exon_matrix = shift;

  unless ($exon_matrix and
          $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

Patrick Meidl's avatar
Patrick Meidl committed
59
  $self->logger->info("-- Scoring transcripts...\n\n", 0, 'stamped');
60 61 62 63

  # build scores based on exon scores
  my $matrix = $self->scores_from_exon_scores($exon_matrix);

Patrick Meidl's avatar
Patrick Meidl committed
64 65
  # debug logging
  if ($self->logger->loglevel eq 'debug') {
66
    $matrix->log('transcript', $self->conf->param('basedir'));
Patrick Meidl's avatar
Patrick Meidl committed
67 68
  }

69 70 71
  # log stats of combined matrix
  my $fmt = "%-40s%10.0f\n";

72
  $self->logger->info("Scoring matrix:\n");
73

74 75
  $self->logger->info(sprintf($fmt, "Total source transcripts:",
    $self->cache->get_count_by_name('transcripts_by_id', 'source')), 1);
76

77 78
  $self->logger->info(sprintf($fmt, "Total target transcripts:",
    $self->cache->get_count_by_name('transcripts_by_id', 'target')), 1);
79 80 81

  $self->log_matrix_stats($matrix);
  
82
  $self->logger->info("\nDone with transcript scoring.\n\n");
83 84 85 86 87 88 89 90 91 92 93 94 95 96

  return $matrix;
}


sub scores_from_exon_scores {
  my $self = shift;
  my $exon_matrix = shift;

  unless ($exon_matrix and
          $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }
  
97
  my $dump_path = path_append($self->conf->param('basedir'), 'matrix');
98
  
99
  my $matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
100
    -DUMP_PATH   => $dump_path,
101 102 103 104 105 106 107 108
    -CACHE_FILE  => 'transcript_matrix.ser',
  );

  my $transcript_cache = $matrix->cache_file;

  if (-s $transcript_cache) {
    
    # read from file
109
    $self->logger->info("Reading transcript scoring matrix from file...\n", 0, 'stamped');
Patrick Meidl's avatar
Patrick Meidl committed
110
    $self->logger->debug("Cache file $transcript_cache.\n", 1);
111
    $matrix->read_from_file;
Patrick Meidl's avatar
Patrick Meidl committed
112
    $self->logger->info("Done.\n\n", 0, 'stamped');
113 114 115 116
    
  } else {
    
    # build scoring matrix
117
    $self->logger->info("No transcript scoring matrix found. Will build new one.\n");
118

119
    $self->logger->info("Transcript scoring...\n", 0, 'stamped');
120
    $matrix = $self->build_scores($matrix, $exon_matrix);
Patrick Meidl's avatar
Patrick Meidl committed
121
    $self->logger->info("Done.\n\n", 0, 'stamped');
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151

    # write scoring matrix to file
    $matrix->write_to_file;

  }

  return $matrix;
}


sub build_scores {
  my $self = shift;
  my $matrix = shift;
  my $exon_matrix = shift;

  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }
  
  unless ($exon_matrix and
          $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  # first find out which source and target transcripts have scoring exons and
  # build a "flag" matrix for these transcripts (all scores are 1)
  $self->flag_matrix_from_exon_scores($matrix, $exon_matrix);

  # now calculate the actual scores for the transcripts in the flag matrix
152 153
  my $final_matrix =
    $self->score_matrix_from_flag_matrix($matrix, $exon_matrix);
154 155 156 157 158 159 160 161 162 163 164 165 166 167
  
  return $final_matrix;
}


sub flag_matrix_from_exon_scores {
  my $self = shift;
  my $matrix = shift;
  my $exon_matrix = shift;

  # initialise progress logger
  my $i;
  my $num_transcripts =
    scalar(keys %{ $self->cache->get_by_name('transcripts_by_id', 'source') });
Patrick Meidl's avatar
Patrick Meidl committed
168
  my $progress_id = $self->logger->init_progress($num_transcripts, 100);
169

170
  $self->logger->info("Creating flag matrix...\n", 1);
171 172 173 174 175

  # loop over source transcripts
  foreach my $source_transcript (values %{ $self->cache->get_by_name('transcripts_by_id', 'source') }) {
    
    # log progress
Patrick Meidl's avatar
Patrick Meidl committed
176
    $self->logger->log_progress($progress_id, ++$i, 1);
177 178

    # get all exons for the source transcript
179
    foreach my $source_exon (@{ $source_transcript->get_all_Exons }) {
180 181 182 183 184 185 186 187

      # get target exons for this source exon from scoring matrix
      foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) {

        # get target transcripts that contain this exon
        foreach my $target_transcript (@{ $self->cache->get_by_key('transcripts_by_exon_id', 'target', $target_exon_id) }) {
          
          # add scoring flag for these two transcripts
188
          $matrix->add_score($source_transcript->id, $target_transcript->id, 1);
189 190 191 192 193 194
          
        }
      }
    }
  }

Patrick Meidl's avatar
Patrick Meidl committed
195
  $self->logger->info("\n");
196 197 198 199 200 201 202 203 204 205

  return $matrix;
}


sub score_matrix_from_flag_matrix {
  my $self = shift;
  my $flag_matrix = shift;
  my $exon_matrix = shift;

Patrick Meidl's avatar
Patrick Meidl committed
206 207 208 209 210 211 212 213 214 215
  unless ($flag_matrix and
          $flag_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }
  
  unless ($exon_matrix and
          $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need an exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

216 217 218 219 220
  my $transcript_score_threshold =
    $self->conf->param('transcript_score_threshold') || 0;

  # create a new scoring matrix which will replace the flag matrix
  my $matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
Patrick Meidl's avatar
Patrick Meidl committed
221
    -DUMP_PATH   => $flag_matrix->dump_path,
222
    -CACHE_FILE  => $flag_matrix->cache_file_name,
223 224 225 226 227 228
  );

  # initialise progress logger
  my $i;
  my $num_transcripts =
    scalar(keys %{ $self->cache->get_by_name('transcripts_by_id', 'source') });
Patrick Meidl's avatar
Patrick Meidl committed
229
  my $progress_id = $self->logger->init_progress($num_transcripts, 100);
230

231
  $self->logger->info("Creating score matrix from flag matrix...\n", 1);
Patrick Meidl's avatar
Patrick Meidl committed
232 233 234 235

  # debug
  my $fmt_d1 = "%-14s%-15s%-14s%-14s%-14s\n";
  my $fmt_d2 = "%.6f";
236 237 238 239 240
  
  # loop over source transcripts
  foreach my $source_transcript (values %{ $self->cache->get_by_name('transcripts_by_id', 'source') }) {
    
    # log progress
Patrick Meidl's avatar
Patrick Meidl committed
241
    $self->logger->log_progress($progress_id, ++$i, 1);
242 243

    # We are only interested in scoring with exons that are in the target
Patrick Meidl's avatar
Patrick Meidl committed
244
    # transcript. The ScoredMappingMatrix may contain scores for exons that
245 246 247 248 249 250 251 252 253
    # aren't in this transcript so create a hash of the target transcript's
    # exons
    my %source_exons = map { $_->id => 1 }
      @{ $source_transcript->get_all_Exons };

    my $source_transcript_length = $source_transcript->length;

    # get all corresponding target transcripts from the flag matrix
    foreach my $target_transcript_id (@{ $flag_matrix->get_targets_for_source($source_transcript->id) }) {
Patrick Meidl's avatar
Patrick Meidl committed
254
      
255 256 257 258 259 260 261 262 263 264 265 266 267 268
      my $target_transcript = $self->cache->get_by_key('transcripts_by_id', 'target', $target_transcript_id);

      my $source_transcript_score = 0;
      my $target_transcript_score = 0;
      my $target_transcript_length = $target_transcript->length;

      my %target_exons = map { $_->id => 1 }
        @{ $target_transcript->get_all_Exons };

      # now loop over source exons and find the highest scoring target exon
      # belonging to the target transcript
      
      foreach my $source_exon (@{ $source_transcript->get_all_Exons }) {

Patrick Meidl's avatar
Patrick Meidl committed
269 270
        my $max_source_score = -1;
        
271 272
        foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) {

273
          next unless ($target_exons{$target_exon_id});
274

Patrick Meidl's avatar
Patrick Meidl committed
275 276
          my $score = $exon_matrix->get_score($source_exon->id,
            $target_exon_id);
277 278 279 280
          $max_source_score = $score if ($score > $max_source_score);
        }

        if ($max_source_score > 0) {
Patrick Meidl's avatar
Patrick Meidl committed
281 282
          $source_transcript_score += ($max_source_score * $source_exon->length);

283 284 285 286 287 288 289
        }
      }

      # now do the same for target exons
      
      foreach my $target_exon (@{ $target_transcript->get_all_Exons }) {

Patrick Meidl's avatar
Patrick Meidl committed
290 291
        my $max_target_score = -1;
        
292 293
        foreach my $source_exon_id (@{ $exon_matrix->get_sources_for_target($target_exon->id) }) {

294
          next unless ($source_exons{$source_exon_id});
295 296 297 298 299 300 301

          my $score = $exon_matrix->get_score(
            $source_exon_id, $target_exon->id);
          $max_target_score = $score if ($score > $max_target_score);
        }

        if ($max_target_score > 0) {
Patrick Meidl's avatar
Patrick Meidl committed
302 303
          $target_transcript_score += ($max_target_score * $target_exon->length);

304 305 306 307 308 309 310 311 312 313 314 315
        }
      }

      #
      # calculate transcript score and add to scoring matrix
      #
      if (($source_transcript_length + $target_transcript_length) > 0) {

        # sanity check
        if (($source_transcript_score > $source_transcript_length) or
            ($target_transcript_score > $target_transcript_length)) {

316
          $self->logger->warning("Score > length for source ($source_transcript_score <> $source_transcript_length) or target ($target_transcript_score <> $target_transcript_length).\n", 1);
317 318

        } else {
Patrick Meidl's avatar
Patrick Meidl committed
319

320 321 322
          my $source_transcript_biotype_group = $self->get_biotype_group($source_transcript->biotype());
          my $target_transcript_biotype_group = $self->get_biotype_group($target_transcript->biotype());

Patrick Meidl's avatar
Patrick Meidl committed
323 324 325 326
          # debug
          $self->logger->info($source_transcript->id.":".$target_transcript->id.
            " source score: $source_transcript_score".
            " source length: $source_transcript_length".
327 328
            " source biotype:" . $source_transcript->biotype() .
            " source group: $source_transcript_biotype_group".
Patrick Meidl's avatar
Patrick Meidl committed
329
            " target score: $target_transcript_score".
330 331
            " target biotype:" . $target_transcript->biotype() .
            " target group: $target_transcript_biotype_group".
Patrick Meidl's avatar
Patrick Meidl committed
332
            " target length: $target_transcript_length\n");
333

334 335 336 337 338
          
          my $transcript_score =
            ($source_transcript_score + $target_transcript_score) /
            ($source_transcript_length + $target_transcript_length);

339 340 341 342 343 344 345 346 347 348 349
## Add penalty if biotypes are different
          if ($source_transcript->biotype() ne $target_transcript->biotype()) {
            $transcript_score = $transcript_score * 0.9;
          }

## Add penalty if biotype groups are different
          if ($source_transcript_biotype_group ne $target_transcript_biotype_group) {
            $transcript_score = $transcript_score * 0.8;
          }

          # everything is fine, add score to matrix
350 351 352 353 354 355 356 357 358
          if ($transcript_score > $transcript_score_threshold) {
            $matrix->add_score($source_transcript->id, $target_transcript->id,
              $transcript_score);
          }
          
        }
      
      } else {
      
359
        $self->logger->warning("Combined length of source (".$source_transcript->id.") and target (".$target_transcript->id.") transcript is zero!\n", 1);
360 361 362 363 364 365
      
      }

    }
  }

Patrick Meidl's avatar
Patrick Meidl committed
366
  $self->logger->info("\n");
367 368 369 370 371 372

  return $matrix;
    
}


373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399
#
# penalise scores between genes with different biotypes.
# entries are modified in place
#
sub biotype_transcript_rescore {
  my ( $self, $matrix ) = @_;

  if ( defined($matrix) &&
       !$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
    throw('Exprected Bio::EnsEMBL::IdMapping::ScoredMappingMatrix');
  }

  my $i = 0;

  foreach my $entry ( @{ $matrix->get_all_Entries } ) {

    my $source_transcript =
      $self->cache->get_by_key( 'transcripts_by_id', 'source',
                                $entry->source() );

    my $target_transcript =
      $self->cache->get_by_key( 'transcripts_by_id', 'target',
                                $entry->target() );

    if ($source_transcript->biotype() ne $target_transcript->biotype() )
    {
400
      # PENALTY: Lower the score for mappings to transcripts of
401 402
      # different biotype.
      $matrix->set_score( $entry->source(), $entry->target(),
403
                          0.9*$entry->score() );
404 405 406 407 408 409 410 411 412
      $i++;
    }
  }

  $self->logger->debug("Scored transcripts with biotype mismatch: $i\n",
                       1 );
} ## end sub biotype_transcript_rescore


413
sub different_translation_rescore {
414
  my $self   = shift;
415 416
  my $matrix = shift;

417 418 419
  unless ($matrix
      and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
420 421 422 423 424
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  my $i = 0;

425 426 427
  foreach my $entry ( sort { $b->score <=> $a->score }
                      @{ $matrix->get_all_Entries } )
  {
428 429

    # we only do this for perfect matches, i.e. transcript score == 1
430
    last if ( $entry->score < 1 );
431

432 433 434 435 436 437
    my $source_tl =
      $self->cache->get_by_key( 'transcripts_by_id', 'source',
                                $entry->source )->translation;
    my $target_tl =
      $self->cache->get_by_key( 'transcripts_by_id', 'target',
                                $entry->target )->translation;
438 439

    # no penalty if both transcripts have no translation
440 441 442 443 444 445 446 447 448
    next if ( !$source_tl and !$target_tl );

    if (    !$source_tl
         or !$target_tl
         or ( $source_tl->seq ne $target_tl->seq ) )
    {
      # PENALTY: The transcript stable ID is now on a transcript with a
      # different translation.
      $matrix->set_score( $entry->source(), $entry->target(),
449
                          0.9*$entry->score() );
450 451 452
      $i++;
    }

453 454 455 456 457 458
  } ## end foreach my $entry ( sort { ...})

  $self->logger->debug(
                "Non-perfect translations on perfect transcripts: $i\n",
                1 );
} ## end sub different_translation_rescore
459 460 461


sub non_mapped_gene_rescore {
462 463
  my $self          = shift;
  my $matrix        = shift;
464 465 466
  my $gene_mappings = shift;

  # argument checks
467 468 469 470 471 472
  unless ($matrix
      and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
    throw(
       'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
    );
473 474
  }

475 476 477
  unless ( $gene_mappings
       and $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList') )
  {
478 479 480 481
    throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
  }

  # create of lookup hash of mapped source genes to target genes
482 483
  my %gene_lookup =
    map { $_->source => $_->target }
484 485 486 487
    @{ $gene_mappings->get_all_Entries };

  my $i = 0;

488
  foreach my $entry ( @{ $matrix->get_all_Entries } ) {
489

490 491 492 493 494 495
    my $source_gene =
      $self->cache->get_by_key( 'genes_by_transcript_id', 'source',
                                $entry->source );
    my $target_gene =
      $self->cache->get_by_key( 'genes_by_transcript_id', 'target',
                                $entry->target );
496

497
    my $mapped_target = $gene_lookup{ $source_gene->id };
498

499
    if ( !$mapped_target or ( $mapped_target != $target_gene->id ) ) {
500 501
      # PENALTY: The transcript stable ID has been mapped to an
      # un-mapped gene.
502
      $matrix->set_score( $entry->source(), $entry->target(),
503
                          0.9*$entry->score() );
504 505 506 507
      $i++;
    }
  }

508 509 510
  $self->logger->debug( "Scored transcripts in non-mapped genes: $i\n",
                        1 );
} ## end sub non_mapped_gene_rescore
511

512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528

sub get_biotype_group {
  my ($self, $biotype) = @_;
  my $dba = $self->cache->get_production_DBAdaptor();
  my $helper   = $self->cache->get_production_DBAdaptor()->dbc()->sql_helper();

  my $sql      = q{
     SELECT biotype_group
     FROM biotype
     WHERE object_type = 'transcript'
     AND is_current = 1
     AND name = ?
     AND db_type like '%core%' };
  my $result = $helper->execute_simple(-SQL => $sql, -PARAMS => [$biotype]);
  return $result->[0];
}

529
  
530 531
1;