ExonScoreBuilder.pm 22.7 KB
Newer Older
1
=head1 LICENSE
2

3
  Copyright (c) 1999-2013 The European Bioinformatics Institute and
4
  Genome Research Limited.  All rights reserved.
5

6 7
  This software is distributed under a modified Apache license.
  For license details, please see
8

9
    http://www.ensembl.org/info/about/code_licence.html
10

11
=head1 CONTACT
12

13
  Please email comments or questions to the public Ensembl
14
  developers list at <dev@ensembl.org>.
15

16 17
  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.
18

19
=cut
20

21
=head1 NAME
22

23
=head1 SYNOPSIS
24

25
=head1 DESCRIPTION
26

27 28
Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from
Java application.
29

30
=head1 METHODS
31 32 33

=cut

34
package Bio::EnsEMBL::IdMapping::ExonScoreBuilder;
35 36 37 38 39 40 41 42 43 44

use strict;
use warnings;
no warnings 'uninitialized';

use Bio::EnsEMBL::IdMapping::ScoreBuilder;
our @ISA = qw(Bio::EnsEMBL::IdMapping::ScoreBuilder);

use Bio::EnsEMBL::Utils::Argument qw(rearrange);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
45
use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes path_append);
46 47 48
use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;


49 50 51 52 53
#
# exon scoring is done in two steps:
# 1. map exons by overlap (if a common coord_system exists)
# 2. map remaining and poorly scoring exons using exonerate
#
54 55
sub score_exons {
  my $self = shift;
56

57
  $self->logger->info( "-- Scoring exons...\n\n", 0, 'stamped' );
58 59 60 61

  # score using overlaps, then exonerate
  my $matrix = $self->overlap_score;

62
  my $exonerate_matrix = $self->exonerate_score($matrix);
63

64 65 66 67 68
  # log stats before matrix merging
  $self->logger->info("\nOverlap scoring matrix:\n");
  $self->log_matrix_stats($matrix);
  $self->logger->info("\nExonerate scoring matrix:\n");
  $self->log_matrix_stats($exonerate_matrix);
69

70 71 72 73
  # merge matrices
  $self->logger->info( "\nMerging scoring matrices...\n", 0,
                       'stamped' );
  $matrix->merge($exonerate_matrix);
74

75
  $self->logger->info( "Done.\n\n", 0, 'stamped' );
76

Patrick Meidl's avatar
Patrick Meidl committed
77
  # debug logging
78 79
  if ( $self->logger->loglevel eq 'debug' ) {
    $matrix->log( 'exon', $self->conf->param('basedir') );
Patrick Meidl's avatar
Patrick Meidl committed
80 81
  }

82
  # log stats of combined matrix
83
  $self->logger->info("Combined scoring matrix:\n");
84
  $self->log_matrix_stats($matrix);
85

86
  $self->logger->info( "\nDone with exon scoring.\n\n", 0, 'stamped' );
87 88

  return $matrix;
89
} ## end sub score_exons
90 91 92 93 94 95 96


#
# direct mapping by overlap (if common coord_system exists)
#
sub overlap_score {
  my $self = shift;
97

98 99 100 101 102 103 104 105
  my $dump_path =
    path_append( $self->conf->param('basedir'), 'matrix' );

  my $matrix =
    Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
                               -DUMP_PATH  => $dump_path,
                               -CACHE_FILE => 'exon_overlap_matrix.ser',
    );
106

107
  my $overlap_cache = $matrix->cache_file;
108

109 110
  if ( -s $overlap_cache ) {

111
    # read from file
112 113 114 115
    $self->logger->info(
                   "Reading exon overlap scoring matrix from file...\n",
                   0, 'stamped' );
    $self->logger->debug( "Cache file $overlap_cache.\n", 1 );
116
    $matrix->read_from_file;
117 118 119 120 121
    $self->logger->info( "Done.\n", 0, 'stamped' );

  }
  else {

122
    # build scoring matrix
123 124
    $self->logger->info(
         "No exon overlap scoring matrix found. Will build new one.\n");
125

126 127
    if ( $self->cache->highest_common_cs ) {
      $self->logger->info( "Overlap scoring...\n", 0, 'stamped' );
128
      $matrix = $self->build_overlap_scores($matrix);
129
      $self->logger->info( "Done.\n", 0, 'stamped' );
130 131
    }

132 133 134 135 136 137
    # write scoring matrix to file
    $matrix->write_to_file;

  }

  return $matrix;
138
} ## end sub overlap_score
139 140 141 142 143 144


#
# map the remaining exons using exonerate
#
sub exonerate_score {
145
  my $self   = shift;
146 147
  my $matrix = shift;

148 149 150
  unless ( $matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
151 152 153
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

154 155
  my $dump_path =
    path_append( $self->conf->param('basedir'), 'matrix' );
156

157 158 159 160 161
  my $exonerate_matrix =
    Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
                             -DUMP_PATH  => $dump_path,
                             -CACHE_FILE => 'exon_exonerate_matrix.ser',
    );
162 163 164

  my $exonerate_cache = $exonerate_matrix->cache_file;

165
  if ( -s $exonerate_cache ) {
166 167

    # read from file
168 169 170
    $self->logger->info( "Reading exonerate matrix from file...\n",
                         0, 'stamped' );
    $self->logger->debug( "Cache file $exonerate_cache.\n", 1 );
171
    $exonerate_matrix->read_from_file;
172
    $self->logger->info( "Done.\n", 0, 'stamped' );
173

174 175
  }
  else {
176 177

    # build scoring matrix
178 179
    $self->logger->info(
                    "No exonerate matrix found. Will build new one.\n");
180 181

    # dump exons to fasta files
182
    my $dump_count = $self->dump_filtered_exons($matrix);
183

184
    if ($dump_count) {
185
      # run exonerate
186
      $self->run_exonerate;
187

188 189
      # parse results
      $self->parse_exonerate_results($exonerate_matrix);
190

191 192
    }
    else {
193

194 195
      $self->logger->info( "No source and/or target exons dumped, " .
                           "so don't need to run exonerate.\n" );
196

197
    }
198

199
    # write scoring matrix to file
200
    $exonerate_matrix->write_to_file;
201

202
  } ## end else [ if ( -s $exonerate_cache)]
203

204
  return $exonerate_matrix;
205
} ## end sub exonerate_score
206

207 208
#
# Algorithm:
209 210 211 212
# Get a lists of exon containers for source and target.  Walk along both
# lists, set a flag when you first encounter an exon (i.e. it starts).
# Record all alternative exons until you encounter the exon again (exon
# ends), then score against all alternative exons you've recorded.
213
#
214
sub build_overlap_scores {
215
  my ( $self, $matrix ) = @_;
216

217 218 219
  unless ($matrix
      and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
220 221 222
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

223
  # get sorted list of exon containers
224 225
  $self->logger->info( "Reading sorted exons from cache...\n",
                       1, 'stamped' );
226

227 228 229 230 231 232 233 234 235
  my @source_exons =
    $self->sort_exons( [
        values %{ $self->cache->get_by_name( 'exons_by_id', 'source' ) }
      ] );

  my @target_exons =
    $self->sort_exons( [
        values %{ $self->cache->get_by_name( 'exons_by_id', 'target' ) }
      ] );
236

237
  $self->logger->info( "Done.\n", 1, 'stamped' );
238 239 240 241 242 243 244 245

  # get first source and target exon container
  my $source_ec = shift(@source_exons);
  my $target_ec = shift(@target_exons);

  my %source_overlap = ();
  my %target_overlap = ();

246
  $self->logger->info( "Scoring...\n", 1, 'stamped' );
247

248
  while ( defined($source_ec) || defined($target_ec) ) {
249 250 251 252
    my $add_source = 0;
    my $add_target = 0;

    # compare exon containers
253
    if ( defined($source_ec) && defined($target_ec) ) {
254 255 256
      my $cmp =
        $self->compare_exon_containers( $source_ec, $target_ec );
      if ( $cmp <= 0 ) { $add_source = 1 }
257
      if ( $cmp >= 0 ) { $add_target = 1 }
258
    }
259
    elsif ( defined($source_ec) ) {
260
      $add_source = 1;
261
    }
262
    elsif ( defined($target_ec) ) {
263
      $add_target = 1;
264 265
    }
    else {
266
      die('The world is a strange place');
267 268 269
    }

    if ($add_source) {
270
      if ( exists( $source_overlap{ $source_ec->[0] } ) ) {
271 272 273
        # remove exon from list of overlapping source exons to score
        # target against
        delete $source_overlap{ $source_ec->[0] };
274 275
      }
      else {
276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291
        # add exon to list of overlapping source exons to score target
        # against
        $source_overlap{ $source_ec->[0] } = $source_ec->[0];

        # score source exon against all target exons in current overlap
        # list
        foreach my $target_exon ( values %target_overlap ) {
          if ( defined( $matrix->get_score(
                                   $source_ec->[0]->id, $target_exon->id
                        ) ) )
          {
            next;
          }

          $self->calc_overlap_score( $source_ec->[0], $target_exon,
                                     $matrix );
292
        }
293
      }
294 295 296

      # get next source exon container
      $source_ec = shift(@source_exons);
297
    } ## end if ($add_source)
298 299

    if ($add_target) {
300
      if ( exists( $target_overlap{ $target_ec->[0] } ) ) {
301 302 303
        # remove exon from list of overlapping target exons to score
        # source against
        delete $target_overlap{ $target_ec->[0] };
304 305
      }
      else {
306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
        # add exon to list of overlapping target exons to score source
        # against
        $target_overlap{ $target_ec->[0] } = $target_ec->[0];

        # score target exon against all source exons in current overlap
        # list
        foreach my $source_exon ( values %source_overlap ) {
          if ( defined( $matrix->get_score(
                                   $source_exon->id, $target_ec->[0]->id
                        ) ) )
          {
            next;
          }

          $self->calc_overlap_score( $source_exon, $target_ec->[0],
                                     $matrix );
322 323 324 325 326
        }
      }

      # get next target exon container
      $target_ec = shift(@target_exons);
327
    } ## end if ($add_target)
328
  } ## end while ( defined($source_ec...))
329

330
  $self->logger->info( "Done.\n", 1, 'stamped' );
331 332

  return $matrix;
333
} ## end sub build_overlap_scores
334 335 336


#
337 338 339 340
# Return a list of exon containers, sorted by seq_region_name, then location
# (where location is either start-1 or end, so each exon is in the list twice).
# An exon container is a listrefs of a TinyExon object and its location. This
# implements the ExonSortContainer in the java application.
341 342
#
sub sort_exons {
343
  my $self  = shift;
344 345
  my $exons = shift;

346
  ## no critic 
347 348 349 350 351
  return sort {
    ( $a->[0]->common_sr_name cmp $b->[0]->common_sr_name ) ||
      ( $a->[1] <=> $b->[1] )
    } ( map { [ $_, $_->common_start - 1 ] } @$exons ),
    ( map { [ $_, $_->common_end ] } @$exons );
352 353
}

354
sub compare_exon_containers {
355
  my $self = shift;
356 357
  my $e1   = shift;
  my $e2   = shift;
358

359 360
  return ( ( $e1->[0]->common_sr_name cmp $e2->[0]->common_sr_name ) ||
           ( $e1->[1] <=> $e2->[1] ) );
361 362
}

363
#
364 365 366
# Calculates overlap score between two exons. Its done by dividing
# overlap region by exons sizes. 1.0 is full overlap on both exons.
# Score of at least 0.5 are added to the exon scoring matrix.
367
#
368
sub calc_overlap_score {
369
  my $self        = shift;
370 371
  my $source_exon = shift;
  my $target_exon = shift;
372
  my $matrix      = shift;
373

374
  my ( $start, $end );
375 376

  # don't score if exons on different strand
377 378
  return unless ( $source_exon->strand == $target_exon->strand );

379
  # determine overlap start
380
  if ( $source_exon->start > $target_exon->start ) {
381
    $start = $source_exon->start;
382 383
  }
  else {
384 385
    $start = $target_exon->start;
  }
386

387
  # determine overlap end
388
  if ( $source_exon->end < $target_exon->end ) {
389
    $end = $source_exon->end;
390 391
  }
  else {
392 393 394 395
    $end = $target_exon->end;
  }

  #
396 397
  # Calculate score, which is defined as average overlap / exon length
  # ratio.
398
  #
399 400

  my $overlap       = $end - $start + 1;
401 402
  my $source_length = $source_exon->end - $source_exon->start + 1;
  my $target_length = $target_exon->end - $target_exon->start + 1;
403 404

  my $score = ( $overlap/$source_length + $overlap/$target_length )/2;
405

406 407 408 409 410 411 412 413 414 415 416
  # The following penalty was removed because it meant that an exon
  # whose sequence and position had not changed would become a
  # candidate for similarity mapping when its phase changed.  This
  # caused lots of criss-crossing stable ID history entries between
  # genes/transcripts/translations in some gene families.
  #
  ## PENALTY:
  ## penalise by 10% if phase if different
  #if ( $source_exon->phase != $target_exon->phase ) {
  #$score *= 0.9;
  #}
417 418

  # add score to scoring matrix if it's at least 0.5
419 420
  if ( $score >= 0.5 ) {
    $matrix->add_score( $source_exon->id, $target_exon->id, $score );
421
  }
422 423

} ## end sub calc_overlap_score
424 425 426 427 428 429 430


sub run_exonerate {
  my $self = shift;

  my $source_file = $self->exon_fasta_file('source');
  my $target_file = $self->exon_fasta_file('target');
431 432
  my $source_size = -s $source_file;
  my $target_size = -s $target_file;
433 434 435 436 437 438 439

  # check if fasta files exist and are not empty
  unless ($source_size and $target_size) {
    throw("Can't find exon fasta files.");
  }

  # create an empty lsf log directory
Patrick Meidl's avatar
Patrick Meidl committed
440
  my $logpath = path_append($self->logger->logpath, 'exonerate');
441
  system("rm -rf $logpath") == 0 or
442
    $self->logger->error("Unable to delete lsf log dir $logpath: $!\n");
443
  system("mkdir -p $logpath") == 0 or
444
    $self->logger->error("Can't create lsf log dir $logpath: $!\n");
445 446

  # delete exonerate output from previous runs
447
  my $dump_path = $self->cache->dump_path;
448

449 450
  opendir(DUMPDIR, $dump_path) or
    $self->logger->error("Can't open $dump_path for reading: $!");
451 452 453 454

  while (defined(my $file = readdir(DUMPDIR))) {
    next unless /exonerate_map\.\d+/;

455 456
    unlink("$dump_path/$file") or
      $self->logger->error("Can't delete $dump_path/$file: $!");
457 458 459 460 461
  }
  
  closedir(DUMPDIR);

  # determine number of jobs to split task into
462 463
  my $bytes_per_job = $self->conf->param('exonerate_bytes_per_job')
    || 250000;
464
  my $num_jobs = $self->conf->param('exonerate_jobs');
465 466 467 468 469
  $num_jobs ||= int( $source_size/$bytes_per_job + 1 );

  my $percent =
    int( ( $self->conf->param('exonerate_threshold') || 0.5 )*100 );
  my $lsf_name       = 'idmapping_exonerate_' . time;
470
  my $exonerate_path = $self->conf->param('exonerate_path');
471 472
  my $exonerate_extra_params =
    $self->conf->param('exonerate_extra_params');
473 474 475 476

  #
  # run exonerate jobs using lsf
  #
477
  my $exonerate_job =
478 479 480 481 482 483 484 485 486 487
    qq{$exonerate_path } .
    qq{--query $source_file --target $target_file } .
    q{--querychunkid $LSB_JOBINDEX } .
    qq{--querychunktotal $num_jobs } .
    q{--model ungapped -M 1000 -D 100 } .
    q{--showalignment FALSE --subopt no } . qq{--percent $percent } .
    $self->conf->param('exonerate_extra_params') . " " .
    q{--ryo 'myinfo: %qi %ti %et %ql %tl\n' } .
    qq{| grep '^myinfo:' > $dump_path/exonerate_map.\$LSB_JOBINDEX} .
    "\n";
488

Patrick Meidl's avatar
Patrick Meidl committed
489 490 491
  $self->logger->info("Submitting $num_jobs exonerate jobs to lsf.\n");
  $self->logger->debug("$exonerate_job\n\n");

492 493 494 495 496 497 498
  my $bsub_cmd = sprintf(
               "|bsub -J%s[1-%d]%%%d -o %s/exonerate.%%I.out %s",
               $lsf_name,
               $num_jobs,
               $self->conf()->param('exonerate_concurrent_jobs') || 200,
               $logpath,
               $self->conf()->param('lsf_opt_exonerate') );
499

Patrick Meidl's avatar
Patrick Meidl committed
500
  local *BSUB;
501
  open( BSUB, $bsub_cmd ) ## no critic
Patrick Meidl's avatar
Patrick Meidl committed
502
    or $self->logger->error("Could not open open pipe to bsub: $!\n");
503

504 505 506 507
  print BSUB $exonerate_job;
  $self->logger->error("Error submitting exonerate jobs: $!\n")
    unless ($? == 0); 
  close BSUB;
508

Patrick Meidl's avatar
Patrick Meidl committed
509
  # submit dependent job to monitor finishing of exonerate jobs
510
  $self->logger->info("Waiting for exonerate jobs to finish...\n", 0, 'stamped');
511

512 513
  my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } .
    qq{-o $logpath/exonerate_depend.out /bin/true};
514

515 516 517 518
  system($dependent_job) == 0 or
    $self->logger->error("Error submitting dependent job: $!\n");

  $self->logger->info("All exonerate jobs finished.\n", 0, 'stamped');
519 520 521 522 523 524 525 526 527 528

  #
  # check results
  #
  my @missing;
  my @error;
  
  for (my $i = 1; $i <= $num_jobs; $i++) {
  
    # check that output file exists
529
    my $outfile = "$dump_path/exonerate_map.$i";
530
    push @missing, $outfile unless (-f "$outfile");
531 532 533 534 535 536 537

    # check no errors occurred
    my $errfile = "$logpath/exonerate.$i.err";
    push @error, $errfile if (-s "$errfile");
  }

  if (@missing) {
538
    $self->logger->info("Couldn't find all exonerate output files. These are missing:\n");
539
    foreach (@missing) {
540
      $self->logger->info("$_\n", 1);
541 542 543 544 545 546
    }

    exit(1);
  }

  if (@error) {
547
    $self->logger->info("One or more exonerate jobs failed. Check these error files:\n");
548
    foreach (@error) {
549
      $self->logger->info("$_\n", 1);
550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597
    }

    exit(1);
  }

}


sub exon_fasta_file {
  my $self = shift;
  my $type = shift;

  throw("You must provide a type.") unless $type;

  return $self->cache->dump_path."/$type.exons.fasta";
}


sub dump_filtered_exons {
  my $self = shift;
  my $matrix = shift;

  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('You must provide a ScoredMappingMatrix.');
  }

  # write exons to fasta files
  my $source_count = $self->write_filtered_exons('source', $matrix);
  my $target_count = $self->write_filtered_exons('target', $matrix);

  # return true if both source and target exons were written; otherwise we
  # don't need to run exonerate
  return (($source_count > 0) and ($target_count > 0));
}


sub write_filtered_exons {
  my $self = shift;
  my $type = shift;
  my $matrix = shift;

  throw("You must provide a type.") unless $type;
  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('You must provide a ScoredMappingMatrix.');
  }

598
  $self->logger->info("\nDumping $type exons to fasta file...\n", 0, 'stamped');
599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614

  # don't dump exons shorter than this
  my $min_exon_length = $self->conf->param('min_exon_length') || 15;

  # counters
  my $total_exons = 0;
  my $dumped_exons = 0;

  # filehandle for fasta files
  my $fh;
  my $file = $self->exon_fasta_file($type);
  open($fh, '>', $file) or throw("Unable to open $file for writing: $!");

  # loop over exons, dump sequence to fasta file if longer than threshold and
  # score < 1
  EXON:
Patrick Meidl's avatar
Patrick Meidl committed
615 616 617 618
  foreach my $eid (sort { $b <=> $a }
                   keys %{ $self->cache->get_by_name('exons_by_id', $type) }) {

    my $exon = $self->cache->get_by_key('exons_by_id', $type, $eid);
619 620 621 622

    $total_exons++;

    # skip if exon shorter than threshold
Patrick Meidl's avatar
Patrick Meidl committed
623
    next EXON if ($exon->length < $min_exon_length);
624 625

    # skip if overlap score with any other exon is 1
626 627 628 629 630 631
    if ( $type eq 'source' ) {
      foreach my $target ( @{ $matrix->get_targets_for_source($eid) } )
      {
        if ( $matrix->get_score( $eid, $target ) > 0.9999 ) {
          next EXON;
        }
632 633
      }
    } else {
634 635 636 637 638
      foreach my $source ( @{ $matrix->get_sources_for_target($eid) } )
      {
        if ( $matrix->get_score( $source, $eid ) > 0.9999 ) {
          next EXON;
        }
639 640 641 642
      }
    }

    # write exon to fasta file
Patrick Meidl's avatar
Patrick Meidl committed
643
    print $fh '>', $eid, "\n", $exon->seq, "\n";
644 645 646 647 648 649 650 651 652

    $dumped_exons++;

  }

  close($fh);

  # log
  my $fmt = "%-30s%10s\n";
653 654 655 656 657
  my $size = -s $file;
  $self->logger->info(sprintf($fmt, 'Total exons:', $total_exons), 1);
  $self->logger->info(sprintf($fmt, 'Dumped exons:', $dumped_exons), 1);
  $self->logger->info(sprintf($fmt, 'Dump file size:', parse_bytes($size)), 1);
  $self->logger->info("Done.\n\n", 0, 'stamped');
658 659 660 661 662

  return $dumped_exons;
}

sub parse_exonerate_results {
663
  my ( $self, $exonerate_matrix ) = @_;
664

665 666 667
  unless ( defined($exonerate_matrix)
           &&
           $exonerate_matrix->isa(
668 669 670
                         'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')
    )
  {
671 672 673
    throw('You must provide a ScoredMappingMatrix.');
  }

674
  $self->logger->info( "Parsing exonerate results...\n", 0, 'stamped' );
675 676

  # loop over all result files
677
  my $dump_path = $self->cache->dump_path;
678
  my $num_files = 0;
679
  my $num_lines = 0;
680

681 682
  opendir( DUMPDIR, $dump_path ) or
    $self->logger->error("Can't open $dump_path for reading: $!");
683

684
  my $penalised = 0;
685
  my $killed    = 0;
686

687 688
  while ( defined( my $file = readdir(DUMPDIR) ) ) {
    unless ( $file =~ /exonerate_map\.\d+/ ) { next }
689 690 691

    $num_files++;

692
    open( my $fh, '<', "$dump_path/$file" ); 
693

694 695
    my $threshold = $self->conf->param('exonerate_threshold') || 0.5;

696
    while (<$fh>) {
697 698 699
      $num_lines++;
      chomp;

700 701
  # line format:
  # myinfo: source_id target_id match_length source_length target_length
702 703 704
      my ( undef, $source_id, $target_id, $match_length, $source_length,
           $target_length )
        = split;
705 706 707

      my $score = 0;

708 709 710
      if ( $source_length == 0 or $target_length == 0 ) {
        $self->logger->warning(
               "Alignment length is 0 for $source_id or $target_id.\n");
711 712
      }
      else {
713 714
        $score = 2*$match_length/( $source_length + $target_length );

715 716 717
      }

      if ( $score > $threshold ) {
718 719 720 721 722 723 724 725 726 727 728 729
        my $source_sr =
          $self->cache()
          ->get_by_key( 'exons_by_id', 'source', $source_id )
          ->seq_region_name();
        my $target_sr =
          $self->cache()
          ->get_by_key( 'exons_by_id', 'target', $target_id )
          ->seq_region_name();

        if ( $source_sr ne $target_sr ) {
          # PENALTY: The target and source are not on the same
          # seq_region.
730 731 732 733
          $score *= 0.70;

          # With a penalty of 0.7, exonerate scores need to be above
          # approximately 0.714 to make the 0.5 threshold.
734 735

          ++$penalised;
736
        }
737

738 739 740 741
        if ( $score > $threshold ) {
          $exonerate_matrix->add_score( $source_id, $target_id,
                                        $score );
        }
742 743 744 745
        else {
          ++$killed;
        }
      } ## end if ( $score > $threshold)
746

747
    } ## end while (<F>)
748 749

    close(F);
750 751
  } ## end while ( defined( my $file...))

752 753
  closedir(DUMPDIR);

754 755 756
  $self->logger->info(
        "Done parsing $num_lines lines from $num_files result files.\n",
        0, 'stamped' );
757 758 759
  $self->logger->info( "Penalised $penalised exon alignments " .
                         "for not being on the same seq_region " .
                         "($killed killed).\n",
760 761
                       0,
                       'stamped' );
762 763

  return $exonerate_matrix;
764
} ## end sub parse_exonerate_results
765 766


767
sub non_mapped_transcript_rescore {
768 769
  my $self                = shift;
  my $matrix              = shift;
770 771 772
  my $transcript_mappings = shift;

  # argument checks
773 774 775 776 777
  unless ($matrix
      and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
    throw(
       'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.');
778 779
  }

780 781 782 783 784 785
  unless ( $transcript_mappings
     and
     $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList') )
  {
    throw(
         'Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
786 787
  }

788
  # create a lookup hash of mapped source transcripts to target
789 790 791
  # transcripts
  my %transcript_lookup =
    map { $_->source => $_->target }
792 793 794 795
    @{ $transcript_mappings->get_all_Entries };

  my $i = 0;

796
  foreach my $entry ( @{ $matrix->get_all_Entries } ) {
797

798 799 800 801 802 803
    my @source_transcripts = @{
      $self->cache->get_by_key( 'transcripts_by_exon_id', 'source',
                                $entry->source ) };
    my @target_transcripts = @{
      $self->cache->get_by_key( 'transcripts_by_exon_id', 'target',
                                $entry->target ) };
804

805 806
    my $found_mapped = 0;

807
  TR:
808 809
    foreach my $source_tr (@source_transcripts) {
      foreach my $target_tr (@target_transcripts) {
810
        my $mapped_target = $transcript_lookup{ $source_tr->id };
811

812
        if ( $mapped_target && $mapped_target == $target_tr->id ) {
813 814 815 816 817
          $found_mapped = 1;
          last TR;
        }
      }
    }
818

819
    unless ($found_mapped) {
820 821
      # PENALTY: The exon appears to belong to a transcript that has not
      # been mapped.
822
      $matrix->set_score( $entry->source(), $entry->target(),
823
                          0.9*$entry->score() );
824 825
      $i++;
    }
826
  } ## end foreach my $entry ( @{ $matrix...})
827

828 829 830
  $self->logger->debug( "Scored exons in non-mapped transcripts: $i\n",
                        1 );
} ## end sub non_mapped_transcript_rescore
831

832

833

834
1;