SyntenyFramework.pm 16.3 KB
Newer Older
1
=head1 LICENSE
Patrick Meidl's avatar
Patrick Meidl committed
2

3
  Copyright (c) 1999-2013 The European Bioinformatics Institute and
4
  Genome Research Limited.  All rights reserved.
Patrick Meidl's avatar
Patrick Meidl committed
5

6 7
  This software is distributed under a modified Apache license.
  For license details, please see
Patrick Meidl's avatar
Patrick Meidl committed
8

9
    http://www.ensembl.org/info/about/code_licence.html
Patrick Meidl's avatar
Patrick Meidl committed
10

11
=head1 CONTACT
Patrick Meidl's avatar
Patrick Meidl committed
12

13
  Please email comments or questions to the public Ensembl
14
  developers list at <dev@ensembl.org>.
Patrick Meidl's avatar
Patrick Meidl committed
15

16 17
  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.
Patrick Meidl's avatar
Patrick Meidl committed
18

19
=cut
Patrick Meidl's avatar
Patrick Meidl committed
20

21
=head1 NAME
Patrick Meidl's avatar
Patrick Meidl committed
22

23 24
Bio::EnsEMBL::IdMapping::SyntenyFramework - framework representing syntenic
regions across the genome
Patrick Meidl's avatar
Patrick Meidl committed
25

26
=head1 SYNOPSIS
Patrick Meidl's avatar
Patrick Meidl committed
27

28 29 30 31 32 33 34 35 36
  # build the SyntenyFramework from unambiguous gene mappings
  my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new(
    -DUMP_PATH  => $dump_path,
    -CACHE_FILE => 'synteny_framework.ser',
    -LOGGER     => $self->logger,
    -CONF       => $self->conf,
    -CACHE      => $self->cache,
  );
  $sf->build_synteny($gene_mappings);
Patrick Meidl's avatar
Patrick Meidl committed
37

38 39
  # use it to rescore the genes
  $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
Patrick Meidl's avatar
Patrick Meidl committed
40

41
=head1 DESCRIPTION
Patrick Meidl's avatar
Patrick Meidl committed
42

43 44 45 46
The SyntenyFramework is a set of SyntenyRegions. These are pairs of
locations very analoguous to the information in the assembly table (the
locations dont have to be the same length though). They are built from
genes that map uniquely between source and target.
Patrick Meidl's avatar
Patrick Meidl committed
47

48 49 50 51
Once built, the SyntenyFramework is used to score source and target gene
pairs to determine whether they are similar. This process is slow (it
involves testing all gene pairs against all SyntenyRegions), this module
therefor has built-in support to run the process in parallel via LSF.
Patrick Meidl's avatar
Patrick Meidl committed
52

53
=head1 METHODS
Patrick Meidl's avatar
Patrick Meidl committed
54

55 56 57 58 59 60 61 62 63 64
  new
  build_synteny
  _by_overlap
  add_SyntenyRegion
  get_all_SyntenyRegions
  rescore_gene_matrix_lsf
  rescore_gene_matrix
  logger
  conf
  cache
Patrick Meidl's avatar
Patrick Meidl committed
65 66 67

=cut

68
package Bio::EnsEMBL::IdMapping::SyntenyFramework;
Patrick Meidl's avatar
Patrick Meidl committed
69 70 71 72 73

use strict;
use warnings;
no warnings 'uninitialized';

74 75
use Bio::EnsEMBL::IdMapping::Serialisable;
our @ISA = qw(Bio::EnsEMBL::IdMapping::Serialisable);
Patrick Meidl's avatar
Patrick Meidl committed
76

77
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
Patrick Meidl's avatar
Patrick Meidl committed
78
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
79
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
Patrick Meidl's avatar
fixes  
Patrick Meidl committed
80
use Bio::EnsEMBL::IdMapping::SyntenyRegion;
81
use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;
Patrick Meidl's avatar
Patrick Meidl committed
82

83 84
use FindBin qw($Bin);
FindBin->again;
Patrick Meidl's avatar
Patrick Meidl committed
85

Patrick Meidl's avatar
Patrick Meidl committed
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109

=head2 new

  Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
  Arg [CONF]  : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
  Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object
  Arg [DUMP_PATH] : String - path for object serialisation
  Arg [CACHE_FILE] : String - filename of serialised object
  Example     : my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new(
                  -DUMP_PATH    => $dump_path,
                  -CACHE_FILE   => 'synteny_framework.ser',
                  -LOGGER       => $self->logger,
                  -CONF         => $self->conf,
                  -CACHE        => $self->cache,
                );
  Description : Constructor.
  Return type : Bio::EnsEMBL::IdMapping::SyntenyFramework
  Exceptions  : thrown on wrong or missing arguments
  Caller      : InternalIdMapper plugins
  Status      : At Risk
              : under development

=cut

Patrick Meidl's avatar
Patrick Meidl committed
110 111 112 113 114
sub new {
  my $caller = shift;
  my $class = ref($caller) || $caller;
  my $self = $class->SUPER::new(@_);

115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
  my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_);

  unless ($logger and ref($logger) and
          $logger->isa('Bio::EnsEMBL::Utils::Logger')) {
    throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
  }
  
  unless ($conf and ref($conf) and
          $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
    throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
  }
  
  unless ($cache and ref($cache) and
          $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) {
    throw("You must provide configuration as a Bio::EnsEMBL::IdMapping::Cache object.");
  }
  
  # initialise
  $self->logger($logger);
  $self->conf($conf);
  $self->cache($cache);
  $self->{'cache'} = [];
Patrick Meidl's avatar
Patrick Meidl committed
137 138 139 140 141

  return $self;
}


Patrick Meidl's avatar
Patrick Meidl committed
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
=head2 build_synteny

  Arg[1]      : Bio::EnsEMBL::IdMapping::MappingList $mappings - gene mappings
                to build the SyntenyFramework from
  Example     : $synteny_framework->build_synteny($gene_mappings);
  Description : Builds the SyntenyFramework from unambiguous gene mappings.
                SyntenyRegions are allowed to overlap. At most two overlapping
                SyntenyRegions are merged (otherwise we'd get too large
                SyntenyRegions with little information content).
  Return type : none
  Exceptions  : thrown on wrong or missing argument
  Caller      : InternalIdMapper plugins
  Status      : At Risk
              : under development

=cut

Patrick Meidl's avatar
Patrick Meidl committed
159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
sub build_synteny {
  my $self = shift;
  my $mappings = shift;
  
  unless ($mappings and
          $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
    throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
  }

  # create a synteny region for each mapping
  my @synteny_regions = ();

  foreach my $entry (@{ $mappings->get_all_Entries }) {
    
    my $source_gene = $self->cache->get_by_key('genes_by_id', 'source',
      $entry->source);
    my $target_gene = $self->cache->get_by_key('genes_by_id', 'target',
      $entry->target);

    my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([
      $source_gene->start,
      $source_gene->end,
      $source_gene->strand,
      $source_gene->seq_region_name,
      $target_gene->start,
      $target_gene->end,
      $target_gene->strand,
      $target_gene->seq_region_name,
      $entry->score,
    ]);

    push @synteny_regions, $sr;
  }

Patrick Meidl's avatar
Patrick Meidl committed
193 194 195 196 197
  unless (@synteny_regions) {
    $self->logger->warning("No synteny regions could be identified.\n");
    return;
  }

Patrick Meidl's avatar
Patrick Meidl committed
198
  # sort synteny regions
199 200 201 202 203
  #my @sorted = sort _by_overlap @synteny_regions;
  my @sorted = reverse sort {
    $a->source_seq_region_name cmp $b->source_seq_region_name ||
    $a->source_start <=> $b->source_start ||
    $a->source_end <=> $b->source_end } @synteny_regions;
Patrick Meidl's avatar
Patrick Meidl committed
204 205

  $self->logger->info("SyntenyRegions before merging: ".scalar(@sorted)."\n");
Patrick Meidl's avatar
Patrick Meidl committed
206 207 208 209 210 211 212 213
  
  # now create merged regions from overlapping syntenies, but only merge a
  # maximum of 2 regions (otherwise you end up with large synteny blocks which
  # won't contain much information in this context)
  my $last_merged = 0;
  my $last_sr = shift(@sorted);

  while (my $sr = shift(@sorted)) {
214
    #$self->logger->debug("this ".$sr->to_string."\n");
Patrick Meidl's avatar
Patrick Meidl committed
215 216 217 218 219 220
  
    my $merged_sr = $last_sr->merge($sr);

    if (! $merged_sr) {
      unless ($last_merged) {
        $self->add_SyntenyRegion($last_sr->stretch(2));
221
        #$self->logger->debug("nnn  ".$last_sr->to_string."\n");
Patrick Meidl's avatar
Patrick Meidl committed
222
      }
223
      $last_merged = 0;
Patrick Meidl's avatar
Patrick Meidl committed
224 225
    } else {
      $self->add_SyntenyRegion($merged_sr->stretch(2));
226
      #$self->logger->debug("mmm  ".$merged_sr->to_string."\n");
Patrick Meidl's avatar
Patrick Meidl committed
227 228
      $last_merged = 1;
    }
229
    
Patrick Meidl's avatar
Patrick Meidl committed
230 231 232 233 234 235 236 237
    $last_sr = $sr;
  }

  # deal with last synteny region in @sorted
  unless ($last_merged) {
    $self->add_SyntenyRegion($last_sr->stretch(2));
    $last_merged = 0;
  }
238 239 240 241

  #foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
  #  $self->logger->debug("SRs ".$sr->to_string."\n");
  #}
Patrick Meidl's avatar
Patrick Meidl committed
242 243 244 245 246 247
  
  $self->logger->info("SyntenyRegions after merging: ".scalar(@{ $self->get_all_SyntenyRegions })."\n");

}


Patrick Meidl's avatar
Patrick Meidl committed
248 249 250
#
# sort SyntenyRegions by overlap
#
Patrick Meidl's avatar
Patrick Meidl committed
251 252
sub _by_overlap {
  # first sort by seq_region
253
  my $retval = ($b->source_seq_region_name cmp $a->source_seq_region_name);
Patrick Meidl's avatar
Patrick Meidl committed
254
  return $retval if ($retval);
Patrick Meidl's avatar
Patrick Meidl committed
255

Patrick Meidl's avatar
Patrick Meidl committed
256 257
  # then sort by overlap:
  # return -1 if $a is downstream, 1 if it's upstream, 0 if they overlap
258 259
  if ($a->source_end < $b->source_start) { return 1; }
  if ($a->source_start < $b->source_end) { return -1; }
Patrick Meidl's avatar
Patrick Meidl committed
260
  return 0;
Patrick Meidl's avatar
Patrick Meidl committed
261 262 263
}


Patrick Meidl's avatar
Patrick Meidl committed
264 265 266 267 268 269 270 271 272 273 274 275 276 277
=head2 add_SyntenyRegion

  Arg[1]      : Bio::EnsEMBL::IdMaping::SyntenyRegion - SyntenyRegion to add
  Example     : $synteny_framework->add_SyntenyRegion($synteny_region);
  Description : Adds a SyntenyRegion to the framework. For speed reasons (and
                since this is an internal method), no argument check is done.
  Return type : none
  Exceptions  : none
  Caller      : internal
  Status      : At Risk
              : under development

=cut

Patrick Meidl's avatar
Patrick Meidl committed
278
sub add_SyntenyRegion {
279
  push @{ $_[0]->{'cache'} }, $_[1];
Patrick Meidl's avatar
Patrick Meidl committed
280 281 282
}


Patrick Meidl's avatar
Patrick Meidl committed
283 284 285 286 287 288 289 290 291 292 293 294 295 296
=head2 get_all_SyntenyRegions

  Example     : foreach my $sr (@{ $sf->get_all_SyntenyRegions }) {
                  # do something with the SyntenyRegion
                }
  Description : Get a list of all SyntenyRegions in the framework.
  Return type : Arrayref of Bio::EnsEMBL::IdMapping::SyntenyRegion
  Exceptions  : none
  Caller      : general
  Status      : At Risk
              : under development

=cut

Patrick Meidl's avatar
Patrick Meidl committed
297
sub get_all_SyntenyRegions {
298 299 300 301
  return $_[0]->{'cache'};
}


Patrick Meidl's avatar
Patrick Meidl committed
302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
=head2 rescore_gene_matrix_lsf

  Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
                scores to rescore
  Example     : my $new_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
  Description : This method runs rescore_gene_matrix() (via the
                synteny_resocre.pl script) in parallel with lsf, then combines
                the results to return a single rescored scoring matrix.
                Parallelisation is done by chunking the scoring matrix into
                several pieces (determined by the --synteny_rescore_jobs
                configuration option).
  Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
  Exceptions  : thrown on wrong or missing argument
                thrown on filesystem I/O error
                thrown on failure of one or mor lsf jobs
  Caller      : InternalIdMapper plugins
  Status      : At Risk
              : under development

=cut

323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343
sub rescore_gene_matrix_lsf {
  my $self = shift;
  my $matrix = shift;

  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  # serialise SyntenyFramework to disk
  $self->logger->debug("Serialising SyntenyFramework...\n", 0, 'stamped');
  $self->write_to_file;
  $self->logger->debug("Done.\n", 0, 'stamped');

  # split the ScoredMappingMatrix into chunks and write to disk
  my $matrix_size = $matrix->size;
  $self->logger->debug("Scores before rescoring: $matrix_size.\n");

  my $num_jobs = $self->conf->param('synteny_rescore_jobs') || 20;
  $num_jobs++;

344
  my $dump_path = path_append($self->conf->param('basedir'),
345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
    'matrix/synteny_rescore');

  $self->logger->debug("Creating sub-matrices...\n", 0, 'stamped');
  foreach my $i (1..$num_jobs) {
    my $start = (int($matrix_size/($num_jobs-1)) * ($i - 1)) + 1;
    my $end = int($matrix_size/($num_jobs-1)) * $i;
    $self->logger->debug("$start-$end\n", 1);
    my $sub_matrix = $matrix->sub_matrix($start, $end);
    
    $sub_matrix->cache_file_name("gene_matrix_synteny$i.ser");
    $sub_matrix->dump_path($dump_path);
    
    $sub_matrix->write_to_file;
  }
  $self->logger->debug("Done.\n", 0, 'stamped');

  # create an empty lsf log directory
Patrick Meidl's avatar
Patrick Meidl committed
362
  my $logpath = path_append($self->logger->logpath, 'synteny_rescore');
363 364 365 366 367 368 369 370 371 372
  system("rm -rf $logpath") == 0 or
    $self->logger->error("Unable to delete lsf log dir $logpath: $!\n");
  system("mkdir -p $logpath") == 0 or
    $self->logger->error("Can't create lsf log dir $logpath: $!\n");

  # build lsf command
  my $lsf_name = 'idmapping_synteny_rescore_'.time;

  my $options = $self->conf->create_commandline_options(
      logauto       => 1,
Patrick Meidl's avatar
Patrick Meidl committed
373
      logautobase   => "synteny_rescore",
374 375 376 377 378
      logpath       => $logpath,
      interactive   => 0,
      is_component  => 1,
  );

379
  my $cmd = qq{$Bin/synteny_rescore.pl $options --index \$LSB_JOBINDEX};
380

381 382 383 384 385 386
  my $bsub_cmd =
    sprintf( "|bsub -J%s[1-%d] "
                            . "-o %s/synteny_rescore.%%I.out "
                            . "-e %s/synteny_rescore.%%I.err %s",
             $lsf_name, $num_jobs, $logpath, $logpath,
             $self->conf()->param('lsf_opt_synteny_rescore') );
387 388 389 390 391 392

  # run lsf job array
  $self->logger->info("Submitting $num_jobs jobs to lsf.\n");
  $self->logger->debug("$cmd\n\n");

  local *BSUB;
393
  open( BSUB, $bsub_cmd ) ## no critic
394
    or $self->logger->error("Could not open open pipe to bsub: $!\n");
395 396 397 398 399 400 401 402 403

  print BSUB $cmd;
  $self->logger->error("Error submitting synteny rescoring jobs: $!\n")
    unless ($? == 0); 
  close BSUB;

  # submit dependent job to monitor finishing of jobs
  $self->logger->info("Waiting for jobs to finish...\n", 0, 'stamped');

404 405
  my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } .
    qq{-o $logpath/synteny_rescore_depend.out /bin/true};
406 407 408 409 410 411 412 413 414 415

  system($dependent_job) == 0 or
    $self->logger->error("Error submitting dependent job: $!\n");

  $self->logger->info("All jobs finished.\n", 0, 'stamped');

  # check for lsf errors
  sleep(5);
  my $err;
  foreach my $i (1..$num_jobs) {
Patrick Meidl's avatar
Patrick Meidl committed
416
    $err++ unless (-e "$logpath/synteny_rescore.$i.success");
417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442
  }

  if ($err) {
    $self->logger->error("At least one of your jobs failed.\nPlease check the logfiles at $logpath for errors.\n");
  }

  # merge and return matrix
  $self->logger->debug("Merging rescored matrices...\n");
  $matrix->flush;

  foreach my $i (1..$num_jobs) {
    # read partial matrix created by lsf job from file
    my $sub_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
      -DUMP_PATH   => $dump_path,
      -CACHE_FILE  => "gene_matrix_synteny$i.ser",
    );
    $sub_matrix->read_from_file;

    # merge with main matrix
    $matrix->merge($sub_matrix);
  }

  $self->logger->debug("Done.\n");
  $self->logger->debug("Scores after rescoring: ".$matrix->size.".\n");
  
  return $matrix;
Patrick Meidl's avatar
Patrick Meidl committed
443 444 445 446 447
}


# 
#
Patrick Meidl's avatar
Patrick Meidl committed
448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
=head2 rescore_gene_matrix

  Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
                scores to rescore
  Example     : my $new_scores = $sf->rescore_gene_matrix($gene_scores);
  Description : Rescores a gene matrix. Retains 70% of old score and builds
                other 30% from the synteny match.
  Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
  Exceptions  : thrown on wrong or missing argument
  Caller      : InternalIdMapper plugins
  Status      : At Risk
              : under development

=cut

Patrick Meidl's avatar
Patrick Meidl committed
463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487
sub rescore_gene_matrix {
  my $self = shift;
  my $matrix = shift;

  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  my $retain_factor = 0.7;

  foreach my $entry (@{ $matrix->get_all_Entries }) {
    my $source_gene = $self->cache->get_by_key('genes_by_id', 'source',
      $entry->source);

    my $target_gene = $self->cache->get_by_key('genes_by_id', 'target',
      $entry->target);

    my $highest_score = 0;

    foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
      my $score = $sr->score_location_relationship($source_gene, $target_gene);
      $highest_score = $score if ($score > $highest_score);
    }

488 489 490
    #$self->logger->debug("highscore ".$entry->to_string." ".
    #  sprintf("%.6f\n", $highest_score));

Patrick Meidl's avatar
fixes  
Patrick Meidl committed
491 492
    $matrix->set_score($entry->source, $entry->target,
      ($entry->score * 0.7 + $highest_score * 0.3));
Patrick Meidl's avatar
Patrick Meidl committed
493 494
  }

495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
  return $matrix;
}


=head2 logger

  Arg[1]      : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set
  Example     : $object->logger->info("Starting ID mapping.\n");
  Description : Getter/setter for logger object
  Return type : Bio::EnsEMBL::Utils::Logger
  Exceptions  : none
  Caller      : constructor
  Status      : At Risk
              : under development

=cut

sub logger {
  my $self = shift;
  $self->{'_logger'} = shift if (@_);
  return $self->{'_logger'};
}


=head2 conf

  Arg[1]      : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration
                to set
523
  Example     : my $basedir = $object->conf->param('basedir');
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556
  Description : Getter/setter for configuration object
  Return type : Bio::EnsEMBL::Utils::ConfParser
  Exceptions  : none
  Caller      : constructor
  Status      : At Risk
              : under development

=cut

sub conf {
  my $self = shift;
  $self->{'_conf'} = shift if (@_);
  return $self->{'_conf'};
}


=head2 cache

  Arg[1]      : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set
  Example     : $object->cache->read_from_file('source');
  Description : Getter/setter for cache object
  Return type : Bio::EnsEMBL::IdMapping::Cache
  Exceptions  : none
  Caller      : constructor
  Status      : At Risk
              : under development

=cut

sub cache {
  my $self = shift;
  $self->{'_cache'} = shift if (@_);
  return $self->{'_cache'};
Patrick Meidl's avatar
Patrick Meidl committed
557 558 559 560 561
}


1;