AssemblyMapperAdaptor.pm 56.9 KB
Newer Older
1
=head1 LICENSE
2

3
Copyright [1999-2013] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4

5 6 7 8 9 10 11 12 13 14 15 16 17
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut
18

19 20 21 22

=head1 CONTACT

  Please email comments or questions to the public Ensembl
Magali Ruffier's avatar
Magali Ruffier committed
23
  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
24 25

  Questions may also be sent to the Ensembl help desk at
Magali Ruffier's avatar
Magali Ruffier committed
26
  <http://www.ensembl.org/Help/Contact>.
27 28 29 30 31 32

=cut

=head1 NAME

Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor
33 34 35

=head1 SYNOPSIS

36
  use Bio::EnsEMBL::Registry;
37

38
  Bio::EnsEMBL::Registry->load_registry_from_db(
39 40
    -host => 'ensembldb.ensembl.org',
    -user => 'anonymous'
41 42
  );

43 44 45 46 47
  $asma = Bio::EnsEMBL::Registry->get_adaptor( "human", "core",
    "assemblymapper" );

  $csa = Bio::EnsEMBL::Registry->get_adaptor( "human", "core",
    "coordsystem" );
48

49 50 51 52
  my $chr33_cs = $csa->fetch_by_name( 'chromosome', 'NCBI33' );
  my $chr34_cs = $csa->fetch_by_name( 'chromosome', 'NCBI34' );
  my $ctg_cs   = $csa->fetch_by_name('contig');
  my $clone_cs = $csa->fetch_by_name('clone');
53 54

  my $chr_ctg_mapper =
55
    $asma->fetch_by_CoordSystems( $chr33_cs, $ctg_cs );
56 57

  my $ncbi33_ncbi34_mapper =
58
    $asm_adptr->fetch_by_CoordSystems( $chr33, $chr34 );
59 60

  my $ctg_clone_mapper =
61
    $asm_adptr->fetch_by_CoordSystems( $ctg_cs, $clone_cs );
62

63 64 65

=head1 DESCRIPTION

66 67
Adaptor for handling Assembly mappers.  This is a I<Singleton> class.
ie: There is only one per database (C<DBAdaptor>).
68

69 70 71 72 73 74 75
This is used to retrieve mappers between any two coordinate systems
whose makeup is described by the assembly table.  Currently one step
(explicit) and two step (implicit) pairwise mapping is supported.  In
one-step mapping an explicit relationship between the coordinate systems
is defined in the assembly table.  In two-step 'chained' mapping no
explicit mapping is present but the coordinate systems must share a
common mapping to an intermediate coordinate system.
76

77
=head1 METHODS
78 79 80 81 82 83 84 85 86

=cut

package Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor;
use vars qw(@ISA);
use strict;

use Bio::EnsEMBL::DBSQL::BaseAdaptor;
use Bio::EnsEMBL::AssemblyMapper;
87
use Bio::EnsEMBL::ChainedAssemblyMapper;
88
use Bio::EnsEMBL::TopLevelAssemblyMapper;
89 90

use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache
91 92
use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump);
#use Bio::EnsEMBL::Utils::Exception qw(deprecate throw);
93
use Bio::EnsEMBL::Utils::SeqRegionCache;
94 95

use integer; #do proper arithmetic bitshifts
96 97 98

@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);

99 100 101

my $CHUNKFACTOR = 20;  # 2^20 = approx. 10^6

102 103
=head2 new

104 105
  Arg [1]    : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for
               the database this assembly mapper is using.
106 107 108 109 110
  Example    : my $asma = new Bio::EnsEMBL::AssemblyMapperAdaptor($dbadaptor);
  Description: Creates a new AssemblyMapperAdaptor object
  Returntype : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor
  Exceptions : none
  Caller     : Bio::EnsEMBL::DBSQL::DBAdaptor
111
  Status     : Stable
112 113

=cut
114 115

sub new {
116
  my($class, $dbadaptor) = @_;
117

118
  my $self = $class->SUPER::new($dbadaptor);
119

120 121
  $self->{'_asm_mapper_cache'} = {};

122 123 124 125 126 127
  # use a shared cache (for this database) that contains info about
  # seq regions
  my $seq_region_cache = $self->db->get_SeqRegionCache();
  $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'};
  $self->{'sr_id_cache'}   = $seq_region_cache->{'id_cache'};

128 129 130 131
  return $self;
}


132

133 134 135
=head2  cache_seq_ids_with_mult_assemblys

  Example    : $self->adaptor->cache_seq_ids_with_mult_assemblys();
136
  Description: Creates a hash of the component seq region ids that
137 138 139 140 141
               map to more than one assembly from the assembly table.
  Retruntype : none
  Exceptions : none
  Caller     : AssemblyMapper, ChainedAssemblyMapper
  Status     : At Risk
142

143 144 145 146 147 148 149
=cut

sub cache_seq_ids_with_mult_assemblys{
  my $self = shift;
  my %multis;

  return if (defined($self->{'multi_seq_ids'}));
150

151 152
  $self->{'multi_seq_ids'} = {};

153 154 155 156 157 158 159 160 161 162 163
  my $sql = qq(
  SELECT    sra.seq_region_id
  FROM      seq_region_attrib sra,
            attrib_type at,
            seq_region sr,
            coord_system cs
  WHERE     sra.attrib_type_id = at.attrib_type_id
    AND     code = "MultAssem"
    AND     sra.seq_region_id = sr.seq_region_id
    AND     sr.coord_system_id = cs.coord_system_id
    AND     cs.species_id = ?);
164 165 166

  my $sth = $self->prepare($sql);

167 168
  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );

169 170
  $sth->execute();

171
  my ($seq_region_id);
172

173
  $sth->bind_columns(\$seq_region_id);
174 175

  while($sth->fetch()) {
176
    $self->{'multi_seq_ids'}->{$seq_region_id} = 1;
177 178 179
  }
  $sth->finish;
}
180 181


182
=head2 fetch_by_CoordSystems
183

184 185 186 187 188 189 190 191
  Arg [1]    : Bio::EnsEMBL::CoordSystem $cs1
               One of the coordinate systems to retrieve the mapper
               between
  Arg [2]    : Bio::EnsEMBL::CoordSystem $cs2
               The other coordinate system to map between
  Description: Retrieves an Assembly mapper for two coordinate
               systems whose relationship is described in the
               assembly table.
192

193 194
               The ordering of the coodinate systems is arbitrary.
               The following two statements are equivalent:
195 196
               $mapper = $asma->fetch_by_CoordSystems($cs1,$cs2);
               $mapper = $asma->fetch_by_CoordSystems($cs2,$cs1);
197
  Returntype : Bio::EnsEMBL::AssemblyMapper
198
  Exceptions : wrong argument types
199
  Caller     : general
200
  Status     : Stable
201 202 203

=cut

204 205 206 207 208 209 210
sub fetch_by_CoordSystems {
  my $self = shift;
  my $cs1  = shift;
  my $cs2  = shift;

  if(!ref($cs1) || !$cs1->isa('Bio::EnsEMBL::CoordSystem')) {
    throw("cs1 argument must be a Bio::EnsEMBL::CoordSystem.");
211
  }
212 213 214 215
  if(!ref($cs2) || !$cs2->isa('Bio::EnsEMBL::CoordSystem')) {
    throw("cs2 argument must be a Bio::EnsEMBL::CoordSystem.");
  }

216 217 218 219
#  if($cs1->equals($cs2)) {
#    throw("Cannot create mapper between same coord systems: " .
#          $cs1->name . " " . $cs1->version);
#  }
220

221 222 223 224 225 226 227
  if($cs1->is_top_level()) {
    return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs1, $cs2);
  }
  if($cs2->is_top_level()) {
    return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs2, $cs1);
  }

228 229 230 231 232 233
  my $csa = $self->db->get_CoordSystemAdaptor();

  #retrieve the shortest possible mapping path between these systems
  my @mapping_path = @{$csa->get_mapping_path($cs1,$cs2)};

  if(!@mapping_path) {
234 235 236 237 238 239 240 241 242

    # It is perfectly fine not to have a mapping. No warning needed really
    # Just check the return code!!

#    warning(
#      "There is no mapping defined between these coord systems:\n" .
#      $cs1->name() . " " . $cs1->version() . " and " . $cs2->name() . " " .
#      $cs2->version()
#    );
243
    return undef;
244 245
  }

246
  my $key = join(':', map({defined($_)?$_->dbID():"-"} @mapping_path));
247 248 249 250 251 252 253 254 255 256 257 258 259 260

  my $asm_mapper = $self->{'_asm_mapper_cache'}->{$key};

  return $asm_mapper if($asm_mapper);

  if(@mapping_path == 1) {
    throw("Incorrect mapping path defined in meta table. " .
	  "0 step mapping encountered between:\n" .
	  $cs1->name() . " " . $cs1->version() . " and " . $cs2->name . " " .
	  $cs2->version());
  }

  if(@mapping_path == 2) {
    #1 step regular mapping
261
    $asm_mapper = Bio::EnsEMBL::AssemblyMapper->new($self, @mapping_path);
262 263

#   If you want multiple pieces on two seqRegions to map to each other
264 265 266
#   you need to make an assembly.mapping entry that is seperated with a #
#   instead of an |.

267 268 269 270 271 272
    $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
    return $asm_mapper;
  }

  if(@mapping_path == 3) {
   #two step chained mapping
273
    $asm_mapper = Bio::EnsEMBL::ChainedAssemblyMapper->new($self,@mapping_path);
274 275 276 277 278 279
    #in multi-step mapping it is possible get requests with the
    #coordinate system ordering reversed since both mappings directions
    #cache on both orderings just in case
    #e.g.   chr <-> contig <-> clone   and   clone <-> contig <-> chr

    $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
280
    $key = join(':', map({defined($_)?$_->dbID():"-"} reverse(@mapping_path)));
281 282 283 284 285 286 287 288 289
    $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
    return $asm_mapper;
  }

  throw("Only 1 and 2 step coordinate system mapping is currently\n" .
	"supported.  Mapping between " .
          $cs1->name() . " " . $cs1->version() . " and " .
          $cs2->name() . " " . $cs2->version() .
          " requires ". (scalar(@mapping_path)-1) . " steps.");
290 291 292 293
}



294 295 296
=head2 register_assembled

  Arg [1]    : Bio::EnsEMBL::AssemblyMapper $asm_mapper
297 298 299
               A valid AssemblyMapper object
  Arg [2]    : integer $asm_seq_region
               The dbID of the seq_region to be registered
300 301 302 303 304
  Arg [3]    : int $asm_start
               The start of the region to be registered
  Arg [4]    : int $asm_end
               The end of the region to be registered
  Description: Declares an assembled region to the AssemblyMapper.
305
               This extracts the relevant data from the assembly
306
               table and stores it in Mapper internal to the $asm_mapper.
307 308
               It therefore must be called before any mapping is
               attempted on that region. Otherwise only gaps will
309 310
               be returned.  Note that the AssemblyMapper automatically
               calls this method when the need arises.
311
  Returntype : none
312 313 314 315
  Exceptions : throw if the seq_region to be registered does not exist
               or if it associated with multiple assembled pieces (bad data
               in assembly table)
  Caller     : Bio::EnsEMBL::AssemblyMapper
316
  Status     : Stable
317 318 319 320

=cut


321 322 323 324 325 326 327
sub register_assembled {
  my $self = shift;
  my $asm_mapper = shift;
  my $asm_seq_region = shift;
  my $asm_start      = shift;
  my $asm_end        = shift;

328 329 330 331 332 333 334 335
  if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) {
    throw("Bio::EnsEMBL::AssemblyMapper argument expected");
  }

  throw("asm_seq_region argument expected") if(!defined($asm_seq_region));
  throw("asm_start argument expected") if(!defined($asm_start));
  throw("asm_end argument expected") if(!defined($asm_end));

336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353
  my $asm_cs_id = $asm_mapper->assembled_CoordSystem->dbID();
  my $cmp_cs_id = $asm_mapper->component_CoordSystem->dbID();

  #split up the region to be registered into fixed chunks
  #this allows us to keep track of regions that have already been
  #registered and also works under the assumption that if a small region
  #is requested it is likely that other requests will be made in the
  #vicinity (the minimum size registered the chunksize (2^chunkfactor)

  my @chunk_regions;
  #determine span of chunks
  #bitwise shift right is fast and easy integer division

  my($start_chunk, $end_chunk);

  $start_chunk = $asm_start >> $CHUNKFACTOR;
  $end_chunk   = $asm_end   >> $CHUNKFACTOR;

354 355 356 357 358 359
  # inserts have start = end + 1, on boundary condition start_chunk
  # could be less than end chunk
  if($asm_start == $asm_end + 1) {
    ($start_chunk, $end_chunk) = ($end_chunk, $start_chunk);
  }

360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388
  #find regions of continuous unregistered chunks
  my $i;
  my ($begin_chunk_region,$end_chunk_region);
  for ($i = $start_chunk; $i <= $end_chunk; $i++) {
    if($asm_mapper->have_registered_assembled($asm_seq_region, $i)) {
      if(defined($begin_chunk_region)) {
        #this is the end of an unregistered region.
        my $region = [($begin_chunk_region   << $CHUNKFACTOR),
                      (($end_chunk_region+1)     << $CHUNKFACTOR)-1];
        push @chunk_regions, $region;
        $begin_chunk_region = $end_chunk_region = undef;
      }
    } else {
      $begin_chunk_region = $i if(!defined($begin_chunk_region));
      $end_chunk_region   = $i+1;
      $asm_mapper->register_assembled($asm_seq_region,$i);
    }
  }

  #the last part may have been an unregistered region too
  if(defined($begin_chunk_region)) {
    my $region = [($begin_chunk_region << $CHUNKFACTOR),
                  (($end_chunk_region+1)   << $CHUNKFACTOR) -1];
    push @chunk_regions, $region;
  }

  return if(!@chunk_regions);

  # keep the Mapper to a reasonable size
389
  if( $asm_mapper->size() > $asm_mapper->max_pair_count() ) {
390
    $asm_mapper->flush();
391
    #we now have to go and register the entire requested region since we
392
    #just flushed everything
393

394 395 396 397 398 399 400 401
    @chunk_regions = ( [ ( $start_chunk << $CHUNKFACTOR)
                         , (($end_chunk+1) << $CHUNKFACTOR)-1 ] );

    for( my $i = $start_chunk; $i <= $end_chunk; $i++ ) {
      $asm_mapper->register_assembled( $asm_seq_region, $i );
    }
  }

402
#  my $asm_seq_region_id =
403
#    $self->_seq_region_name_to_id($asm_seq_region,$asm_cs_id);
404 405 406 407 408 409 410 411 412 413 414

  # Retrieve the description of how the assembled region is made from
  # component regions for each of the continuous blocks of unregistered,
  # chunked regions

  my $q = qq{
      SELECT
         asm.cmp_start,
         asm.cmp_end,
         asm.cmp_seq_region_id,
         sr.name,
415
         sr.length,
416 417 418 419 420 421 422 423 424 425 426
         asm.ori,
         asm.asm_start,
         asm.asm_end
      FROM
         assembly asm, seq_region sr
      WHERE
         asm.asm_seq_region_id = ? AND
         ? <= asm.asm_end AND
         ? >= asm.asm_start AND
         asm.cmp_seq_region_id = sr.seq_region_id AND
	 sr.coord_system_id = ?
427
   };
428

429 430 431 432
  my $sth = $self->prepare($q);

  foreach my $region (@chunk_regions) {
    my($region_start, $region_end) = @$region;
433
    $sth->bind_param(1,$asm_seq_region,SQL_INTEGER);
434 435 436 437 438
    $sth->bind_param(2,$region_start,SQL_INTEGER);
    $sth->bind_param(3,$region_end,SQL_INTEGER);
    $sth->bind_param(4,$cmp_cs_id,SQL_INTEGER);

    $sth->execute();
439 440

    my($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $ori,
441
      $asm_start, $asm_end, $cmp_seq_region_length);
442 443

    $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id,
444
                       \$cmp_seq_region, \$cmp_seq_region_length, \$ori,
445
                       \$asm_start, \$asm_end);
446 447 448 449 450

    #
    # Load the unregistered regions of the mapper
    #
    while($sth->fetch()) {
451
      next if($asm_mapper->have_registered_component($cmp_seq_region_id)
452
               and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region_id}));
453
      $asm_mapper->register_component($cmp_seq_region_id);
454 455 456
      $asm_mapper->mapper->add_map_coordinates(
                 $asm_seq_region, $asm_start, $asm_end,
                 $ori,
457
                 $cmp_seq_region_id, $cmp_start, $cmp_end);
458

459
      my $arr = [ $cmp_seq_region_id, $cmp_seq_region,
460 461 462 463
                  $cmp_cs_id, $cmp_seq_region_length ];

      $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr;
      $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr;
464
    }
465 466 467
  }

  $sth->finish();
468 469 470
}


471

472 473 474 475 476 477 478 479
sub _seq_region_name_to_id {
  my $self    = shift;
  my $sr_name = shift;
  my $cs_id   = shift;

  ($sr_name && $cs_id) || throw('seq_region_name and coord_system_id args ' .
				'are required');

480
  my $arr = $self->{'sr_name_cache'}->{"$sr_name:$cs_id"};
481 482 483
  if( $arr ) {
    return $arr->[0];
  }
484 485 486 487 488

  # Get the seq_region_id via the name.  This would be quicker if we just
  # used internal ids instead but stored but then we lose the ability
  # the transform accross databases with different internal ids

489
  my $sth = $self->prepare("SELECT seq_region_id, length " .
490 491 492
			   "FROM   seq_region " .
			   "WHERE  name = ? AND coord_system_id = ?");

493 494 495
  $sth->bind_param(1,$sr_name,SQL_VARCHAR);
  $sth->bind_param(2,$cs_id,SQL_INTEGER);
  $sth->execute();
496 497 498 499 500 501

  if(!$sth->rows() == 1) {
    throw("Ambiguous or non-existant seq_region [$sr_name] " .
	  "in coord system $cs_id");
  }

502
  my ($sr_id, $sr_length) = $sth->fetchrow_array();
503 504
  $sth->finish();

505
  $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ];
506 507 508

  $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr;
  $self->{'sr_id_cache'}->{"$sr_id"} = $arr;
509 510 511

  return $sr_id;
}
512

513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549
sub _seq_region_id_to_name {
  my $self    = shift;
  my $sr_id = shift;

  ($sr_id) || throw('seq_region_is required');

  my $arr = $self->{'sr_id_cache'}->{"$sr_id"};
  if( $arr ) {
    return $arr->[1];
  }

  # Get the seq_region name via the id.  This would be quicker if we just
  # used internal ids instead but stored but then we lose the ability
  # the transform accross databases with different internal ids

  my $sth = $self->prepare("SELECT name, length ,coord_system_id " .
			   "FROM   seq_region " .
			   "WHERE  seq_region_id = ? ");

  $sth->bind_param(1,$sr_id,SQL_INTEGER);
  $sth->execute();

  if(!$sth->rows() == 1) {
    throw("non-existant seq_region [$sr_id]");
  }

  my ($sr_name, $sr_length, $cs_id) = $sth->fetchrow_array();
  $sth->finish();

  $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ];

  $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr;
  $self->{'sr_id_cache'}->{"$sr_id"} = $arr;

  return $sr_name;
}

550 551 552 553 554

=head2 register_component

  Arg [1]    : Bio::EnsEMBL::AssemblyMapper $asm_mapper
               A valid AssemblyMapper object
555 556
  Arg [2]    : integer $cmp_seq_region
               The dbID of the seq_region to be registered
557
  Description: Declares a component region to the AssemblyMapper.
558
               This extracts the relevant data from the assembly
559
               table and stores it in Mapper internal to the $asm_mapper.
560
               It therefore must be called before any mapping is
561 562 563 564 565 566 567 568
               attempted on that region. Otherwise only gaps will
               be returned.  Note that the AssemblyMapper automatically
               calls this method when the need arises.
  Returntype : none
  Exceptions : throw if the seq_region to be registered does not exist
               or if it associated with multiple assembled pieces (bad data
               in assembly table)
  Caller     : Bio::EnsEMBL::AssemblyMapper
569
  Status     : Stable
570 571 572 573 574 575 576 577

=cut

sub register_component {
  my $self = shift;
  my $asm_mapper = shift;
  my $cmp_seq_region = shift;

578 579 580 581 582 583 584 585
  if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) {
    throw("Bio::EnsEMBL::AssemblyMapper argument expected");
  }

  if(!defined($cmp_seq_region)) {
    throw("cmp_seq_region argument expected");
  }

586 587 588
  my $cmp_cs_id = $asm_mapper->component_CoordSystem()->dbID();
  my $asm_cs_id = $asm_mapper->assembled_CoordSystem()->dbID();

589
  #do nothing if this region is already registered or special case
590
  return if($asm_mapper->have_registered_component($cmp_seq_region)
591
  and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region}));
592

593 594
#  my $cmp_seq_region_id =
#    $self->_seq_region_name_to_id($cmp_seq_region, $cmp_cs_id);
595 596 597 598 599 600 601 602

  # Determine what part of the assembled region this component region makes up

  my $q = qq{
      SELECT
         asm.asm_start,
         asm.asm_end,
         asm.asm_seq_region_id,
603 604
         sr.name,
         sr.length
605 606 607 608 609 610 611 612 613
      FROM
         assembly asm, seq_region sr
      WHERE
         asm.cmp_seq_region_id = ? AND
         asm.asm_seq_region_id = sr.seq_region_id AND
         sr.coord_system_id = ?
   };

  my $sth = $self->prepare($q);
614
  $sth->bind_param(1,$cmp_seq_region,SQL_INTEGER);
615 616
  $sth->bind_param(2,$asm_cs_id,SQL_INTEGER);
  $sth->execute();
617 618 619

  if($sth->rows() == 0) {
    #this component is not used in the assembled part i.e. gap
620
    $asm_mapper->register_component($cmp_seq_region);
621
    $sth->finish();
622 623 624
    return;
  }

625 626 627 628
  #we do not currently support components mapping to multiple assembled
  # make sure that you've got the correct mapping in the meta-table :
  #   chromosome:EquCab2#contig ( use'#' for multiple mappings )
  #   chromosome:EquCab2|contig ( use '|' delimiter for 1-1 mappings )
Jan-hinnerk Vogel's avatar
 
Jan-hinnerk Vogel committed
629
  #
630
  if($sth->rows() != 1) {
631
    $sth->finish();
632
    throw("Multiple assembled regions for single " .
Jan-hinnerk Vogel's avatar
 
Jan-hinnerk Vogel committed
633 634 635
          "component region cmp_seq_region_id=[$cmp_seq_region]\n".
          "Remember that multiple mappings use the \#-operaator".
          " in the meta-table (i.e. chromosome:EquCab2\#contig\n");
636 637
  }

638 639
  my ($asm_start, $asm_end, $asm_seq_region_id,
      $asm_seq_region, $asm_seq_region_length) = $sth->fetchrow_array();
640

641 642 643 644 645
  my $arr = [ $asm_seq_region_id, $asm_seq_region,
              $asm_cs_id, $asm_seq_region_length ];

  $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr;
  $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr;
646 647 648 649 650 651 652 653 654

  $sth->finish();

  # Register the corresponding assembled region. This allows a us to
  # register things in assembled chunks which allows us to:
  # (1) Keep track of what assembled regions are registered
  # (2) Use locality of reference (if they want something in same general
  #     region it will already be registered).

655
  $self->register_assembled($asm_mapper,$asm_seq_region_id,$asm_start,$asm_end);
656 657 658 659
}



660
=head2 register_chained
661 662 663 664 665 666 667 668 669 670

  Arg [1]    : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
               The chained assembly mapper to register regions on
  Arg [2]    : string $from ('first' or 'last')
               The direction we are registering from, and the name of the
               internal mapper.
  Arg [3]    : string $seq_region_name
               The name of the seqregion we are registering on
  Arg [4]    : listref $ranges
               A list  of ranges to register (in [$start,$end] tuples).
671 672
  Arg [5]    : (optional) $to_slice
               Only register those on this Slice.
673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695
  Description: Registers a set of ranges on a chained assembly mapper.
               This function is at the heart of the chained mapping process.
               It retrieves information from the assembly table and
               dynamically constructs the mappings between two coordinate
               systems which are 2 mapping steps apart. It does this by using
               two internal mappers to load up a third mapper which is
               actually used by the ChainedAssemblyMapper to perform the
               mapping.

               This method must be called before any mapping is
               attempted on regions of interest, otherwise only gaps will
               be returned.  Note that the ChainedAssemblyMapper automatically
               calls this method when the need arises.
  Returntype : none
  Exceptions : throw if the seq_region to be registered does not exist
               or if it associated with multiple assembled pieces (bad data
               in assembly table)

               throw if the mapping between the coordinate systems cannot
               be performed in two steps, which means there is an internal
               error in the data in the meta table or in the code that creates
               the mapping paths.
  Caller     : Bio::EnsEMBL::AssemblyMapper
696
  Status     : Stable
697 698 699 700 701 702 703

=cut

sub register_chained {
  my $self = shift;
  my $casm_mapper = shift;
  my $from = shift;
704
  my $seq_region_id = shift;
705
  my $ranges = shift;
706
  my $to_slice = shift;
707

708 709
  my $to_seq_region_id;
  if(defined($to_slice)){
710 711 712
   if($casm_mapper->first_CoordSystem()->equals($casm_mapper->last_CoordSystem())){
      return $self->_register_chained_special($casm_mapper, $from, $seq_region_id, $ranges, $to_slice);
    }
713 714
    $to_seq_region_id = $to_slice->get_seq_region_id();
    if(!defined($to_seq_region_id)){
715
      die "Could not get seq_region_id for to_slice".$to_slice->seq_region_name."\n";
716 717
    }
  }
718

719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744
  my ($start_name, $start_mid_mapper, $start_cs, $start_registry,
      $end_name, $end_mid_mapper, $end_cs, $end_registry);

  if($from eq 'first') {
    $start_name       = 'first';
    $start_mid_mapper = $casm_mapper->first_middle_mapper();
    $start_cs         = $casm_mapper->first_CoordSystem();
    $start_registry   = $casm_mapper->first_registry();
    $end_mid_mapper   = $casm_mapper->last_middle_mapper();
    $end_cs           = $casm_mapper->last_CoordSystem();
    $end_registry     = $casm_mapper->last_registry();
    $end_name         = 'last';
  } elsif($from eq 'last') {
    $start_name       = 'last';
    $start_mid_mapper = $casm_mapper->last_middle_mapper();
    $start_cs         = $casm_mapper->last_CoordSystem();
    $start_registry   = $casm_mapper->last_registry();
    $end_mid_mapper   = $casm_mapper->first_middle_mapper();
    $end_cs           = $casm_mapper->first_CoordSystem();
    $end_registry     = $casm_mapper->first_registry();
    $end_name         = 'first';
  } else {
    throw("Invalid from argument: [$from], must be 'first' or 'last'");
  }

  my $combined_mapper = $casm_mapper->first_last_mapper();
745 746 747
  my $mid_cs          = $casm_mapper->middle_CoordSystem();
  my $mid_name        = 'middle';
  my $csa             = $self->db->get_CoordSystemAdaptor();
748 749 750 751 752 753

  # Check for the simple case where the ChainedMapper is short
  if( ! defined $mid_cs ) {
      $start_mid_mapper = $combined_mapper;
  }

754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783

  ##############
  # obtain the first half of the mappings and load them into the start mapper
  #

  #ascertain which is component and which is actually assembled coord system
  my @path;

  # check for the simple case, where the ChainedMapper is short
  if( defined $mid_cs ) {
      @path = @{$csa->get_mapping_path($start_cs, $mid_cs)};
  } else {
      @path = @{$csa->get_mapping_path( $start_cs, $end_cs )};
  }

  if(@path != 2 && defined( $path[1] )) {
    my $path = join(',', map({$_->name .' '. $_->version} @path));
    my $len  = scalar(@path) - 1;
    throw("Unexpected mapping path between start and intermediate " .
	  "coord systems (". $start_cs->name . " " . $start_cs->version .
	  " and " . $mid_cs->name . " " . $mid_cs->version . ")." .
	  "\nExpected path length 1, got $len. " .
	  "(path=$path)");
  }

  my $sth;
  my ($asm_cs,$cmp_cs);
  $asm_cs = $path[0];
  $cmp_cs = $path[-1];

784 785
  #the SQL varies depending on whether we are coming from assembled or
  #component coordinate system
786 787 788

my $asm2cmp = (<<ASMCMP);
   SELECT
789 790 791 792
         asm.cmp_start,
         asm.cmp_end,
         asm.cmp_seq_region_id,
         sr.name,
793
         sr.length,
794 795 796 797 798 799 800 801 802 803
         asm.ori,
         asm.asm_start,
         asm.asm_end
      FROM
         assembly asm, seq_region sr
      WHERE
         asm.asm_seq_region_id = ? AND
         ? <= asm.asm_end AND
         ? >= asm.asm_start AND
         asm.cmp_seq_region_id = sr.seq_region_id AND
804 805 806
	 sr.coord_system_id = ?
ASMCMP

807

808 809
my $cmp2asm = (<<CMPASM);
   SELECT
810 811 812 813
         asm.asm_start,
         asm.asm_end,
         asm.asm_seq_region_id,
         sr.name,
814
         sr.length,
815 816 817 818 819 820 821 822 823 824
         asm.ori,
         asm.cmp_start,
         asm.cmp_end
      FROM
         assembly asm, seq_region sr
      WHERE
         asm.cmp_seq_region_id = ? AND
         ? <= asm.cmp_end AND
         ? >= asm.cmp_start AND
         asm.asm_seq_region_id = sr.seq_region_id AND
825 826 827 828 829 830 831
	 sr.coord_system_id = ?
CMPASM

  my $asm2cmp_sth;
  my $cmp2asm_sth;
  if(defined($to_slice)){
    my $to_cs = $to_slice->coord_system;
832 833
    if($asm_cs->equals($to_cs)){
      $asm2cmp_sth = $self->prepare($asm2cmp);
834 835
      $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id");
    }
836
    elsif($cmp_cs->equals($to_cs)){
837
      $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id");
838
      $cmp2asm_sth = $self->prepare($cmp2asm);
839 840 841 842 843 844 845 846 847
    }
    else{
      $asm2cmp_sth = $self->prepare($asm2cmp);
      $cmp2asm_sth = $self->prepare($cmp2asm);
    }
  }	
  else{
    $asm2cmp_sth = $self->prepare($asm2cmp);
    $cmp2asm_sth = $self->prepare($cmp2asm);
848
  }
849

850

851

852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868
  $sth = ($asm_cs->equals($start_cs)) ? $asm2cmp_sth : $cmp2asm_sth;

  my $mid_cs_id;

  # check for the simple case where the ChainedMapper is short
  if( defined $mid_cs ) {
      $mid_cs_id = $mid_cs->dbID();
  } else {
      $mid_cs_id = $end_cs->dbID();
  }

  my @mid_ranges;
  my @start_ranges;

  #need to perform the query for each unregistered range
  foreach my $range (@$ranges) {
    my ($start, $end) = @$range;
869 870 871 872 873
    $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
    $sth->bind_param(2,$start,SQL_INTEGER);
    $sth->bind_param(3,$end,SQL_INTEGER);
    $sth->bind_param(4,$mid_cs_id,SQL_INTEGER);
    $sth->execute();
874 875 876 877

    #load the start <-> mid mapper with the results and record the mid cs
    #ranges we just added to the mapper

878
    my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
879 880 881
        $ori, $start_start, $start_end);

    $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
882
		       \$mid_seq_region, \$mid_length, \$ori, \$start_start,
883 884 885
		       \$start_end);

    while($sth->fetch()) {
886

887 888 889
      if( defined $mid_cs ) {
        $start_mid_mapper->add_map_coordinates
          (
890 891
           $seq_region_id,$start_start, $start_end, $ori,
           $mid_seq_region_id, $mid_start, $mid_end
892 893 894 895 896
          );
      } else {
        if( $from eq "first" ) {
          $combined_mapper->add_map_coordinates
            (
897 898
             $seq_region_id,$start_start, $start_end, $ori,
             $mid_seq_region_id, $mid_start, $mid_end
899 900 901 902
            );
        } else {
          $combined_mapper->add_map_coordinates
            (
903 904
             $mid_seq_region_id, $mid_start, $mid_end, $ori,
             $seq_region_id,$start_start, $start_end
905 906 907 908 909
            );
        }
      }

      #update sr_name cache
910 911 912 913 914
      my $arr = [ $mid_seq_region_id, $mid_seq_region,
                  $mid_cs_id, $mid_length ];

      $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
      $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
915 916 917

      push @mid_ranges,[$mid_seq_region_id,$mid_seq_region,
                        $mid_start,$mid_end];
918
      push @start_ranges, [ $seq_region_id, $start_start, $start_end ];
919 920 921

      #the region that we actually register may actually be larger or smaller
      #than the region that we wanted to register.
922
      #register the intersection of the region so we do not end up doing
923 924 925
      #extra work later

      if($start_start < $start || $start_end > $end) {
926
        $start_registry->check_and_register($seq_region_id,$start_start,
927 928 929
                                            $start_end);
      }
    }
930
    $sth->finish();
931 932 933 934 935 936
  }

  # in the one step case, we load the mid ranges in the
  # last_registry and we are done
  if( ! defined $mid_cs ) {
    for my $range ( @mid_ranges ) {
937
      $end_registry->check_and_register( $range->[0], $range->[2],
938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953
                                         $range->[3] );
    }

    # and thats it for the simple case ...
    return;
  }


  ###########
  # now the second half of the mapping
  # perform another query and load the mid <-> end mapper using the mid cs
  # ranges
  #

  #ascertain which is component and which is actually assembled coord system
  @path = @{$csa->get_mapping_path($mid_cs, $end_cs)};
954 955 956 957 958
  if(@path == 2 || ( @path == 3 && !defined $path[1])) {

    $asm_cs = $path[0];
    $cmp_cs = $path[-1];
  } else {
959 960 961 962 963 964 965 966 967
    my $path = join(',', map({$_->name .' '. $_->version} @path));
    my $len = scalar(@path)-1;
    throw("Unexpected mapping path between intermediate and last" .
	  "coord systems (". $mid_cs->name . " " . $mid_cs->version .
	  " and " . $end_cs->name . " " . $end_cs->version . ")." .
	  "\nExpected path length 1, got $len. " .
	  "(path=$path)");
  }

968 969
  if(defined($to_slice)){
    my $to_cs = $to_slice->coord_system;
970 971
    if($asm_cs->equals($to_cs)){
      $asm2cmp_sth = $self->prepare($asm2cmp);
972 973
      $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id");
    }
974
    elsif($cmp_cs->equals($to_cs)){
975
      $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id");
976
      $cmp2asm_sth = $self->prepare($cmp2asm);
977 978 979 980 981 982 983
    }
    else{
      $asm2cmp_sth = $self->prepare($asm2cmp);
      $cmp2asm_sth = $self->prepare($cmp2asm);
    }
  }

984 985 986 987 988
  $sth = ($asm_cs->equals($mid_cs)) ? $asm2cmp_sth : $cmp2asm_sth;

  my $end_cs_id = $end_cs->dbID();
  foreach my $mid_range (@mid_ranges) {
    my ($mid_seq_region_id, $mid_seq_region,$start, $end) = @$mid_range;
989 990 991 992 993
    $sth->bind_param(1,$mid_seq_region_id,SQL_INTEGER);
    $sth->bind_param(2,$start,SQL_INTEGER);
    $sth->bind_param(3,$end,SQL_INTEGER);
    $sth->bind_param(4,$end_cs_id,SQL_INTEGER);
    $sth->execute();
994 995 996 997

    #load the end <-> mid mapper with the results and record the mid cs
    #ranges we just added to the mapper

998
    my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, $end_length,
999 1000 1001
        $ori, $mid_start, $mid_end);

    $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id,
1002
		       \$end_seq_region, \$end_length, \$ori, \$mid_start,
1003 1004 1005 1006 1007
		       \$mid_end);

    while($sth->fetch()) {
      $end_mid_mapper->add_map_coordinates
        (
1008 1009
         $end_seq_region_id, $end_start, $end_end, $ori,
         $mid_seq_region_id, $mid_start, $mid_end
1010 1011 1012
        );

      #update sr_name cache
1013 1014 1015 1016
      my $arr = [ $end_seq_region_id,$end_seq_region,$end_cs_id,$end_length ];

      $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr;
      $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr;
1017 1018

      #register this region on the end coord system
1019
      $end_registry->check_and_register($end_seq_region_id, $end_start, $end_end);
1020
    }
1021
    $sth->finish();
1022 1023 1024 1025 1026 1027 1028 1029
  }

  #########
  # Now that both halves are loaded
  # Do stepwise mapping using both of the loaded mappers to load
  # the final start <-> end mapper
  #

1030 1031 1032 1033 1034 1035 1036
  _build_combined_mapper(\@start_ranges, $start_mid_mapper, $end_mid_mapper,
                         $combined_mapper, $start_name);
  #all done!
  return;
}


1037
=head2 _register_chained_special
1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129

  Arg [1]    : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
               The chained assembly mapper to register regions on
  Arg [2]    : string $from ('first' or 'last')
               The direction we are registering from, and the name of the
               internal mapper.
  Arg [3]    : string $seq_region_name
               The name of the seqregion we are registering on
  Arg [4]    : listref $ranges
               A list  of ranges to register (in [$start,$end] tuples).
  Arg [5]    : (optional) $to_slice
               Only register those on this Slice.
  Description: Registers a set of ranges on a chained assembly mapper.
               This function is at the heart of the chained mapping process.
               It retrieves information from the assembly table and
               dynamically constructs the mappings between two coordinate
               systems which are 2 mapping steps apart. It does this by using
               two internal mappers to load up a third mapper which is
               actually used by the ChainedAssemblyMapper to perform the
               mapping.

               This method must be called before any mapping is
               attempted on regions of interest, otherwise only gaps will
               be returned.  Note that the ChainedAssemblyMapper automatically
               calls this method when the need arises.
  Returntype : none
  Exceptions : throw if the seq_region to be registered does not exist
               or if it associated with multiple assembled pieces (bad data
               in assembly table)

               throw if the mapping between the coordinate systems cannot
               be performed in two steps, which means there is an internal
               error in the data in the meta table or in the code that creates
               the mapping paths.
  Caller     : Bio::EnsEMBL::AssemblyMapper
  Status     : Stable

=cut

sub _register_chained_special {
  my $self = shift;
  my $casm_mapper = shift;
  my $from = shift;