BaseFeatureAdaptor.pm 48.1 KB
Newer Older
1 2
=head1 LICENSE

3
Copyright [1999-2013] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4

5 6 7 8 9 10 11 12 13 14 15 16 17
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut
18 19 20 21 22


=head1 CONTACT

  Please email comments or questions to the public Ensembl
Magali Ruffier's avatar
Magali Ruffier committed
23
  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
24 25

  Questions may also be sent to the Ensembl help desk at
Magali Ruffier's avatar
Magali Ruffier committed
26
  <http://www.ensembl.org/Help/Contact>.
27 28

=cut
29 30 31

=head1 NAME

32
Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor - An Abstract Base class for all
Graham McVicker's avatar
Graham McVicker committed
33
FeatureAdaptors
34 35 36

=head1 SYNOPSIS

37
Abstract class - should not be instantiated.  Implementation of
38 39 40 41 42
abstract methods must be performed by subclasses.

=head1 DESCRIPTION

This is a base adaptor for feature adaptors. This base class is simply a way
43
of eliminating code duplication through the implementation of methods
44 45
common to all feature adaptors.

46 47
=head1 METHODS

48 49 50
=cut

package Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor;
51
use vars qw(@ISA @EXPORT);
52 53 54
use strict;

use Bio::EnsEMBL::DBSQL::BaseAdaptor;
55
use Bio::EnsEMBL::Utils::Cache;
56
use Bio::EnsEMBL::Utils::Exception qw(warning throw deprecate stack_trace_dump);
57
use Bio::EnsEMBL::Utils::Argument qw(rearrange);
Nathan Johnson's avatar
Nathan Johnson committed
58
use Bio::EnsEMBL::Utils::Iterator;
59
use Bio::EnsEMBL::Utils::Scalar qw/assert_ref/;
60 61 62

@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);

63 64
@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
65
our $SLICE_FEATURE_CACHE_SIZE    = 4;
66
our $MAX_SPLIT_QUERY_SEQ_REGIONS = 3;
67
our $SILENCE_CACHE_WARNINGS = 0;
68

69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
=head2 new

  Arg [1]    : list of args @args
               Superclass constructor arguments
  Example    : none
  Description: Constructor which warns if caching has been switched off
  Returntype : Bio::EnsEMBL::BaseFeatureAdaptor
  Exceptions : none
  Caller     : implementing subclass constructors
  Status     : Stable

=cut

sub new {
  my ($class, @args) = @_;
  my $self = $class->SUPER::new(@args);
  if ( defined $self->db->no_cache() && $self->db->no_cache() && ! $SILENCE_CACHE_WARNINGS) {
    warning(  "You are using the API without caching most recent features. "
            . "Performance might be affected." );
  }
  return $self;
}

Monika Komorowska's avatar
Monika Komorowska committed
92 93 94 95 96 97 98 99
=head2 start_equals_end

  Arg [1]    : (optional) boolean $newval
  Example    : $bfa->start_equals_end(1);
  Description: Getter/Setter for the start_equals_end flag.  If set
               to true sub _slice_fetch will use a simplified sql to retrieve 1bp slices.
  Returntype : boolean
  Exceptions : none
100
  Caller     : EnsemblGenomes variation DB build
Monika Komorowska's avatar
Monika Komorowska committed
101 102 103 104 105 106 107 108 109 110 111 112 113 114
  Status     : Stable
  
=cut

sub start_equals_end {
  my ( $self, $value ) = @_;

  if ( defined($value) ) {
    $self->{'start_equals_end'} = $value;
  }
  return $self->{'start_equals_end'};
}


115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
=head2 clear_cache

  Args      : None
  Example   : my $sa =
                $registry->get_adaptor( 'Mus musculus', 'Core',
                                        'Slice' );
              my $ga =
                $registry->get_adaptor( 'Mus musculus', 'Core',
                                        'Gene' );

              my $slice =
                $sa->fetch_by_region( 'Chromosome', '1', 1e8,
                                      1.05e8 );

              my $genes = $ga->fetch_all_by_Slice($slice);

              $ga->clear_cache();

  Description   : Empties the feature cache associated with this
                  feature adaptor.
  Return type   : None
  Exceptions    : None
  Caller        : General
  Status        : At risk (under development)

=cut

sub clear_cache {
143 144 145 146 147 148 149 150 151
  my ($self) = @_;
  $self->_clear_slice_feature_cache();
  if(!$self->_no_id_cache()) {
    $self->_id_cache()->clear_cache();
  }
  return;
}

sub _clear_slice_feature_cache {
152
  my ($self) = @_;
153
  %{$self->{_slice_feature_cache}} = ();
154 155 156 157 158 159
  return;
}

=head2 _slice_feature_cache
 
  Description	: Returns the feature cache if we are allowed to cache and
160 161
                will build it if we need to. We will never return a reference
                to the hash to avoid unintentional auto-vivfying caching
162 163 164 165 166 167 168 169
  Returntype 	: Bio::EnsEMBL::Utils::Cache
  Exceptions 	: None
  Caller     	: Internal

=cut

sub _slice_feature_cache {
  my ($self) = @_;
170
  return if $self->db()->no_cache();
171
  if(! exists $self->{_slice_feature_cache}) {
172 173
    tie my %cache, 'Bio::EnsEMBL::Utils::Cache', $SLICE_FEATURE_CACHE_SIZE;
    $self->{_slice_feature_cache} = \%cache;
174 175
  }
  return $self->{_slice_feature_cache};
176 177
}

178
=head2 fetch_all_by_Slice
179 180 181 182 183

  Arg [1]    : Bio::EnsEMBL::Slice $slice
               the slice from which to obtain features
  Arg [2]    : (optional) string $logic_name
               the logic name of the type of features to obtain
184 185 186 187 188
  Example    : $fts = $a->fetch_all_by_Slice($slice, 'Swall');
  Description: Returns a listref of features created from the database 
               which are on the Slice defined by $slice. If $logic_name is 
               defined only features with an analysis of type $logic_name 
               will be returned. 
Patrick Meidl's avatar
Patrick Meidl committed
189 190 191 192 193
               NOTE: only features that are entirely on the slice's seq_region
               will be returned (i.e. if they hang off the start/end of a
               seq_region they will be discarded). Features can extend over the
               slice boundaries though (in cases where you have a slice that
               doesn't span the whole seq_region).
194
  Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
195 196
  Exceptions : none
  Caller     : Bio::EnsEMBL::Slice
197
  Status     : Stable
198 199 200

=cut

201
sub fetch_all_by_Slice {
202 203
  my ($self, $slice, $logic_name) = @_;
  #fetch by constraint with empty constraint
204
  return $self->fetch_all_by_Slice_constraint($slice, '', $logic_name);
205 206 207
}


208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233

=head2 fetch_Iterator_by_Slice_method

  Arg [1]    : CODE ref of Slice fetch method
  Arg [2]    : ARRAY ref of parameters for Slice fetch method
  Arg [3]    : Optional int: Slice index in parameters array
  Arg [4]    : Optional int: Slice chunk size. Default=500000
  Example    : my $slice_iter = $feature_adaptor->fetch_Iterator_by_Slice_method
                               	      ($feature_adaptor->can('fetch_all_by_Slice_Arrays'),
	                                   \@fetch_method_params,
	                                   0,#Slice idx
	                                  );

               while(my $feature = $slice_iter->next && defined $feature){
                 #Do something here
               }

  Description: Creates an Iterator which chunks the query Slice to facilitate
               large Slice queries which would have previously run out of memory
  Returntype : Bio::EnsEMBL::Utils::Iterator
  Exceptions : Throws if mandatory params not valid
  Caller     : general
  Status     : at risk

=cut

234 235
#Does not support Collections. See Funcgen ResultFeatureAdaptor::fetch_collection_Iterator_by_Slice_method

236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
sub fetch_Iterator_by_Slice_method{
  my ($self, $slice_method_ref, $params_ref, $slice_idx, $chunk_size) = @_;

  if(! ( defined $slice_method_ref &&
		 ref($slice_method_ref) eq 'CODE')
	){
	throw('Must pass a valid Slice fetch method CODE ref');
  }

  if (! ($params_ref && 
		 ref($params_ref) eq 'ARRAY')) {
	#Don't need to check size here so long as we have valid Slice
	throw('You must pass a method params ARRAYREF');
  }
  
  $slice_idx    = 0 if(! defined $slice_idx);
  my $slice     = $params_ref->[$slice_idx];
  $chunk_size ||= 1000000;
		
  my @feat_cache;
  my $finished     = 0;
  my $start        = 1;	#local coord for sub slice
  my $end          = $slice->length;
  my $num_overlaps = 0;
  
  my $coderef = 
	sub {
	  
	  while (scalar(@feat_cache) == 0 &&
			 ! $finished) {
		
267
		my $new_end = ($start + $chunk_size - 1);
268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285
		
		if ($new_end >= $end) {
		  # this is our last chunk
		  $new_end = $end;
		  $finished = 1;  
		}
		
		#Chunk by sub slicing
		my $sub_slice             = $slice->sub_Slice($start, $new_end);
		$params_ref->[$slice_idx] = $sub_slice;
		@feat_cache = @{ $slice_method_ref->($self, @$params_ref)};
		
		#Remove & count overlapping features
		splice(@feat_cache, 0, $num_overlaps) if($num_overlaps);
		my $i;
		
		if (scalar(@feat_cache) > 0) {
		  
286
		  my $feat_end  = $feat_cache[$#feat_cache]->seq_region_end;
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303
		  my $slice_end = $sub_slice->end;
		  $num_overlaps = 0;
		  
		  for ($i = $#feat_cache; $i >=0; $i--) {
			
			if ($feat_end > $slice_end) {
			  $feat_end  = $feat_cache[$i]->end;
			  $num_overlaps ++;
			} else {
			  last;
			}
			
		  }
		}
		
		# update the start coordinate
		$start = $new_end + 1;
304
	  }
305 306 307 308 309 310 311 312 313 314
	  
	  #this maybe returning from an undef cache
	  #Need to sub this out even more?
	  return shift @feat_cache;
	};

  return Bio::EnsEMBL::Utils::Iterator->new($coderef);
}


315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334
=head2 fetch_Iterator_by_Slice

  Arg [1]    : Bio::EnsEMBL::Slice
  Arg [2]    : Optional string: logic name of analysis
  Arg [3]    : Optional int: Chunk size to iterate over. Default is 500000
  Example    : my $slice_iter = $feature_adaptor->fetch_Iterator_by_Slice($slice);

               while(my $feature = $slice_iter->next && defined $feature){
                 #Do something here
               }

  Description: Creates an Iterator which chunks the query Slice to facilitate
               large Slice queries which would have previously run out of memory
  Returntype : Bio::EnsEMBL::Utils::Iterator
  Exceptions : None
  Caller     : general
  Status     : at risk

=cut

335 336 337 338 339 340 341 342 343
sub fetch_Iterator_by_Slice{
  my ($self, $slice, $logic_name, $chunk_size) = @_;

  my $method_ref = $self->can('fetch_all_by_Slice');

  return $self->fetch_Iterator_by_Slice_method($method_ref, [$slice, $logic_name], 0, $chunk_size);
}


344
=head2 fetch_all_by_Slice_and_score
345 346 347

  Arg [1]    : Bio::EnsEMBL::Slice $slice
               the slice from which to obtain features
348
  Arg [2]    : (optional) float $score
349 350 351
               lower bound of the the score of the features retrieved
  Arg [3]    : (optional) string $logic_name
               the logic name of the type of features to obtain
352
  Example    : $fts = $a->fetch_all_by_Slice_and_score($slice,90,'Swall');
353 354
  Description: Returns a list of features created from the database which are 
               are on the Slice defined by $slice and which have a score 
355
               greater than $score. If $logic_name is defined, 
356 357
               only features with an analysis of type $logic_name will be 
               returned. 
358
  Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
359 360
  Exceptions : none
  Caller     : Bio::EnsEMBL::Slice
361
  Status     : Stable
362 363 364

=cut

365
sub fetch_all_by_Slice_and_score {
366
  my ( $self, $slice, $score, $logic_name ) = @_;
367

368
  my $constraint;
369
  if ( defined($score) ) {
370 371 372 373 374 375 376
    # Get the synonym of the primary_table
    my @tabs = $self->_tables();
    my $syn  = $tabs[0]->[1];

    $constraint = sprintf( "%s.score > %s",
                $syn,
                $self->dbc()->db_handle()->quote( $score, SQL_FLOAT ) );
377
  }
378 379 380 381

  return
    $self->fetch_all_by_Slice_constraint( $slice, $constraint,
                                          $logic_name );
382
}
383 384


385
=head2 fetch_all_by_Slice_constraint
386

387 388 389 390 391
  Arg [1]    : Bio::EnsEMBL::Slice $slice
               the slice from which to obtain features
  Arg [2]    : (optional) string $constraint
               An SQL query constraint (i.e. part of the WHERE clause)
  Arg [3]    : (optional) string $logic_name
392
               the logic name of the type of features to obtain
393 394
  Example    : $fs = $a->fetch_all_by_Slice_constraint($slc, 'perc_ident > 5');
  Description: Returns a listref of features created from the database which 
395 396 397 398
               are on the Slice defined by $slice and fulfill the SQL 
               constraint defined by $constraint. If logic name is defined, 
               only features with an analysis of type $logic_name will be 
               returned. 
399
  Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
400 401
  Exceptions : thrown if $slice is not defined
  Caller     : Bio::EnsEMBL::Slice
402
  Status     : Stable
403 404 405

=cut

406
sub fetch_all_by_Slice_constraint {
407
  my ( $self, $slice, $constraint, $logic_name ) = @_;
408

409
  my @result;
410

411 412
  #Pull out here as we need to remember them & reset accordingly
  my $bind_params = $self->bind_param_generic_fetch();
413

414 415 416 417
  if ( !ref($slice)
       || !(    $slice->isa('Bio::EnsEMBL::Slice')
             or $slice->isa('Bio::EnsEMBL::LRGSlice') ) )
  {
Graham McVicker's avatar
Graham McVicker committed
418
    throw("Bio::EnsEMBL::Slice argument expected.");
419 420
  }

421
  $constraint ||= '';
422 423
  $constraint =
    $self->_logic_name_to_constraint( $constraint, $logic_name );
424

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
425
  # If the logic name was invalid, undef was returned
426
  if ( !defined($constraint) ) { return [] }
427

428
  my $key;
429
  my $cache;
430

Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
431
  # Will only use feature_cache if hasn't been no_cache attribute set
432 433
  if (
    !( defined( $self->db()->no_cache() ) && $self->db()->no_cache() ) )
434 435
  {

436
    #strain test and add to constraint if so to stop caching.
437 438 439 440 441 442 443 444
    if ( $slice->isa('Bio::EnsEMBL::StrainSlice') ) {
      my $string =
        $self->dbc()->db_handle()->quote( $slice->strain_name() );

      if ( $constraint ne "" ) {
        $constraint .= " AND $string = $string ";
      } else {
        $constraint .= " $string = $string ";
445 446 447
      }
    }

448 449 450 451 452 453 454 455 456 457
    # Check the cache and return the cached results if we have already
    # done this query.  The cache key is the made up from the slice
    # name, the constraint, and the bound parameters (if there are any).
    $key = uc( join( ':', $slice->name(), $constraint ) );

    if ( defined($bind_params) ) {
      $key .= ':'
        . join( ':', map { $_->[0] . '/' . $_->[1] } @{$bind_params} );
    }

458 459
    $cache = $self->_slice_feature_cache();
    if ( exists( $cache->{$key} ) ) {
460 461
      # Clear the bound parameters and return the cached data.
      $self->{'_bind_param_generic_fetch'} = ();
462 463 464 465 466
      #IMPORTANT: NEVER EVER RETURN A COPY OF THE DATA STRUCTURE.
      #           This will hold arrays of values. Since we've been doing
      #           this for so long people are expecting multiple calls
      #           to fetch_by_SliceXXXXX() methods to return the same
      #           array reference.
467
      return $cache->{$key};
Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
468
    }
469
  } ## end if ( !( defined( $self...)))
470

471

472 473
  my $proj_ref = $self->_get_and_filter_Slice_projections($slice);
  my $bounds = $self->_generate_feature_bounds($slice); 
474

475
  # fetch features for the primary slice AND all symlinked slices
476
  foreach my $seg (@{$proj_ref}) {
477 478
    # re-bind the params
    $self->_bind_param_generic_fetch($bind_params); 
479 480 481
    my $offset    = $seg->from_start();
    my $seg_slice = $seg->to_Slice();
    my $features =
482
      $self->_slice_fetch( $seg_slice, $constraint );
483

484 485 486
    # If this was a symlinked slice offset the feature coordinates as
    # needed.
    if ( $seg_slice->name() ne $slice->name() ) {
487 488

    FEATURE:
489 490 491 492
      foreach my $f ( @{$features} ) {
        if ( $offset != 1 ) {
          $f->{'start'} += $offset - 1;
          $f->{'end'}   += $offset - 1;
493
        }
494 495

        # discard boundary crossing features from symlinked regions
496
        foreach my $bound (@{$bounds}) {
497
          if ( $f->{'start'} < $bound && $f->{'end'} >= $bound ) {
498 499 500 501
            next FEATURE;
          }
        }

502
        $f->{'slice'} = $slice;
503
        push( @result, $f );
504
      }
505 506
    } else {
      push( @result, @{$features} );
507
    }
508
  } ## end foreach my $seg (@proj)
509

510 511
  # Will only use feature_cache when set attribute no_cache in DBAdaptor
  if ( defined($key) ) {
512
    $cache->{$key} = \@result;
513
  }
514

515
  return \@result;
516
} ## end sub fetch_all_by_Slice_constraint
517 518


519 520
=head2 fetch_all_by_logic_name

521
  Arg [1]    : string $logic_name
522 523
               the logic name of the type of features to obtain
  Example    : $fs = $a->fetch_all_by_logic_name('foobar');
524 525 526 527 528
  Description: Returns a listref of features created from the database.
               only features with an analysis of type $logic_name will
               be returned.  If the logic name is invalid (not in the
               analysis table), a reference to an empty list will be
               returned.
529
  Returntype : listref of Bio::EnsEMBL::SeqFeatures
530
  Exceptions : thrown if no $logic_name
531
  Caller     : General
532
  Status     : Stable
533 534 535 536

=cut

sub fetch_all_by_logic_name {
537 538 539 540 541 542 543 544
  my ( $self, $logic_name ) = @_;

  if ( !defined($logic_name) ) {
    throw("Need a logic_name");
  }

  my $constraint = $self->_logic_name_to_constraint( '', $logic_name );

545 546 547 548 549
  if ( !defined($constraint) ) {
    warning("Invalid logic name: $logic_name");
    return [];
  }

550
  return $self->generic_fetch($constraint);
551 552
}

553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577
=head2 fetch_all_by_stable_id_list

  Arg [1]    : string $logic_name
               the logic name of the type of features to obtain
  Arg [2]    : Bio::EnsEMBL::Slice $slice
               the slice from which to obtain features
  Example    : $fs = $a->fetch_all_by_stable_id_list(["ENSG00001","ENSG00002", ...]);
  Description: Returns a listref of features identified by their stable IDs.
               This method only fetches features of the same type as the calling
               adaptor. 
               Results are constrained to a slice if the slice is provided.
  Returntype : listref of Bio::EnsEMBL::Feature
  Exceptions : thrown if no stable ID list is provided.
  Caller     : General
  Status     : Stable

=cut

# Adapted from BaseAdaptor->uncached_fetch_all_by_dbID_list
sub fetch_all_by_stable_id_list {
  my ( $self, $id_list_ref, $slice ) = @_;

  return $self->_uncached_fetch_all_by_id_list($id_list_ref,$slice,"stable_id");
}

578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594
# Method that creates an object.  Called by the _objs_from_sth() method
# in the sub-classes (the various feature adaptors).  Overridden by the
# feature collection classes.

sub _create_feature {
  my ( $self, $feature_type, $args ) = @_;
  return $feature_type->new( %{$args} );
}

# This is the same as the above, but calls the new_fast() constructor of
# the feature type.

sub _create_feature_fast {
  my ( $self, $feature_type, $args ) = @_;
  return $feature_type->new_fast($args);
}

595
=head2 count_by_Slice_constraint
Monika Komorowska's avatar
Monika Komorowska committed
596

597 598 599
    Arg [1]     : Bio::EnsEMBL::Slice
    Arg [2]     : String Custom SQL constraint
    Arg [3]     : String Logic name to search by
600
    Description : Finds all features with at least partial overlap to the given
601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616
                  slice and sums them up.
                  Explanation of workings with projections:
                  
                  |-------------------------Constraint Slice---------------------------------|
                  |             |                                           |                |
                  |--Segment 1--|                                           |                |
                  |             |--Segment 2, on desired Coordinate System--|                |
                  |             |                                           |---Segment 3----|
              #Feature 1#    #Feature 2#                               #Feature 3#           |
                  |         #####################Feature 4####################               |
                  | #Feature 5# |                                           |                |
                  
                  Feature 1 is overlapping the original constraint. Counted in Segment 1
                  Feature 2,3 and 4  are counted when inspecting Segment 2
                  Feature 5 is counted in Segment 1
                  
617 618
    Returntype  : Integer
=cut
619

620
sub count_by_Slice_constraint {
621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636
    my ($self, $slice, $constraint, $logic_name) = @_;
    
    assert_ref($slice, 'Bio::EnsEMBL::Slice', 'slice');
    
    my $count = 0;
    
    #Remember the bind params
    my $bind_params = $self->bind_param_generic_fetch();
    
    #Table synonym
    my @tables = $self->_tables;
    my ($table_name, $table_synonym) = @{ $tables[0] };
    
    # Basic constraints limit the feature hits to the starting Slice constraint
    $constraint ||= '';
    $constraint = $self->_logic_name_to_constraint( $constraint, $logic_name );
637
  
638 639 640 641 642 643 644 645 646 647 648 649
    return $count if ! defined $constraint;
  
    #Query logic
    my $sa = $slice->adaptor();
    my $projections = $self->_get_and_filter_Slice_projections($slice);

    my $segment_count = @{$projections};
    #Manual loop to support look-ahead/behind
    for (my $i = 0; $i < $segment_count; $i++) {
        my $seg = $projections->[$i];
        my $seg_slice = $seg->to_Slice();
        $self->_bind_param_generic_fetch($bind_params);
650
    
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668
        # We cannot filter boundary crossing features in code, so we constrain the
        # SQL. We detect when we are not on the original query Slice and manually
        # filter by the segment Slice's start and end. If we are on "earlier" (5')
        # projection segments, we only limit by the *end* and when we are on the later
        # projection segments, we filter by the *start*.
        
        # This is the same as fetch_all_by_Slice_constraint()'s in-memory filtering
        # except we need to alter projected features in that code with an offset
        my $local_constraint = $constraint;
        if ( $seg_slice->name() ne $slice->name() ) {
            my ($limit_start, $limit_end) = (1,1); #fully constrained by default, flags are reset every iteration
            if($i == 0) {
                $limit_start = 0;
            } elsif ($i == ($segment_count - 1)) {
                $limit_end = 0; #don't check end as we are on the final projection
            }
            $local_constraint .= ' AND ' if $local_constraint;
            my @conditionals;
669
      
670 671 672 673 674 675 676 677
            if($limit_end) {
            # Restrict features to the end of this projection segment when after the named segment 
                push(@conditionals, sprintf('%1$s.seq_region_end <= %2$d', $table_synonym, $seg_slice->end));
            }
            if($limit_start) {
            #Do not cross the start boundary on this projection segment
                 push(@conditionals, sprintf('%1$s.seq_region_start >= %2$d', $table_synonym, $seg_slice->start));
            }
678
      
679 680
            $local_constraint .= join(q{ AND }, @conditionals);
        }
681
    
682 683 684
  	    my $count_array = $self->_get_by_Slice($seg_slice, $local_constraint, 'count');
  	    $count += $_ for @$count_array;
    }
685
	
686 687
	return $count;
}
688

689 690 691
=head2 _get_and_filter_Slice_projections

    Arg [1]     : Bio::EnsEMBL::Slice
692 693
    Description : Delegates onto SliceAdaptor::fetch_normalized_slice_projection() 
                  with filtering on
694 695 696 697 698 699 700
    Returntype  : ArrayRef Bio::EnsEMBL::ProjectionSegment; Returns an array
                  of projected segments
=cut

sub _get_and_filter_Slice_projections {
  my ($self, $slice) = @_;
  my $sa = $slice->adaptor();
701 702
  my $filter_projections = 1;
  return $sa->fetch_normalized_slice_projection($slice, $filter_projections);
703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737
}

=head2 _generate_feature_bounds

    Arg [1]     : Bio::EnsEMBL::Slice
    Description : Performs a projection of Slice and records the bounds
                  of that projection. This can be used later on to restrict
                  Features which overlap into unwanted areas such as
                  regions which exist on another HAP/PAR region.
                  
                  Bounds are defined as projection_start - slice_start + 1.
    Example     : my $bounds = $self->_generate_feature_bounds($slice);
    Returntype  : ArrayRef Integer; Returns the location of the bounds.
=cut

sub _generate_feature_bounds {
  my ($self, $slice) = @_;
  my $sa = $slice->adaptor();
  # construct list of Hap/PAR boundaries for entire seq region
  my @bounds;

  my $sr_id = $slice->get_seq_region_id();
  my $ent_slice = $sa->fetch_by_seq_region_id($sr_id);
  if ( $slice->strand() == -1 ) {
    $ent_slice = $ent_slice->invert();
  }

  my @ent_proj = @{ $sa->fetch_normalized_slice_projection($ent_slice) };
  shift(@ent_proj);    # skip first; 1st does not have bounds normally; may change if we ever have a patch a pos 1 

  @bounds = map { $_->from_start() - $slice->start() + 1 } @ent_proj;
  
  return \@bounds;
}

738 739 740 741 742 743 744 745
=head2 _get_by_Slice
    Arg [0]    : Bio::EnsEMBL::Slice to find all the features within
    Arg [1]    : SQL constraint string
    Arg [2]    : Type of query to run. Default behaviour is to select, but 
                 'count' is also valid
    Description: Abstracted logic from _slice_fetch
    Returntype : Listref of Bio::EnsEMBL::Feature, or integers for counting mode
=cut
746

747
sub _get_by_Slice {
748
    my ($self, $slice, $orig_constraint, $query_type) = @_;
749 750 751 752 753 754 755 756 757 758
    
    # features can be scattered across multiple coordinate systems
    my @tables = $self->_tables;
    my ($table_name, $table_synonym) = @{ $tables[0] };
    my $mapper;
    my @feature_coord_systems;
    
    my $meta_values = $self->db->get_MetaContainer->list_value_by_key( $table_name."build.level");
    if ( @$meta_values and $slice->is_toplevel() ) {
        push @feature_coord_systems, $slice->coord_system();
759
    } else {
760
        @feature_coord_systems = @{ $self->db->get_MetaCoordContainer->fetch_all_CoordSystems_by_feature_type($table_name)};
761 762
        unless( @feature_coord_systems) {
            warning("No CoordinateSystems defined for $table_name table. Please"
Magali Ruffier's avatar
Magali Ruffier committed
763
            ." check meta_coord table and consider running ensembl/misc-scripts/meta_coord/update_meta_coord.pl");
764
        }
765 766 767 768
    }
	
    my $assembly_mapper_adaptor = $self->db->get_AssemblyMapperAdaptor();
    my @pan_coord_features;
Dan Staines's avatar
Dan Staines committed
769
        
770 771 772
COORD_SYSTEM: foreach my $coord_system (@feature_coord_systems) {
        my @query_accumulator;
        # Build up a combination of query constraints that will quickly establish the result set
773
        my $constraint = $orig_constraint;
774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822
        if ( $coord_system->equals( $slice->coord_system ) ) {
            my $max_len = $self->_max_feature_length
                || $self->db->get_MetaCoordContainer
                    ->fetch_max_length_by_CoordSystem_feature_type( $coord_system,$table_name );
                       
            my $seq_region_id;
            if ( $slice->adaptor ) {
                $seq_region_id = $slice->adaptor->get_seq_region_id($slice);
            } else {
                $seq_region_id = $self->db->get_SliceAdaptor->get_seq_region_id($slice);
            }
            
            my @seq_region_ids = ($seq_region_id);
            while (1) {
                my $ext_seq_region_id = $self->get_seq_region_id_external($seq_region_id);
        
                if ( $ext_seq_region_id == $seq_region_id ) { last }
        
                push( @seq_region_ids, $ext_seq_region_id );
                $seq_region_id = $ext_seq_region_id;
            }
            
            $constraint .= " AND " if ($constraint);

            $constraint .= ${table_synonym}.".seq_region_id IN (". join( ',', @seq_region_ids ) . ") AND ";
            
            #faster query for 1bp slices where SNP data is not compressed
            if ( $self->start_equals_end && $slice->start == $slice->end ) {
                $constraint .= " AND ".$table_synonym.".seq_region_start = ".$slice->end .
                  " AND ".$table_synonym.".seq_region_end = ".$slice->start;
            
            } else {
                if ( !$slice->is_circular() ) {
                    # Deal with the default case of a non-circular chromosome.
                    $constraint .= $table_synonym.".seq_region_start <= ".$slice->end." AND "
                                   .$table_synonym.".seq_region_end >= ".$slice->start;
            
                    if ( $max_len ) {
                        my $min_start = $slice->start - $max_len;
                        $constraint .= " AND ".$table_synonym.".seq_region_start >= ".$min_start;
                    }
            
                } else {
                    # Deal with the case of a circular chromosome.
                    if ( $slice->start > $slice->end ) {
                        $constraint .= " ( ".$table_synonym.".seq_region_start >= ".$slice->start
                            . " OR ".$table_synonym.".seq_region_start <= ".$slice->end
                            . " OR ".$table_synonym.".seq_region_end >= ".$slice->start
                            . " OR ".$table_synonym.".seq_region_end <= ".$slice->end
823
                            . " OR ".$table_synonym.".seq_region_start > ".$table_synonym.".seq_region_end)";
824 825 826 827 828 829 830 831 832 833 834
                    } else {
                        $constraint .= " ((".$table_synonym.".seq_region_start <= ".$slice->end
                            . " AND ".$table_synonym.".seq_region_end >= ".$slice->start.") "
                            . "OR (".$table_synonym.".seq_region_start > ".$table_synonym.".seq_region_end"
                            . " AND (".$table_synonym.".seq_region_start <= ".$slice->end
                            . " OR ".$table_synonym.".seq_region_end >= ".$slice->start.")))";
                  }
              }
           }
           push @query_accumulator, [$constraint,undef,$slice]; # $mapper intentionally absent here.
           
Dan Staines's avatar
Dan Staines committed
835 836
        } else { 
        	#coordinate systems do not match
837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888
            $mapper = $assembly_mapper_adaptor->fetch_by_CoordSystems( $slice->coord_system(), $coord_system );
            next unless defined $mapper;

            # Get list of coordinates and corresponding internal ids for
            # regions the slice spans
            my @coords = $mapper->map( $slice->seq_region_name, $slice->start, $slice->end,
                                    $slice->strand, $slice->coord_system );

            @coords = grep { !$_->isa('Bio::EnsEMBL::Mapper::Gap') } @coords;

            next COORD_SYSTEM if ( !@coords );

            my @ids = map { $_->id() } @coords;
            #coords are now id rather than name
            
            if ( @coords > $MAX_SPLIT_QUERY_SEQ_REGIONS && ! $slice->isa('Bio::EnsEMBL::LRGSlice') 
                    && $slice->coord_system->name() ne 'lrg') {
                $constraint = $orig_constraint;
                my $id_str = join( ',', @ids );
                $constraint .= " AND " if ($constraint);
                $constraint .= $table_synonym.".seq_region_id IN ($id_str)";
                
                push @query_accumulator, [$constraint,$mapper,$slice];
            } else {
                my $max_len = (
                    $self->_max_feature_length()
                    || $self->db->get_MetaCoordContainer
                       ->fetch_max_length_by_CoordSystem_feature_type($coord_system, $table_name) 
                );

                my $length = @coords;
                for ( my $i = 0; $i < $length; $i++ ) {
                    $constraint = $orig_constraint;
                    $constraint .= " AND " if ($constraint);
                    $constraint .= $table_synonym.".seq_region_id = "
                        . $ids[$i] . " AND "
                        . $table_synonym.".seq_region_start <= "
                        . $coords[$i]->end() . " AND "
                        . $table_synonym.".seq_region_end >= "
                        . $coords[$i]->start();

                    if ($max_len) {
                        my $min_start = $coords[$i]->start() - $max_len;
                        $constraint .= " AND ".$table_synonym.".seq_region_start >= ".$min_start;
                    }
                    
                    push @query_accumulator, [$constraint,$mapper,$slice];
                } # end multi-query cycle
		    } # end else
            
	   } # end else (coord sytems not matching)
	   
889 890 891
	   #Record the bind params if we have to do multiple queries
	   my $bind_params = $self->bind_param_generic_fetch();
	   
892 893
	   foreach my $query (@query_accumulator) {
	       my ($local_constraint,$local_mapper,$local_slice) = @$query;
894
	       $self->_bind_param_generic_fetch($bind_params);
895
    	   if ($query_type and $query_type eq 'count') {
896 897 898 899 900 901
  	       push @pan_coord_features, $self->generic_count($local_constraint);
    	   } 
    	   else {
           my $features = $self->generic_fetch( $local_constraint, $local_mapper, $local_slice );
           $features = $self->_remap( $features, $local_mapper, $local_slice );
           push @pan_coord_features, @$features;
902 903 904 905 906 907 908 909 910 911 912
    	   }
	   }
	   $mapper = undef;
    } # End foreach
    return \@pan_coord_features;
}
#
# helper function used by fetch_all_by_Slice_constraint method
#
sub _slice_fetch {
  my ( $self, $slice, $orig_constraint ) = @_;
913

914
  my $features = $self->_get_by_Slice($slice,$orig_constraint);
915

916
  return $features;
917
} ## end sub _slice_fetch
918

919

920
#for a given seq_region_id, gets the one used in an external database, if present, otherwise, returns the internal one
921 922
sub get_seq_region_id_external {
  my ( $self, $sr_id ) = @_;
923 924 925
  my $cs_a = $self->db()->get_CoordSystemAdaptor();
  return ( exists( $cs_a->{'_internal_seq_region_mapping'}->{$sr_id} )
           ? $cs_a->{'_internal_seq_region_mapping'}->{$sr_id}
926
           : $sr_id );
927 928 929 930
}

#for a given seq_region_id and coord_system, gets the one used in the internal (core) database
sub get_seq_region_id_internal{
931 932 933 934 935
  my ( $self, $sr_id ) = @_;
  my $cs_a = $self->db()->get_CoordSystemAdaptor();
  return (  exists $cs_a->{'_external_seq_region_mapping'}->{$sr_id} 
            ? $cs_a->{'_external_seq_region_mapping'}->{$sr_id} 
            : $sr_id);
936
}
937

938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955
#
# Helper function containing some common feature storing functionality
#
# Given a Feature this will return a copy (or the same feature if no changes 
# to the feature are needed) of the feature which is relative to the start
# of the seq_region it is on. The seq_region_id of the seq_region it is on
# is also returned.
#
# This method will also ensure that the database knows which coordinate
# systems that this feature is stored in.
#

sub _pre_store {
  my $self    = shift;
  my $feature = shift;

  if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
    throw('Expected Feature argument.');
956
  }
957
  my $slice = $feature->slice();
958

959
  $self->_check_start_end_strand($feature->start(),$feature->end(),
960
                                 $feature->strand(), $slice);
961

962 963 964 965

  my $db = $self->db();

  my $slice_adaptor = $db->get_SliceAdaptor();
966

Ian Longden's avatar
Ian Longden committed
967
  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))  ) {
968 969 970
    throw('Feature must be attached to Slice to be stored.');
  }

971
  # make sure feature coords are relative to start of entire seq_region
972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022

  if($slice->start != 1 || $slice->strand != 1) {
    #move feature onto a slice of the entire seq_region
    $slice = $slice_adaptor->fetch_by_region($slice->coord_system->name(),
                                             $slice->seq_region_name(),
                                             undef, #start
                                             undef, #end
                                             undef, #strand
                                             $slice->coord_system->version());

    $feature = $feature->transfer($slice);

    if(!$feature) {
      throw('Could not transfer Feature to slice of ' .
            'entire seq_region prior to storing');
    }
  }

  # Ensure this type of feature is known to be stored in this coord system.
  my $cs = $slice->coord_system;

  my ($tab) = $self->_tables();
  my $tabname = $tab->[0];

  my $mcc = $db->get_MetaCoordContainer();

  $mcc->add_feature_type($cs, $tabname, $feature->length);

  my $seq_region_id = $slice_adaptor->get_seq_region_id($slice);

  if(!$seq_region_id) {
    throw('Feature is associated with seq_region which is not in this DB.');
  }

  return ($feature, $seq_region_id);
}


# The same function as _pre_store
# This one is used to store user uploaded features in XXX_userdata db

sub _pre_store_userdata {
  my $self    = shift;
  my $feature = shift;

  if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
    throw('Expected Feature argument.');
  }

  my $slice = $feature->slice();
  my $slice_adaptor = $slice->adaptor;
1023 1024 1025 1026
  
  $self->_check_start_end_strand($feature->start(),$feature->end(),
                                 $feature->strand(), $slice);

1027

Ian Longden's avatar
Ian Longden committed
1028
  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
1029 1030 1031 1032 1033
    throw('Feature must be attached to Slice to be stored.');
  }

  # make sure feature coords are relative to start of entire seq_region

1034
  if($slice->start != 1 || $slice->strand != 1) {
1035
    #move feature onto a slice of the entire seq_region
1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050
    $slice = $slice_adaptor->fetch_by_region($slice->coord_system->name(),
                                             $slice->seq_region_name(),
                                             undef, #start
                                             undef, #end
                                             undef, #strand
                                             $slice->coord_system->version());

    $feature = $feature->transfer($slice);

    if(!$feature) {
      throw('Could not transfer Feature to slice of ' .
            'entire seq_region prior to storing');
    }
  }

1051
  # Ensure this type of feature is known to be stored in this coord system.
1052 1053 1054 1055 1056
  my $cs = $slice->coord_system;

  my ($tab) = $self->_tables();
  my $tabname = $tab->[0];

Anne Parker's avatar
 
Anne Parker committed
1057
  my $db = $self->db;
1058 1059
  my $mcc = $db->get_MetaCoordContainer();

Laura Clarke's avatar
 
Laura Clarke committed
1060
  $mcc->add_feature_type($cs, $tabname, $feature->length);
1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080

  my $seq_region_id = $slice_adaptor->get_seq_region_id($slice);

  if(!$seq_region_id) {
    throw('Feature is associated with seq_region which is not in this DB.');
  }

  return ($feature, $seq_region_id);
}


#
# helper function used to validate start/end/strand and 
# hstart/hend/hstrand etc.
#
sub _check_start_end_strand {
  my $self = shift;
  my $start = shift;
  my $end   = shift;
  my $strand = shift;
1081
  my $slice = shift;
1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094

  #
  # Make sure that the start, end, strand are valid
  #
  if(int($start) != $start) {
    throw("Invalid Feature start [$start].  Must be integer.");
  }
  if(int($end) != $end) {
    throw("Invalid Feature end [$end]. Must be integer.");
  }
  if(int($strand) != $strand || $strand < -1 || $strand > 1) {
    throw("Invalid Feature strand [$strand]. Must be -1, 0 or 1.");
  }
1095
  if($end < $start && !$slice->is_circular()) {
1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109
    throw("Invalid Feature start/end [$start/$end]. Start must be less " .
          "than or equal to end.");
  }

  return 1;
}


#
# Given a list of features checks if they are in the correct coord system
# by looking at the first features slice.  If they are not then they are
# converted and placed on the slice.
#
sub _remap {
1110
  my ( $self, $features, $mapper, $slice ) = @_;
1111 1112 1113 1114 1115

  #check if any remapping is actually needed
  if(@$features && (!$features->[0]->isa('Bio::EnsEMBL::Feature') ||
                    $features->[0]->slice == $slice)) {
    return $features;
1116
  }
1117

1118
  #remapping has not been done, we have to do our own conversion from
1119 1120 1121 1122 1123 1124 1125 1126 1127
  #to slice coords

  my @out;

  my $slice_start = $slice->start();
  my $slice_end   = $slice->end();
  my $slice_strand = $slice->strand();
  my $slice_cs    = $slice->coord_system();

1128
  my ($seq_region_id, $start, $end, $strand);
1129

1130
  my $slice_seq_region_id = $slice->get_seq_region_id();
1131

1132
  foreach my $f (@$features) {
1133
    #since feats were obtained in contig coords, attached seq is a contig
1134 1135 1136 1137
    my $fslice = $f->slice();
    if(!$fslice) {
      throw("Feature does not have attached slice.\n");
    }
1138
    my $fseq_region_id = $fslice->get_seq_region_id();
Arne Stabenau's avatar