SequenceAdaptor.pm 18.3 KB
Newer Older
1 2
=head1 LICENSE

3
Copyright [1999-2013] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4

5 6 7 8 9 10 11 12 13 14 15 16 17
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut
18 19 20 21 22


=head1 CONTACT

  Please email comments or questions to the public Ensembl
Magali Ruffier's avatar
Magali Ruffier committed
23
  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
24 25

  Questions may also be sent to the Ensembl help desk at
Magali Ruffier's avatar
Magali Ruffier committed
26
  <http://www.ensembl.org/Help/Contact>.
27 28

=cut
29 30 31

=head1 NAME

Graham McVicker's avatar
Graham McVicker committed
32
Bio::EnsEMBL::DBSQL::SequenceAdaptor - produce sequence strings from locations
33 34 35

=head1 SYNOPSIS

36 37 38 39
  my $sa = $registry->get_adaptor( 'Human', 'Core', 'Sequence' );

  my $dna =
    ${ $sa->fetch_by_Slice_start_end_strand( $slice, 1, 1000, -1 ) };
40 41 42

=head1 DESCRIPTION

43
An adaptor for the retrieval of DNA sequence from the EnsEMBL database
44

45
=head1 METHODS
46 47 48 49

=cut

package Bio::EnsEMBL::DBSQL::SequenceAdaptor;
50

51
use vars qw(@ISA @EXPORT);
52
use strict;
53
use warnings;
54

55
use Bio::EnsEMBL::DBSQL::BaseAdaptor;
56
use Bio::EnsEMBL::Utils::Exception qw(throw deprecate);
57
use Bio::EnsEMBL::Utils::Sequence  qw(reverse_comp);
58
use Bio::EnsEMBL::Utils::Cache;
59
use Bio::EnsEMBL::Utils::Scalar qw( assert_ref );
60

61
@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
62

63 64 65 66 67
our $SEQ_CHUNK_PWR   = 18; # 2^18 = approx. 250KB
our $SEQ_CACHE_SZ    = 5;
our $SEQ_CACHE_MAX   = (2 ** $SEQ_CHUNK_PWR) * $SEQ_CACHE_SZ;

@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
68 69 70 71 72 73 74

=head2 new

  Arg [1]    : none
  Example    : my $sa = $db_adaptor->get_SequenceAdaptor();
  Description: Constructor.  Calls superclass constructor and initialises
               internal cache structure.
75
  Returntype : Bio::EnsEMBL::DBSQL::SequenceAdaptor
76 77
  Exceptions : none
  Caller     : DBAdaptor::get_SequenceAdaptor
78
  Status     : Stable
79 80 81 82 83

=cut

sub new {
  my $caller = shift;
84

85
  my $class = ref($caller) || $caller;
86

87 88
  my $self = $class->SUPER::new(@_);

89 90 91 92 93
  # use an LRU cache to limit the size
  my %seq_cache;
  tie(%seq_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_CACHE_SZ);

  $self->{'seq_cache'} = \%seq_cache;
94

95 96 97 98 99

#
# See if this has any seq_region_attrib of type "_rna_edit_cache" if so store these
# in a  hash.
#
100
  my $sth = $self->dbc->prepare('select sra.seq_region_id, sra.value from seq_region_attrib sra, attrib_type at where sra.attrib_type_id = at.attrib_type_id and code = "_rna_edit"');
101 102 103 104 105 106 107 108
  
  $sth->execute();
  my ($seq_region_id, $value);
  $sth->bind_columns(\$seq_region_id, \$value);
  my %edits;
  my $count = 0;
  while($sth->fetch()){
    $count++;
109 110 111 112 113 114
    my ($start, $end, $substring) = split (/\s+/, $value);
    my $edit_length = ($end - $start) + 1;
    my $substring_length = length($substring);
    if($edit_length != $substring_length) {
      throw "seq_region_id $seq_region_id has an attrib of type '_rna_edit' (value '$value'). Edit length ${edit_length} is not the same as the replacement's length ${substring_length}. Please fix. We only support substitutions via this mechanism";
    }
115 116 117 118 119 120 121
    push @{$edits{$seq_region_id}}, $value;
  }
  $sth->finish;
  if($count){
    $self->{_rna_edits_cache} = \%edits;
  }
  
122 123
  return $self;
}
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139

=head2 clear_cache

  Example			: $sa->clear_cache();
  Description	: Removes all entries from the associcated sequence cache
  Returntype 	: None
  Exceptions 	: None

=cut

sub clear_cache {
  my ($self) = @_;
  %{$self->{seq_cache}} = ();
  return;
}

140

141
=head2 fetch_by_Slice_start_end_strand
142

Graham McVicker's avatar
Graham McVicker committed
143 144
  Arg  [1]   : Bio::EnsEMBL::Slice slice
               The slice from which you want the sequence
145 146 147 148 149 150 151 152
  Arg  [2]   : (optional) int startBasePair 
               The start base pair relative to the start of the slice. Negative
               values or values greater than the length of the slice are fine.
               default = 1
  Arg  [3]   : (optional) int endBasePair
               The end base pair relative to the start of the slice. Negative
               values or values greater than the length of the slice are fine,
               but the end must be greater than or equal to the start
Graham McVicker's avatar
Graham McVicker committed
153
               count from 1
154 155
               default = the length of the slice
  Arg  [4]   : (optional) int strand 
Graham McVicker's avatar
Graham McVicker committed
156
               1, -1
157
               default = 1
Graham McVicker's avatar
Graham McVicker committed
158 159 160 161
  Example    : $dna = $seq_adptr->fetch_by_Slice_start_end_strand($slice, 1, 
                                                                  1000, -1);
  Description: retrieves from db the sequence for this slice
               uses AssemblyMapper to find the assembly
Graham McVicker's avatar
Graham McVicker committed
162
  Returntype : string 
Graham McVicker's avatar
Graham McVicker committed
163 164
  Exceptions : endBasePair should be less or equal to length of slice 
  Caller     : Bio::EnsEMBL::Slice::seq(), Slice::subseq() 
165
  Status     : Stable
166 167 168

=cut

169 170
sub fetch_by_Slice_start_end_strand {
   my ( $self, $slice, $start, $end, $strand ) = @_;
171

Ian Longden's avatar
Ian Longden committed
172
   if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
173
     throw("Slice argument is required.");
174 175
   }

176
   $start = 1 if(!defined($start));
177

178
 
179
   if ( ( !defined($end) || $start > $end || $start < 0 || $end < 0 || $slice->start> $slice->end ) && $slice->is_circular ) {
180 181 182
         
       if ( !defined($end) || ($start > $end ) ) {
	   return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $start, $end, $strand );
Eugene Kulesha's avatar
Eugene Kulesha committed
183 184
       }

185 186 187 188
       if ( defined($end) && ($end < 0) ) {
	   $end += $slice->seq_region_length;
       }
       
Eugene Kulesha's avatar
Eugene Kulesha committed
189
       if ($start < 0) {
190
           $start += $slice->seq_region_length;
Eugene Kulesha's avatar
Eugene Kulesha committed
191
       }
192

193
       if($slice->start> $slice->end) {
194
           return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $slice->start, $slice->end, $strand );
Eugene Kulesha's avatar
Eugene Kulesha committed
195
       }
196 197 198 199 200
  }
        
  if ( ( !defined($end) ) && (not $slice->is_circular) ) {
           $end = $slice->end() - $slice->start() + 1;
  }
201

202 203 204
  if ( $start > $end ) {
      throw("Start must be less than or equal to end.");
  }
205 206 207 208 209 210

   $strand ||= 1;

   #get a new slice that spans the exact region to retrieve dna from
   my $right_expand  = $end - $slice->length(); #negative is fine
   my $left_expand   = 1 - $start; #negative is fine
211

212 213
   if($right_expand || $left_expand) {
     $slice = $slice->expand($left_expand, $right_expand);
214
   }
215 216 217 218 219 220 221 222 223

   #retrieve normalized 'non-symlinked' slices
   #this allows us to support haplotypes and PARs
   my $slice_adaptor = $slice->adaptor();
   my @symproj=@{$slice_adaptor->fetch_normalized_slice_projection($slice)};

   if(@symproj == 0) {
     throw('Could not retrieve normalized Slices. Database contains ' .
           'incorrect assembly_exception information.');
224
   }
225 226 227 228 229 230 231 232 233 234 235 236

   #call this method again with any slices that were 'symlinked' to by this
   #slice
   if(@symproj != 1 || $symproj[0]->[2] != $slice) {
     my $seq;
     foreach my $segment (@symproj) {
       my $symlink_slice = $segment->[2];
       #get sequence from each symlinked area
       $seq .= ${$self->fetch_by_Slice_start_end_strand($symlink_slice,
                                                        1,undef,1)};
     }
     if($strand == -1) {
237
       reverse_comp(\$seq);
238 239
     }
     return \$seq;
240
   }
241

242 243 244 245 246
   # we need to project this slice onto the sequence coordinate system
   # even if the slice is in the same coord system, we want to trim out
   # flanking gaps (if the slice is past the edges of the seqregion)
   my $csa = $self->db->get_CoordSystemAdaptor();
   my $seqlevel = $csa->fetch_sequence_level();
247

248
   my @projection=@{$slice->project($seqlevel->name(), $seqlevel->version())};
249

250 251 252
   my $seq = '';
   my $total = 0;
   my $tmp_seq;
253

254 255 256
   #fetch sequence from each of the sequence regions projected onto
   foreach my $segment (@projection) {
     my ($start, $end, $seq_slice) = @$segment;
257

258 259 260 261 262
     #check for gaps between segments and pad them with Ns
     my $gap = $start - $total - 1;
     if($gap) {
       $seq .= 'N' x $gap;
     }
263

264 265 266 267
     my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);

     $tmp_seq = ${$self->_fetch_seq($seq_region_id,
                                    $seq_slice->start, $seq_slice->length())};
268

269 270
     #reverse compliment on negatively oriented slices
     if($seq_slice->strand == -1) {
271
       reverse_comp(\$tmp_seq);
272
     }
273 274 275 276

     $seq .= $tmp_seq;

     $total = $end;
277
   }
278 279 280 281 282 283 284 285 286 287 288 289 290 291

   #check for any remaining gaps at the end
   my $gap = $slice->length - $total;
   if($gap) {
     $seq .= 'N' x $gap;
   }

   #if the sequence is too short it is because we came in with a seqlevel
   #slice that was partially off of the seq_region.  Pad the end with Ns
   #to make long enough
   if(length($seq) != $slice->length()) {
     $seq .= 'N' x ($slice->length() - length($seq));
   }

292 293 294 295
   if(defined($self->{_rna_edits_cache}) and defined($self->{_rna_edits_cache}->{$slice->get_seq_region_id})){
     $self->_rna_edit($slice,\$seq);
   }

296
   #if they asked for the negative slice strand revcomp the whole thing
297
   reverse_comp(\$seq) if($strand == -1);
298 299

   return \$seq;
300 301 302
}


303 304 305 306
sub _fetch_by_Slice_start_end_strand_circular {
  my ( $self, $slice, $start, $end, $strand ) = @_;

  assert_ref( $slice, 'Bio::EnsEMBL::Slice' );
307
  
308 309 310 311 312 313
  $strand ||= 1;
  if ( !defined($start) ) {
    $start ||= 1;
  }

  if ( !defined($end) ) {
Eugene Kulesha's avatar
Eugene Kulesha committed
314
      $end = $slice->end() - $slice->start() + 1;
315 316 317
  }

  if ( $start > $end && $slice->is_circular() ) {
318
    my ($seq, $seq1, $seq2);
319

320 321 322
    my $midpoint = $slice->seq_region_length - $slice->start + 1;
    $seq1 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, 1,  $midpoint, 1 )};
    $seq2 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, $midpoint + 1, $slice->length(), 1 )};
Eugene Kulesha's avatar
Eugene Kulesha committed
323

324
    $seq = $slice->strand > 0 ? "$seq1$seq2" : "$seq2$seq1";
325 326 327 328 329 330

    reverse_comp( \$seq ) if ( $strand == -1 );

    return \$seq;
  }

331 332


333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439
  # Get a new slice that spans the exact region to retrieve dna from
  my $right_expand = $end - $slice->length();    #negative is fine
  my $left_expand  = 1 - $start;                 #negative is fine

  if ( $right_expand || $left_expand ) {
    $slice =
        $slice->strand > 0
      ? $slice->expand( $left_expand,  $right_expand )
      : $slice->expand( $right_expand, $left_expand );
  }

  # Retrieve normalized 'non-symlinked' slices.  This allows us to
  # support haplotypes and PARs.
  my $slice_adaptor = $slice->adaptor();
  my @symproj =
    @{ $slice_adaptor->fetch_normalized_slice_projection($slice) };

  if ( @symproj == 0 ) {
    throw(   'Could not retrieve normalized Slices. Database contains '
           . 'incorrect assembly_exception information.' );
  }

  # Call this method again with any slices that were 'symlinked' to by
  # this slice.
  if ( @symproj != 1 || $symproj[0]->[2] != $slice ) {
    my $seq;
    foreach my $segment (@symproj) {
      my $symlink_slice = $segment->[2];

      # Get sequence from each symlinked area.
      $seq .= ${
        $self->fetch_by_Slice_start_end_strand( $symlink_slice, 1,
                                                undef, 1 ) };
    }
    if ( $strand == -1 ) {
      reverse_comp( \$seq );
    }

    return \$seq;
  }

  # We need to project this slice onto the sequence coordinate system
  # even if the slice is in the same coord system, we want to trim out
  # flanking gaps (if the slice is past the edges of the seqregion).
  my $csa      = $self->db->get_CoordSystemAdaptor();
  my $seqlevel = $csa->fetch_sequence_level();

  my @projection =
    @{ $slice->project( $seqlevel->name(), $seqlevel->version() ) };

  my $seq   = '';
  my $total = 0;
  my $tmp_seq;

  # Fetch sequence from each of the sequence regions projected onto.
  foreach my $segment (@projection) {
    my ( $start, $end, $seq_slice ) = @{$segment};

    # Check for gaps between segments and pad them with Ns
    my $gap = $start - $total - 1;
    if ($gap) {
      $seq .= 'N' x $gap;
    }

    my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);

    $tmp_seq = ${
      $self->_fetch_seq( $seq_region_id, $seq_slice->start(),
                         $seq_slice->length() ) };

    # Reverse compliment on negatively oriented slices.
    if ( $seq_slice->strand == -1 ) {
      reverse_comp( \$tmp_seq );
    }

    $seq .= $tmp_seq;

    $total = $end;
  }

  # Check for any remaining gaps at the end.
  my $gap = $slice->length() - $total;

  if ($gap) {
    $seq .= 'N' x $gap;
  }

  # If the sequence is too short it is because we came in with a
  # seqlevel slice that was partially off of the seq_region.  Pad the
  # end with Ns to make long enough
  if ( length($seq) != $slice->length() ) {
    $seq .= 'N' x ( $slice->length() - length($seq) );
  }

  if ( defined( $self->{_rna_edits_cache} )
       && defined(
            $self->{_rna_edits_cache}->{ $slice->get_seq_region_id } ) )
  {
    $self->_rna_edit( $slice, \$seq );
  }

  return \$seq;
} ## end sub _fetch_by_Slice_start_end_strand_circular




440

441 442 443 444 445
sub _rna_edit {
  my $self  = shift;
  my $slice = shift;
  my $seq   = shift; #reference to string

446 447
  my $s_start = $slice->start;   #substr start at 0 , but seq starts at 1 (so no -1 here)
  my $s_end = $s_start+length($$seq);
448 449 450

  foreach my $edit (@{$self->{_rna_edits_cache}->{$slice->get_seq_region_id}}){
    my ($start, $end, $txt) = split (/\s+/, $edit);
451 452 453 454
# check that RNA edit is not outside the requested region : happens quite often with LRG regions
    next if ($end < $s_start);
    next if ($s_end < $start);
    substr($$seq,$start-$s_start, ($end-$start)+1, $txt);
455 456 457 458
  }
  return;
}

459

460 461 462 463 464 465
sub _fetch_seq {
  my $self          = shift;
  my $seq_region_id = shift;
  my $start         = shift;
  my $length           = shift;

466 467 468 469 470 471 472 473 474 475 476 477 478 479 480
  my $cache = $self->{'seq_cache'};

  if($length < $SEQ_CACHE_MAX) {
    my $chunk_min = ($start-1) >> $SEQ_CHUNK_PWR;
    my $chunk_max = ($start + $length - 1) >> $SEQ_CHUNK_PWR;

    # piece together sequence from cached component parts

    my $entire_seq = undef;
    for(my $i = $chunk_min; $i <= $chunk_max; $i++) {
      if($cache->{"$seq_region_id:$i"}) {
        $entire_seq .= $cache->{"$seq_region_id:$i"};
      } else {
        # retrieve uncached portions of the sequence

481 482 483 484
        my $sth =
          $self->prepare(   "SELECT SUBSTRING(d.sequence, ?, ?) "
                          . "FROM dna d "
                          . "WHERE d.seq_region_id = ?" );
485 486 487 488

        my $tmp_seq;

        my $min = ($i << $SEQ_CHUNK_PWR) + 1;
489

490 491 492
        $sth->bind_param( 1, $min,                SQL_INTEGER );
        $sth->bind_param( 2, 1 << $SEQ_CHUNK_PWR, SQL_INTEGER );
        $sth->bind_param( 3, $seq_region_id,      SQL_INTEGER );
493 494

        $sth->execute();
495 496 497
        $sth->bind_columns(\$tmp_seq);
        $sth->fetch();
        $sth->finish();
498

499 500
        # always give back uppercased sequence so it can be properly softmasked
        $entire_seq .= uc($tmp_seq);
501
        $cache->{"$seq_region_id:$i"} = uc($tmp_seq);
502 503 504 505
      }
    }

    # return only the requested portion of the entire sequence
506
    my $min = ( $chunk_min << $SEQ_CHUNK_PWR ) + 1;
507
    # my $max = ( $chunk_max + 1 ) << $SEQ_CHUNK_PWR;
508
    my $seq = substr( $entire_seq, $start - $min, $length );
509 510 511 512 513

    return \$seq;
  } else {

    # do not do any caching for requests of very large sequences
514 515 516 517
    my $sth =
      $self->prepare(   "SELECT SUBSTRING(d.sequence, ?, ?) "
                      . "FROM dna d "
                      . "WHERE d.seq_region_id = ?" );
518 519

    my $tmp_seq;
520

521 522 523
    $sth->bind_param( 1, $start,         SQL_INTEGER );
    $sth->bind_param( 2, $length,        SQL_INTEGER );
    $sth->bind_param( 3, $seq_region_id, SQL_INTEGER );
524 525

    $sth->execute();
526 527 528 529
    $sth->bind_columns(\$tmp_seq);
    $sth->fetch();
    $sth->finish();

530 531 532
    # always give back uppercased sequence so it can be properly softmasked
    $tmp_seq = uc($tmp_seq);

533 534
    return \$tmp_seq;
  }
535 536 537
}


538 539
=head2 store

540 541 542 543 544 545
  Arg [1]    : int $seq_region_id the id of the sequence region this dna
               will be associated with.
  Arg [2]    : string $sequence the dna sequence to be stored 
               in the database.  Note that the sequence passed in will be
               converted to uppercase.
  Example    : $seq_adaptor->store(11, 'ACTGGGTACCAAACAAACACAACA');
546 547
  Description: stores a dna sequence in the databases dna table and returns the
               database identifier for the new record.
548 549 550
  Returntype : none
  Exceptions : throw if the database insert fails
  Caller     : sequence loading scripts
551
  Status     : Stable
552 553 554 555

=cut

sub store {
556
  my ($self, $seq_region_id, $sequence) = @_;
557

558 559
  if(!$seq_region_id) {
    throw('seq_region_id is required');
560
  }
561

562
  $sequence = uc($sequence);
563

564 565
  my $statement = 
    $self->prepare("INSERT INTO dna(seq_region_id, sequence) VALUES(?,?)");
566

567 568 569
  $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
  $statement->bind_param(2,$sequence,SQL_LONGVARCHAR);
  $statement->execute();
570

571
  $statement->finish();
572

573
  return;
574 575
}

576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607
=head2 remove

  Arg [1]    : int $seq_region_id the id of the sequence region this dna
               is associated with.
  Example    : $seq_adaptor->remove(11);
  Description: removes a dna sequence for a given seq_region_id
  Returntype : none
  Exceptions : throw if the database delete fails
  Caller     : Internal
  Status     : Stable

=cut

sub remove {
  my ($self, $seq_region_id) = @_;

  if(!$seq_region_id) {
    throw('seq_region_id is required');
  }

  my $statement =
    $self->prepare("DELETE FROM dna WHERE seq_region_id = ?");

  $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
  $statement->execute();

  $statement->finish();

  return;
}


608 609 610



611
=head2 fetch_by_assembly_location
612

613
  Description: DEPRECATED use fetch_by_Slice_start_end_strand() instead.
614

615
=cut
616

617 618 619
sub fetch_by_assembly_location {
   my ( $self, $chrStart, $chrEnd, 
        $strand, $chrName, $assemblyType ) = @_;
620

621 622
   deprecate('Use fetch_by_Slice_start_end_strand() instead');

623 624 625
   my $csa = $self->db->get_CoordSystem();
   my $top_cs = @{$csa->fetch_all};

626
   my $slice_adaptor = $self->db->get_SliceAdaptor();
627
   my $slice = $slice_adaptor->fetch_by_region($top_cs->name(), $chrName,
628
                                               $chrStart, $chrEnd,
629
                                               $strand, $top_cs->version);
630

631
   return $self->fetch_by_Slice_start_end_strand($slice,1, $slice->length,1);
632 633
}

634

635
=head2 fetch_by_RawContig_start_end_strand
636

637
  Description: DEPRECATED use fetch_by_Slice_start_end_strand instead
638 639 640

=cut

641 642 643
sub fetch_by_RawContig_start_end_strand {
  deprecate('Use fetch_by_Slice_start_end_strand instead.');
  fetch_by_Slice_start_end_strand(@_);
644 645 646
}


647 648


649
1;