SequenceAdaptor.pm 16.4 KB
Newer Older
1
2
=head1 LICENSE

3
  Copyright (c) 1999-2010 The European Bioinformatics Institute and
4
5
6
7
8
9
10
11
12
13
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
14
  developers list at <dev@ensembl.org>.
15
16
17
18
19

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut
20
21
22

=head1 NAME

Graham McVicker's avatar
Graham McVicker committed
23
Bio::EnsEMBL::DBSQL::SequenceAdaptor - produce sequence strings from locations
24
25
26

=head1 SYNOPSIS

27
28
29
30
  my $sa = $registry->get_adaptor( 'Human', 'Core', 'Sequence' );

  my $dna =
    ${ $sa->fetch_by_Slice_start_end_strand( $slice, 1, 1000, -1 ) };
31
32
33

=head1 DESCRIPTION

34
An adaptor for the retrieval of DNA sequence from the EnsEMBL database
35

36
=head1 METHODS
37
38
39
40

=cut

package Bio::EnsEMBL::DBSQL::SequenceAdaptor;
41

42
use vars qw(@ISA @EXPORT);
43
use strict;
44
use warnings;
45

46
use Bio::EnsEMBL::DBSQL::BaseAdaptor;
47
use Bio::EnsEMBL::Utils::Exception qw(throw deprecate);
48
use Bio::EnsEMBL::Utils::Sequence  qw(reverse_comp);
49
use Bio::EnsEMBL::Utils::Cache;
50
use Bio::EnsEMBL::Utils::Scalar qw( assert_ref );
51
52
53

@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);

54
55
our $SEQ_CHUNK_PWR   = 18; # 2^18 = approx. 250KB
our $SEQ_CACHE_SZ    = 5;
56
our $SEQ_CACHE_MAX   = (2 ** $SEQ_CHUNK_PWR) * $SEQ_CACHE_SZ;
57

58
@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
59
60
61
62
63
64
65
66
67
68

=head2 new

  Arg [1]    : none
  Example    : my $sa = $db_adaptor->get_SequenceAdaptor();
  Description: Constructor.  Calls superclass constructor and initialises
               internal cache structure.
  Returntype : Bio::EnsEMBL:DBSQL::SequenceAdaptor
  Exceptions : none
  Caller     : DBAdaptor::get_SequenceAdaptor
69
  Status     : Stable
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85

=cut

sub new {
  my $caller = shift;

  my $class = ref($caller) || $caller;

  my $self = $class->SUPER::new(@_);

  # use an LRU cache to limit the size
  my %seq_cache;
  tie(%seq_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_CACHE_SZ);

  $self->{'seq_cache'} = \%seq_cache;

86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

#
# See if this has any seq_region_attrib of type "_rna_edit_cache" if so store these
# in a  hash.
#

  my $sth = $self->dbc->prepare('select sra.seq_region_id, sra.value from seq_region_attrib sra, attrib_type at where sra.attrib_type_id = at.attrib_type_id and code like "_rna_edit"');
  
  $sth->execute();
  my ($seq_region_id, $value);
  $sth->bind_columns(\$seq_region_id, \$value);
  my %edits;
  my $count = 0;
  while($sth->fetch()){
    $count++;
    push @{$edits{$seq_region_id}}, $value;
  }
  $sth->finish;
  if($count){
    $self->{_rna_edits_cache} = \%edits;
  }
  
108
109
110
111
  return $self;
}


112
=head2 fetch_by_Slice_start_end_strand
113

Graham McVicker's avatar
Graham McVicker committed
114
115
  Arg  [1]   : Bio::EnsEMBL::Slice slice
               The slice from which you want the sequence
116
117
118
119
120
121
122
123
  Arg  [2]   : (optional) int startBasePair 
               The start base pair relative to the start of the slice. Negative
               values or values greater than the length of the slice are fine.
               default = 1
  Arg  [3]   : (optional) int endBasePair
               The end base pair relative to the start of the slice. Negative
               values or values greater than the length of the slice are fine,
               but the end must be greater than or equal to the start
Graham McVicker's avatar
Graham McVicker committed
124
               count from 1
125
126
               default = the length of the slice
  Arg  [4]   : (optional) int strand 
Graham McVicker's avatar
Graham McVicker committed
127
               1, -1
128
               default = 1
Graham McVicker's avatar
Graham McVicker committed
129
130
131
132
  Example    : $dna = $seq_adptr->fetch_by_Slice_start_end_strand($slice, 1, 
                                                                  1000, -1);
  Description: retrieves from db the sequence for this slice
               uses AssemblyMapper to find the assembly
Graham McVicker's avatar
Graham McVicker committed
133
  Returntype : string 
Graham McVicker's avatar
Graham McVicker committed
134
135
  Exceptions : endBasePair should be less or equal to length of slice 
  Caller     : Bio::EnsEMBL::Slice::seq(), Slice::subseq() 
136
  Status     : Stable
137
138
139

=cut

140
141
sub fetch_by_Slice_start_end_strand {
   my ( $self, $slice, $start, $end, $strand ) = @_;
142

Ian Longden's avatar
Ian Longden committed
143
   if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
144
     throw("Slice argument is required.");
145
146
   }

147
148
   $start = 1 if(!defined($start));

Eugene Kulesha's avatar
Eugene Kulesha committed
149
150
151
152
153
154
155
156
157
158
159
160
161
162
   if ($slice->is_circular) {
       if ($start > $end ) {
	   return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $start, $end, $strand );
       }

       if ($start < 0) {
	   $start += $slice->seq_region_length;
       }
       if ($end < 0) {
	   $end += $slice->seq_region_length;
       }

       if ( !defined($end) ) {
       }
163
164
165
166
167

       if($slice->start> $slice->end) {
	   return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $slice->start, $slice->end, $strand );
       }

Eugene Kulesha's avatar
Eugene Kulesha committed
168
169
170
171
172
   } else {
       if ( !defined($end) ) {
	   $end = $slice->end() - $slice->start() + 1;
       }
   }
173

174
175
176
  if ( $start > $end ) {
      throw("Start must be less than or equal to end.");
  }
177
178
179
180
181
182

   $strand ||= 1;

   #get a new slice that spans the exact region to retrieve dna from
   my $right_expand  = $end - $slice->length(); #negative is fine
   my $left_expand   = 1 - $start; #negative is fine
183

184
185
   if($right_expand || $left_expand) {
     $slice = $slice->expand($left_expand, $right_expand);
186
   }
187
188
189
190
191
192
193
194
195

   #retrieve normalized 'non-symlinked' slices
   #this allows us to support haplotypes and PARs
   my $slice_adaptor = $slice->adaptor();
   my @symproj=@{$slice_adaptor->fetch_normalized_slice_projection($slice)};

   if(@symproj == 0) {
     throw('Could not retrieve normalized Slices. Database contains ' .
           'incorrect assembly_exception information.');
196
   }
197
198
199
200
201
202
203
204
205
206
207
208

   #call this method again with any slices that were 'symlinked' to by this
   #slice
   if(@symproj != 1 || $symproj[0]->[2] != $slice) {
     my $seq;
     foreach my $segment (@symproj) {
       my $symlink_slice = $segment->[2];
       #get sequence from each symlinked area
       $seq .= ${$self->fetch_by_Slice_start_end_strand($symlink_slice,
                                                        1,undef,1)};
     }
     if($strand == -1) {
209
       reverse_comp(\$seq);
210
211
     }
     return \$seq;
212
   }
213

214
215
216
217
218
   # we need to project this slice onto the sequence coordinate system
   # even if the slice is in the same coord system, we want to trim out
   # flanking gaps (if the slice is past the edges of the seqregion)
   my $csa = $self->db->get_CoordSystemAdaptor();
   my $seqlevel = $csa->fetch_sequence_level();
219

220
   my @projection=@{$slice->project($seqlevel->name(), $seqlevel->version())};
221

222
223
224
   my $seq = '';
   my $total = 0;
   my $tmp_seq;
225

226
227
228
   #fetch sequence from each of the sequence regions projected onto
   foreach my $segment (@projection) {
     my ($start, $end, $seq_slice) = @$segment;
229

230
231
232
233
234
     #check for gaps between segments and pad them with Ns
     my $gap = $start - $total - 1;
     if($gap) {
       $seq .= 'N' x $gap;
     }
235

236
237
238
239
     my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);

     $tmp_seq = ${$self->_fetch_seq($seq_region_id,
                                    $seq_slice->start, $seq_slice->length())};
240

241
242
     #reverse compliment on negatively oriented slices
     if($seq_slice->strand == -1) {
243
       reverse_comp(\$tmp_seq);
244
     }
245
246
247
248

     $seq .= $tmp_seq;

     $total = $end;
249
   }
250
251
252
253
254
255
256
257
258
259
260
261
262
263

   #check for any remaining gaps at the end
   my $gap = $slice->length - $total;
   if($gap) {
     $seq .= 'N' x $gap;
   }

   #if the sequence is too short it is because we came in with a seqlevel
   #slice that was partially off of the seq_region.  Pad the end with Ns
   #to make long enough
   if(length($seq) != $slice->length()) {
     $seq .= 'N' x ($slice->length() - length($seq));
   }

264
265
266
267
   if(defined($self->{_rna_edits_cache}) and defined($self->{_rna_edits_cache}->{$slice->get_seq_region_id})){
     $self->_rna_edit($slice,\$seq);
   }

268
   #if they asked for the negative slice strand revcomp the whole thing
269
   reverse_comp(\$seq) if($strand == -1);
270
271

   return \$seq;
272
273
274
}


275
276
277
278
sub _fetch_by_Slice_start_end_strand_circular {
  my ( $self, $slice, $start, $end, $strand ) = @_;

  assert_ref( $slice, 'Bio::EnsEMBL::Slice' );
279
  
280
281
282
283
284
285
  $strand ||= 1;
  if ( !defined($start) ) {
    $start ||= 1;
  }

  if ( !defined($end) ) {
Eugene Kulesha's avatar
Eugene Kulesha committed
286
      $end = $slice->end() - $slice->start() + 1;
287
288
289
  }

  if ( $start > $end && $slice->is_circular() ) {
290
    my ($seq, $seq1, $seq2);
291

292
293
294
    my $midpoint = $slice->seq_region_length - $slice->start + 1;
    $seq1 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, 1,  $midpoint, 1 )};
    $seq2 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, $midpoint + 1, $slice->length(), 1 )};
Eugene Kulesha's avatar
Eugene Kulesha committed
295

296
    $seq = $slice->strand > 0 ? "$seq1$seq2" : "$seq2$seq1";
297
298
299
300
301
302

    reverse_comp( \$seq ) if ( $strand == -1 );

    return \$seq;
  }

303
304


305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
  # Get a new slice that spans the exact region to retrieve dna from
  my $right_expand = $end - $slice->length();    #negative is fine
  my $left_expand  = 1 - $start;                 #negative is fine

  if ( $right_expand || $left_expand ) {
    $slice =
        $slice->strand > 0
      ? $slice->expand( $left_expand,  $right_expand )
      : $slice->expand( $right_expand, $left_expand );
  }

  # Retrieve normalized 'non-symlinked' slices.  This allows us to
  # support haplotypes and PARs.
  my $slice_adaptor = $slice->adaptor();
  my @symproj =
    @{ $slice_adaptor->fetch_normalized_slice_projection($slice) };

  if ( @symproj == 0 ) {
    throw(   'Could not retrieve normalized Slices. Database contains '
           . 'incorrect assembly_exception information.' );
  }

  # Call this method again with any slices that were 'symlinked' to by
  # this slice.
  if ( @symproj != 1 || $symproj[0]->[2] != $slice ) {
    my $seq;
    foreach my $segment (@symproj) {
      my $symlink_slice = $segment->[2];

      # Get sequence from each symlinked area.
      $seq .= ${
        $self->fetch_by_Slice_start_end_strand( $symlink_slice, 1,
                                                undef, 1 ) };
    }
    if ( $strand == -1 ) {
      reverse_comp( \$seq );
    }

    return \$seq;
  }

  # We need to project this slice onto the sequence coordinate system
  # even if the slice is in the same coord system, we want to trim out
  # flanking gaps (if the slice is past the edges of the seqregion).
  my $csa      = $self->db->get_CoordSystemAdaptor();
  my $seqlevel = $csa->fetch_sequence_level();

  my @projection =
    @{ $slice->project( $seqlevel->name(), $seqlevel->version() ) };

  my $seq   = '';
  my $total = 0;
  my $tmp_seq;

  # Fetch sequence from each of the sequence regions projected onto.
  foreach my $segment (@projection) {
    my ( $start, $end, $seq_slice ) = @{$segment};

    # Check for gaps between segments and pad them with Ns
    my $gap = $start - $total - 1;
    if ($gap) {
      $seq .= 'N' x $gap;
    }

    my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);

    $tmp_seq = ${
      $self->_fetch_seq( $seq_region_id, $seq_slice->start(),
                         $seq_slice->length() ) };

    # Reverse compliment on negatively oriented slices.
    if ( $seq_slice->strand == -1 ) {
      reverse_comp( \$tmp_seq );
    }

    $seq .= $tmp_seq;

    $total = $end;
  }

  # Check for any remaining gaps at the end.
  my $gap = $slice->length() - $total;

  if ($gap) {
    $seq .= 'N' x $gap;
  }

  # If the sequence is too short it is because we came in with a
  # seqlevel slice that was partially off of the seq_region.  Pad the
  # end with Ns to make long enough
  if ( length($seq) != $slice->length() ) {
    $seq .= 'N' x ( $slice->length() - length($seq) );
  }

  if ( defined( $self->{_rna_edits_cache} )
       && defined(
            $self->{_rna_edits_cache}->{ $slice->get_seq_region_id } ) )
  {
    $self->_rna_edit( $slice, \$seq );
  }

  return \$seq;
} ## end sub _fetch_by_Slice_start_end_strand_circular




412

413
414
415
416
417
sub _rna_edit {
  my $self  = shift;
  my $slice = shift;
  my $seq   = shift; #reference to string

418
419
  my $s_start = $slice->start;   #substr start at 0 , but seq starts at 1 (so no -1 here)
  my $s_end = $s_start+length($$seq);
420
421
422

  foreach my $edit (@{$self->{_rna_edits_cache}->{$slice->get_seq_region_id}}){
    my ($start, $end, $txt) = split (/\s+/, $edit);
423
424
425
426
# check that RNA edit is not outside the requested region : happens quite often with LRG regions
    next if ($end < $s_start);
    next if ($s_end < $start);
    substr($$seq,$start-$s_start, ($end-$start)+1, $txt);
427
428
429
430
431
  }
  return;
}


432
433
434
435
436
437
sub _fetch_seq {
  my $self          = shift;
  my $seq_region_id = shift;
  my $start         = shift;
  my $length           = shift;

438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
  my $cache = $self->{'seq_cache'};

  if($length < $SEQ_CACHE_MAX) {
    my $chunk_min = ($start-1) >> $SEQ_CHUNK_PWR;
    my $chunk_max = ($start + $length - 1) >> $SEQ_CHUNK_PWR;

    # piece together sequence from cached component parts

    my $entire_seq = undef;
    for(my $i = $chunk_min; $i <= $chunk_max; $i++) {
      if($cache->{"$seq_region_id:$i"}) {
        $entire_seq .= $cache->{"$seq_region_id:$i"};
      } else {
        # retrieve uncached portions of the sequence

453
454
455
456
        my $sth =
          $self->prepare(   "SELECT SUBSTRING(d.sequence, ?, ?) "
                          . "FROM dna d "
                          . "WHERE d.seq_region_id = ?" );
457
458
459
460

        my $tmp_seq;

        my $min = ($i << $SEQ_CHUNK_PWR) + 1;
461

462
463
464
        $sth->bind_param( 1, $min,                SQL_INTEGER );
        $sth->bind_param( 2, 1 << $SEQ_CHUNK_PWR, SQL_INTEGER );
        $sth->bind_param( 3, $seq_region_id,      SQL_INTEGER );
465
466

        $sth->execute();
467
468
469
        $sth->bind_columns(\$tmp_seq);
        $sth->fetch();
        $sth->finish();
470

471
472
        # always give back uppercased sequence so it can be properly softmasked
        $entire_seq .= uc($tmp_seq);
473
474
475
476
477
        $cache->{"$seq_region_id:$i"} = $tmp_seq;
      }
    }

    # return only the requested portion of the entire sequence
478
    my $min = ( $chunk_min << $SEQ_CHUNK_PWR ) + 1;
479
    # my $max = ( $chunk_max + 1 ) << $SEQ_CHUNK_PWR;
480
    my $seq = substr( $entire_seq, $start - $min, $length );
481
482
483
484
485

    return \$seq;
  } else {

    # do not do any caching for requests of very large sequences
486
487
488
489
    my $sth =
      $self->prepare(   "SELECT SUBSTRING(d.sequence, ?, ?) "
                      . "FROM dna d "
                      . "WHERE d.seq_region_id = ?" );
490
491

    my $tmp_seq;
492

493
494
495
    $sth->bind_param( 1, $start,         SQL_INTEGER );
    $sth->bind_param( 2, $length,        SQL_INTEGER );
    $sth->bind_param( 3, $seq_region_id, SQL_INTEGER );
496
497

    $sth->execute();
498
499
500
501
    $sth->bind_columns(\$tmp_seq);
    $sth->fetch();
    $sth->finish();

502
503
504
    # always give back uppercased sequence so it can be properly softmasked
    $tmp_seq = uc($tmp_seq);

505
506
    return \$tmp_seq;
  }
507
508
509
}


510
511
=head2 store

512
513
514
515
516
517
  Arg [1]    : int $seq_region_id the id of the sequence region this dna
               will be associated with.
  Arg [2]    : string $sequence the dna sequence to be stored 
               in the database.  Note that the sequence passed in will be
               converted to uppercase.
  Example    : $seq_adaptor->store(11, 'ACTGGGTACCAAACAAACACAACA');
518
519
  Description: stores a dna sequence in the databases dna table and returns the
               database identifier for the new record.
520
521
522
  Returntype : none
  Exceptions : throw if the database insert fails
  Caller     : sequence loading scripts
523
  Status     : Stable
524
525
526
527

=cut

sub store {
528
  my ($self, $seq_region_id, $sequence) = @_;
529

530
531
  if(!$seq_region_id) {
    throw('seq_region_id is required');
532
  }
533

534
  $sequence = uc($sequence);
535

536
537
  my $statement = 
    $self->prepare("INSERT INTO dna(seq_region_id, sequence) VALUES(?,?)");
538

539
540
541
  $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
  $statement->bind_param(2,$sequence,SQL_LONGVARCHAR);
  $statement->execute();
542

543
  $statement->finish();
544

545
  return;
546
547
548
549
550
}




551
=head2 fetch_by_assembly_location
552

553
  Description: DEPRECATED use fetch_by_Slice_start_end_strand() instead.
554

555
=cut
556

557
558
559
sub fetch_by_assembly_location {
   my ( $self, $chrStart, $chrEnd, 
        $strand, $chrName, $assemblyType ) = @_;
560

561
562
   deprecate('Use fetch_by_Slice_start_end_strand() instead');

563
564
565
   my $csa = $self->db->get_CoordSystem();
   my $top_cs = @{$csa->fetch_all};

566
   my $slice_adaptor = $self->db->get_SliceAdaptor();
567
   my $slice = $slice_adaptor->fetch_by_region($top_cs->name(), $chrName,
568
                                               $chrStart, $chrEnd,
569
                                               $strand, $top_cs->version);
570

571
   return $self->fetch_by_Slice_start_end_strand($slice,1, $slice->length,1);
572
573
}

574

575
=head2 fetch_by_RawContig_start_end_strand
576

577
  Description: DEPRECATED use fetch_by_Slice_start_end_strand instead
578
579
580

=cut

581
582
583
sub fetch_by_RawContig_start_end_strand {
  deprecate('Use fetch_by_Slice_start_end_strand instead.');
  fetch_by_Slice_start_end_strand(@_);
584
585
586
}


587
588


589
1;