SequenceAdaptor.pm 12 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
=head1 LICENSE

  Copyright (c) 1999-2009 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <ensembl-dev@ebi.ac.uk>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut
20 21 22

=head1 NAME

Graham McVicker's avatar
Graham McVicker committed
23
Bio::EnsEMBL::DBSQL::SequenceAdaptor - produce sequence strings from locations
24 25 26

=head1 SYNOPSIS

27 28 29 30
  my $sa = $registry->get_adaptor( 'Human', 'Core', 'Sequence' );

  my $dna =
    ${ $sa->fetch_by_Slice_start_end_strand( $slice, 1, 1000, -1 ) };
31 32 33

=head1 DESCRIPTION

34
An adaptor for the retrieval of DNA sequence from the EnsEMBL database
35

36
=head1 METHODS
37 38 39 40

=cut

package Bio::EnsEMBL::DBSQL::SequenceAdaptor;
41

42
use vars qw(@ISA @EXPORT);
43
use strict;
44
use warnings;
45

46
use Bio::EnsEMBL::DBSQL::BaseAdaptor;
47
use Bio::EnsEMBL::Utils::Exception qw(throw deprecate);
48
use Bio::EnsEMBL::Utils::Sequence  qw(reverse_comp);
49
use Bio::EnsEMBL::Utils::Cache;
50 51 52

@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);

53 54
our $SEQ_CHUNK_PWR   = 18; # 2^18 = approx. 250KB
our $SEQ_CACHE_SZ    = 5;
55
our $SEQ_CACHE_MAX   = (2 ** $SEQ_CHUNK_PWR) * $SEQ_CACHE_SZ;
56

57
@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
58 59 60 61 62 63 64 65 66 67

=head2 new

  Arg [1]    : none
  Example    : my $sa = $db_adaptor->get_SequenceAdaptor();
  Description: Constructor.  Calls superclass constructor and initialises
               internal cache structure.
  Returntype : Bio::EnsEMBL:DBSQL::SequenceAdaptor
  Exceptions : none
  Caller     : DBAdaptor::get_SequenceAdaptor
68
  Status     : Stable
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84

=cut

sub new {
  my $caller = shift;

  my $class = ref($caller) || $caller;

  my $self = $class->SUPER::new(@_);

  # use an LRU cache to limit the size
  my %seq_cache;
  tie(%seq_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_CACHE_SZ);

  $self->{'seq_cache'} = \%seq_cache;

85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106

#
# See if this has any seq_region_attrib of type "_rna_edit_cache" if so store these
# in a  hash.
#

  my $sth = $self->dbc->prepare('select sra.seq_region_id, sra.value from seq_region_attrib sra, attrib_type at where sra.attrib_type_id = at.attrib_type_id and code like "_rna_edit"');
  
  $sth->execute();
  my ($seq_region_id, $value);
  $sth->bind_columns(\$seq_region_id, \$value);
  my %edits;
  my $count = 0;
  while($sth->fetch()){
    $count++;
    push @{$edits{$seq_region_id}}, $value;
  }
  $sth->finish;
  if($count){
    $self->{_rna_edits_cache} = \%edits;
  }
  
107 108 109 110
  return $self;
}


111
=head2 fetch_by_Slice_start_end_strand
112

Graham McVicker's avatar
Graham McVicker committed
113 114
  Arg  [1]   : Bio::EnsEMBL::Slice slice
               The slice from which you want the sequence
115 116 117 118 119 120 121 122
  Arg  [2]   : (optional) int startBasePair 
               The start base pair relative to the start of the slice. Negative
               values or values greater than the length of the slice are fine.
               default = 1
  Arg  [3]   : (optional) int endBasePair
               The end base pair relative to the start of the slice. Negative
               values or values greater than the length of the slice are fine,
               but the end must be greater than or equal to the start
Graham McVicker's avatar
Graham McVicker committed
123
               count from 1
124 125
               default = the length of the slice
  Arg  [4]   : (optional) int strand 
Graham McVicker's avatar
Graham McVicker committed
126
               1, -1
127
               default = 1
Graham McVicker's avatar
Graham McVicker committed
128 129 130 131
  Example    : $dna = $seq_adptr->fetch_by_Slice_start_end_strand($slice, 1, 
                                                                  1000, -1);
  Description: retrieves from db the sequence for this slice
               uses AssemblyMapper to find the assembly
Graham McVicker's avatar
Graham McVicker committed
132
  Returntype : string 
Graham McVicker's avatar
Graham McVicker committed
133 134
  Exceptions : endBasePair should be less or equal to length of slice 
  Caller     : Bio::EnsEMBL::Slice::seq(), Slice::subseq() 
135
  Status     : Stable
136 137 138

=cut

139 140
sub fetch_by_Slice_start_end_strand {
   my ( $self, $slice, $start, $end, $strand ) = @_;
141

142 143
   if(!ref($slice) || !$slice->isa("Bio::EnsEMBL::Slice")) {
     throw("Slice argument is required.");
144 145
   }

146 147 148 149 150 151 152 153 154 155 156
   $start = 1 if(!defined($start));

   $end = $slice->end() - $slice->start() + 1 if(!defined($end));

   throw("Start must be less than or equal to end.") if($start > $end);

   $strand ||= 1;

   #get a new slice that spans the exact region to retrieve dna from
   my $right_expand  = $end - $slice->length(); #negative is fine
   my $left_expand   = 1 - $start; #negative is fine
157

158 159
   if($right_expand || $left_expand) {
     $slice = $slice->expand($left_expand, $right_expand);
160
   }
161 162 163 164 165 166 167 168 169

   #retrieve normalized 'non-symlinked' slices
   #this allows us to support haplotypes and PARs
   my $slice_adaptor = $slice->adaptor();
   my @symproj=@{$slice_adaptor->fetch_normalized_slice_projection($slice)};

   if(@symproj == 0) {
     throw('Could not retrieve normalized Slices. Database contains ' .
           'incorrect assembly_exception information.');
170
   }
171 172 173 174 175 176 177 178 179 180 181 182

   #call this method again with any slices that were 'symlinked' to by this
   #slice
   if(@symproj != 1 || $symproj[0]->[2] != $slice) {
     my $seq;
     foreach my $segment (@symproj) {
       my $symlink_slice = $segment->[2];
       #get sequence from each symlinked area
       $seq .= ${$self->fetch_by_Slice_start_end_strand($symlink_slice,
                                                        1,undef,1)};
     }
     if($strand == -1) {
183
       reverse_comp(\$seq);
184 185
     }
     return \$seq;
186
   }
187

188 189 190 191 192
   # we need to project this slice onto the sequence coordinate system
   # even if the slice is in the same coord system, we want to trim out
   # flanking gaps (if the slice is past the edges of the seqregion)
   my $csa = $self->db->get_CoordSystemAdaptor();
   my $seqlevel = $csa->fetch_sequence_level();
193

194
   my @projection=@{$slice->project($seqlevel->name(), $seqlevel->version())};
195

196 197 198
   my $seq = '';
   my $total = 0;
   my $tmp_seq;
199

200 201 202
   #fetch sequence from each of the sequence regions projected onto
   foreach my $segment (@projection) {
     my ($start, $end, $seq_slice) = @$segment;
203

204 205 206 207 208
     #check for gaps between segments and pad them with Ns
     my $gap = $start - $total - 1;
     if($gap) {
       $seq .= 'N' x $gap;
     }
209

210 211 212 213
     my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);

     $tmp_seq = ${$self->_fetch_seq($seq_region_id,
                                    $seq_slice->start, $seq_slice->length())};
214

215 216
     #reverse compliment on negatively oriented slices
     if($seq_slice->strand == -1) {
217
       reverse_comp(\$tmp_seq);
218
     }
219 220 221 222

     $seq .= $tmp_seq;

     $total = $end;
223
   }
224 225 226 227 228 229 230 231 232 233 234 235 236 237

   #check for any remaining gaps at the end
   my $gap = $slice->length - $total;
   if($gap) {
     $seq .= 'N' x $gap;
   }

   #if the sequence is too short it is because we came in with a seqlevel
   #slice that was partially off of the seq_region.  Pad the end with Ns
   #to make long enough
   if(length($seq) != $slice->length()) {
     $seq .= 'N' x ($slice->length() - length($seq));
   }

238 239 240 241
   if(defined($self->{_rna_edits_cache}) and defined($self->{_rna_edits_cache}->{$slice->get_seq_region_id})){
     $self->_rna_edit($slice,\$seq);
   }

242
   #if they asked for the negative slice strand revcomp the whole thing
243
   reverse_comp(\$seq) if($strand == -1);
244 245

   return \$seq;
246 247 248 249
}



250 251 252 253 254
sub _rna_edit {
  my $self  = shift;
  my $slice = shift;
  my $seq   = shift; #reference to string

255 256
  my $s_start = $slice->start;   #substr start at 0 , but seq starts at 1 (so no -1 here)
  my $s_end = $s_start+length($$seq);
257 258 259

  foreach my $edit (@{$self->{_rna_edits_cache}->{$slice->get_seq_region_id}}){
    my ($start, $end, $txt) = split (/\s+/, $edit);
260 261 262 263
# check that RNA edit is not outside the requested region : happens quite often with LRG regions
    next if ($end < $s_start);
    next if ($s_end < $start);
    substr($$seq,$start-$s_start, ($end-$start)+1, $txt);
264 265 266 267 268
  }
  return;
}


269 270 271 272 273 274
sub _fetch_seq {
  my $self          = shift;
  my $seq_region_id = shift;
  my $start         = shift;
  my $length           = shift;

275 276 277 278 279 280 281 282 283 284 285 286 287 288 289
  my $cache = $self->{'seq_cache'};

  if($length < $SEQ_CACHE_MAX) {
    my $chunk_min = ($start-1) >> $SEQ_CHUNK_PWR;
    my $chunk_max = ($start + $length - 1) >> $SEQ_CHUNK_PWR;

    # piece together sequence from cached component parts

    my $entire_seq = undef;
    for(my $i = $chunk_min; $i <= $chunk_max; $i++) {
      if($cache->{"$seq_region_id:$i"}) {
        $entire_seq .= $cache->{"$seq_region_id:$i"};
      } else {
        # retrieve uncached portions of the sequence

290 291 292 293
        my $sth =
          $self->prepare(   "SELECT SUBSTRING(d.sequence, ?, ?) "
                          . "FROM dna d "
                          . "WHERE d.seq_region_id = ?" );
294 295 296 297

        my $tmp_seq;

        my $min = ($i << $SEQ_CHUNK_PWR) + 1;
298

299 300 301
        $sth->bind_param( 1, $min,                SQL_INTEGER );
        $sth->bind_param( 2, 1 << $SEQ_CHUNK_PWR, SQL_INTEGER );
        $sth->bind_param( 3, $seq_region_id,      SQL_INTEGER );
302 303

        $sth->execute();
304 305 306
        $sth->bind_columns(\$tmp_seq);
        $sth->fetch();
        $sth->finish();
307

308 309
        # always give back uppercased sequence so it can be properly softmasked
        $entire_seq .= uc($tmp_seq);
310 311 312 313 314
        $cache->{"$seq_region_id:$i"} = $tmp_seq;
      }
    }

    # return only the requested portion of the entire sequence
315 316 317
    my $min = ( $chunk_min << $SEQ_CHUNK_PWR ) + 1;
    my $max = ( $chunk_max + 1 ) << $SEQ_CHUNK_PWR;
    my $seq = substr( $entire_seq, $start - $min, $length );
318 319 320 321 322

    return \$seq;
  } else {

    # do not do any caching for requests of very large sequences
323 324 325 326
    my $sth =
      $self->prepare(   "SELECT SUBSTRING(d.sequence, ?, ?) "
                      . "FROM dna d "
                      . "WHERE d.seq_region_id = ?" );
327 328

    my $tmp_seq;
329

330 331 332
    $sth->bind_param( 1, $start,         SQL_INTEGER );
    $sth->bind_param( 2, $length,        SQL_INTEGER );
    $sth->bind_param( 3, $seq_region_id, SQL_INTEGER );
333 334

    $sth->execute();
335 336 337 338
    $sth->bind_columns(\$tmp_seq);
    $sth->fetch();
    $sth->finish();

339 340 341
    # always give back uppercased sequence so it can be properly softmasked
    $tmp_seq = uc($tmp_seq);

342 343
    return \$tmp_seq;
  }
344 345 346
}


347 348
=head2 store

349 350 351 352 353 354
  Arg [1]    : int $seq_region_id the id of the sequence region this dna
               will be associated with.
  Arg [2]    : string $sequence the dna sequence to be stored 
               in the database.  Note that the sequence passed in will be
               converted to uppercase.
  Example    : $seq_adaptor->store(11, 'ACTGGGTACCAAACAAACACAACA');
355 356
  Description: stores a dna sequence in the databases dna table and returns the
               database identifier for the new record.
357 358 359
  Returntype : none
  Exceptions : throw if the database insert fails
  Caller     : sequence loading scripts
360
  Status     : Stable
361 362 363 364

=cut

sub store {
365
  my ($self, $seq_region_id, $sequence) = @_;
366

367 368
  if(!$seq_region_id) {
    throw('seq_region_id is required');
369
  }
370

371
  $sequence = uc($sequence);
372

373 374
  my $statement = 
    $self->prepare("INSERT INTO dna(seq_region_id, sequence) VALUES(?,?)");
375

376 377 378
  $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
  $statement->bind_param(2,$sequence,SQL_LONGVARCHAR);
  $statement->execute();
379

380
  $statement->finish();
381

382
  return;
383 384 385 386 387
}




388
=head2 fetch_by_assembly_location
389

390
  Description: DEPRECATED use fetch_by_Slice_start_end_strand() instead.
391

392
=cut
393

394 395 396
sub fetch_by_assembly_location {
   my ( $self, $chrStart, $chrEnd, 
        $strand, $chrName, $assemblyType ) = @_;
397

398 399
   deprecate('Use fetch_by_Slice_start_end_strand() instead');

400 401 402
   my $csa = $self->db->get_CoordSystem();
   my $top_cs = @{$csa->fetch_all};

403
   my $slice_adaptor = $self->db->get_SliceAdaptor();
404
   my $slice = $slice_adaptor->fetch_by_region($top_cs->name(), $chrName,
405
                                               $chrStart, $chrEnd,
406
                                               $strand, $top_cs->version);
407

408
   return $self->fetch_by_Slice_start_end_strand($slice,1, $slice->length,1);
409 410
}

411

412
=head2 fetch_by_RawContig_start_end_strand
413

414
  Description: DEPRECATED use fetch_by_Slice_start_end_strand instead
415 416 417

=cut

418 419 420
sub fetch_by_RawContig_start_end_strand {
  deprecate('Use fetch_by_Slice_start_end_strand instead.');
  fetch_by_Slice_start_end_strand(@_);
421 422 423
}


424 425


426
1;