Commit 38b2e251 authored by Magali Ruffier's avatar Magali Ruffier
Browse files

Revert "Merge pull request #72 from Ensembl/experimental/alternative_sequence_stores"

This reverts commit b6a99d4a, reversing
changes made to ac9e1dc1.

Conflicts:

	modules/Bio/EnsEMBL/DBSQL/BaseSequenceAdaptor.pm
parent fe19c3d2
#!/usr/bin/env perl
# Copyright [1999-2014] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use strict;
use warnings;
use File::Basename;
use Bio::EnsEMBL::Utils::IO::FileFaidx;
my $current_time = time();
my $file = $ARGV[0];
my $faidx = Bio::EnsEMBL::Utils::IO::FileFaidx->new($file, 'write_index');
my $index = $faidx->index_path($file);
if(-f $index) {
print "Removing the existing index at $index\n";
unlink $index;
}
print "Generating the index\n";
$faidx->lookup();
my $finished_time = time();
my $elapsed = $finished_time - $current_time;
my $time = gmtime($finished_time);
print "Finished @ $time. Took $elapsed second(s)\n";
\ No newline at end of file
=head1 LICENSE
Copyright [1999-2014] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=cut
=head1 NAME
Bio::EnsEMBL::DBSQL::BaseSequenceAdaptor
=head1 DESCRIPTION
The BaseSequenceAdaptor is responsible for the conversion of calls from
C<fetch_by_Slice_start_end_strand()> for Sequence data into requests for a
backing data store. In Ensembl these are the B<seqlevel> sequence region
records held in the MySQL database.
The base adaptor also provides sequence caching based on normalisation
technique similar to the UCSC and BAM binning indexes. The code works
by right-shifting the requested start and end by a seq chunk power
(by default 18 approx. 250,000bp) and then left-shifting by the same
value. This means any value within a given window will always result in
the same value. Please see the worked examples below:
# Equation
p=position
o=seq chunk power
offset=( (p-1)>>o ) << o
# Using real values
p=1340001
o=18
right_shifted = (1340001-1) >> 18 == 5
offset = 5 << 18 == 1310720
To control the size of the cache and sequences stored you can provide
the seq chunk power and the number of sequences cached.
=cut
package Bio::EnsEMBL::DBSQL::BaseSequenceAdaptor;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Cache;
use Bio::EnsEMBL::Utils::Exception qw(throw);
our $SEQ_CHUNK_PWR = 18; # 2^18 = approx. 250KB
our $SEQ_CACHE_SIZE = 5;
=head2 fetch_by_Slice_start_end_strand
Arg [1] : Bio::EnsEMBL::Slice slice
The slice from which you want the sequence
Arg [2] : Integer; $strand (optional)
The start base pair relative to the start of the slice. Negative
values or values greater than the length of the slice are fine.
default = 1
Arg [3] : (optional) int endBasePair
The end base pair relative to the start of the slice. Negative
values or values greater than the length of the slice are fine,
but the end must be greater than or equal to the start
count from 1
default = the length of the slice
Arg [4] : Integer; $strand (optional)
Strand of DNA to fetch
Returntype : StringRef (DNA requested)
Description: Performs the fetching of DNA based upon a Slice. All fetches
should use this method and no-other.
Implementing classes are responsible for converting the
given Slice and values into something which can be processed by
the underlying storage engine. Implementing class are also
responsible for the reverse complementing of sequence.
Exceptions : Thrown if not redefined
=cut
sub fetch_by_Slice_start_end_strand {
my ( $self, $slice, $start, $end, $strand ) = @_;
throw "fetch_by_Slice_start_end_strand() must be implemented";
}
=head2 can_access_Slice
Description : Returns a boolean indiciating if the adaptor understands
the given Slice.
Returntype : Boolean; if true you can get sequence for the given Slice
Exceptions : Thrown if not redefined
=cut
sub can_access_Slice {
my ($self, $slice) = @_;
throw "can_access_Slice() must be implemented";
}
=head2 expand_Slice
Arg [1] : Bio::EnsEMBL::Slice slice
The slice from which you want the sequence
Arg [2] : Integer; $strand (optional)
The start base pair relative to the start of the slice. Negative
values or values greater than the length of the slice are fine.
default = 1
Arg [3] : (optional) int endBasePair
The end base pair relative to the start of the slice. Negative
values or values greater than the length of the slice are fine,
but the end must be greater than or equal to the start
count from 1
default = the length of the slice
Arg [4] : Integer; $strand (optional)
Strand of DNA to fetch
Returntype : Bio::EnsEMBL::Slice
Description : Creates a new Slice which represents the requested region. Provides
logic applicable to all SliceAdaptor instance
Exceptions : Thrown if the Slice is circular (we currently do not support this as generic logic)
=cut
sub expand_Slice {
my ($self, $slice, $start, $end, $strand) = @_;
$start = 1 if ! defined $start;
$end = ($slice->end() - $slice->start()) + 1 if ! defined $end;
$strand = 1 if ! defined $strand;
my $right_expand = $end - $slice->length(); #negative is fine
my $left_expand = 1 - $start; #negative is fine
my $new_slice = $slice;
if ( $right_expand || $left_expand ) {
$new_slice =
$slice->strand > 0
? $slice->expand( $left_expand, $right_expand )
: $slice->expand( $right_expand, $left_expand );
}
return $new_slice;
}
=head2 new
Arg [1] : Int $chunk_power; sets the size of each element of
the sequence cache. Defaults to 18 which gives
block sizes of ~250Kb (it is actually 2^18)
Arg [2] : Int $cache_size; size of the cache. Defaults to 5 meaning
a cache of 1Mb if you use default values
Example : my $sa = $db_adaptor->get_SequenceAdaptor();
Description: Constructor. Calls superclass constructor and initialises
internal cache structure.
Returntype : Bio::EnsEMBL::DBSQL::SequenceAdaptor
Exceptions : none
Caller : DBAdaptor::get_SequenceAdaptor
Status : Stable
=cut
sub new {
my ($class, $chunk_power, $cache_size) = @_;
$class = ref($class) || $class;
my $self = bless({}, $class);
$self->_init_seq_instance($chunk_power, $cache_size);
return $self;
}
sub _init_seq_instance {
my ($self, $chunk_power, $cache_size) = @_;
$chunk_power ||= $SEQ_CHUNK_PWR;
$cache_size ||= $SEQ_CACHE_SIZE;
# use a LRU cache to limit the size
my $cache = Bio::EnsEMBL::Utils::Cache->TIEHASH($cache_size);
$self->{cache_size} = $cache_size;
$self->{chunk_power} = $chunk_power;
$self->{seq_cache_max} = ((2 ** $chunk_power) * $cache_size);
$self->{seq_cache} = $cache;
return;
}
=head2 clear_cache
Example : $sa->clear_cache();
Description : Removes all entries from the associcated sequence cache
Returntype : None
Exceptions : None
=cut
sub clear_cache {
my ($self) = @_;
$self->{seq_cache}->CLEAR();
return;
}
sub chunk_power {
my ($self) = @_;
return $self->{chunk_power};
}
sub cache_size {
my ($self) = @_;
return $self->{cache_size};
}
sub seq_cache_max {
my ($self) = @_;
return $self->{seq_cache_max};
}
=head2 _fetch_raw_seq
Arg [1] : String $id
The identifier of the sequence to fetch.
Arg [2] : Integer $start
Where to start fetching sequence from
Arg [2] : Integer $length
Total length of seuqence to fetch
Description : Performs the fetch of DNA from the backing storage
engine and provides it to the _fetch_seq() method
for optional caching.
Returntype : ScalarRef of DNA fetched. All bases should be uppercased
Exceptions : Thrown if the method is not reimplemented
=cut
sub _fetch_raw_seq {
my ($self, $id, $start, $length) = @_;
throw "Need to implement _fetch_raw_seq(). Code must return a ScalarRef";
}
=head2 _fetch_seq
Arg [1] : String $id
The identifier of the sequence to fetch.
Arg [2] : Integer $start
Where to start fetching sequence from
Arg [2] : Integer $length
Total length of seuqence to fetch
Description : If the requested region is smaller than our maximum length
cachable region we will see if the cache already contains
this chunk. If not we will request the region from C<_fetch_raw_seq()>
and cache it. If the region requested is larger than
the maximum cacheable sequence length we pass the request
onto C<_fetch_raw_seq()> with no caching layer.
This module is also responsible for the conversion of
requested regions into normalised region reuqests based
on C<chunk_power>.
Returntype : ScalarRef of DNA fetched. All bases should be uppercased
Exceptions : Thrown when C<_fetch_raw_seq()> is not re-implemented
=cut
sub _fetch_seq {
my ($self, $id, $start, $length) = @_;
my $cache = $self->{'seq_cache'};
my $cache_size = $self->cache_size();
my $seq_chunk_pwr = $self->chunk_power();
my $seq_cache_max = $self->seq_cache_max();
my $seq_ref;
# If the length of the region is less than the max size of the
# cache then we can use the cache. The region is converted into
# a min to max range of bins we have to query to get all the
# required sequence. Of these bins the cache may have all of them,
# none of them or a combination of the two states
if($length < $seq_cache_max) {
my $chunk_min = ($start-1) >> $seq_chunk_pwr;
my $chunk_max = ($start + $length - 1) >> $seq_chunk_pwr;
# piece together sequence from cached component parts
my $entire_seq = q{};
for(my $i = $chunk_min; $i <= $chunk_max; $i++) {
my $key = "${id}:${i}";
#If it exists within the cache then add to the string. We will trim
#down to the requested region later on
my $cached_seq_ref = $cache->FETCH($key);
if($cached_seq_ref) {
$entire_seq .= ${$cached_seq_ref};
}
#Otherwise it is uncached so fetch, concat and store in the cache
else {
my $min = ($i << $seq_chunk_pwr) + 1;
my $length = 1 << $seq_chunk_pwr;
my $tmp_seq_ref = $self->_fetch_raw_seq($id, $min, $length);
$entire_seq .= ${$tmp_seq_ref};
$cache->STORE($key, $tmp_seq_ref);
}
}
# now return only the requested portion of the entire sequence
my $min = ( $chunk_min << $seq_chunk_pwr ) + 1;
my $seq = substr( $entire_seq, $start - $min, $length );
$seq_ref = \$seq;
}
else {
# If it was too big then just ask it from the backing store. There's
# no use in caching this
$seq_ref = $self->_fetch_raw_seq($id, $start, $length);
}
return $seq_ref;
}
1;
=head1 LICENSE
Copyright [1999-2014] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
=cut
=head1 NAME
Bio::EnsEMBL::DBSQL::FastaSequenceAdaptor
=head1 DESCRIPTION
This sequence adaptor extends the BaseSequenceAdaptor code to provide an
implementation of the .fai index lookup as defined by samtools. The code uses
this indexing system to access portions of sequence and translates Slice requests
into sensible locations for our FASTA query layer.
The adaptor must be initalised with access to a Faidx compatible object and the FASTA
file backing must use the same seq_region_name as the querying slices otherwise
we cannot return the required data.
=cut
package Bio::EnsEMBL::DBSQL::FastaSequenceAdaptor;
use strict;
use warnings;
use base qw/Bio::EnsEMBL::DBSQL::BaseSequenceAdaptor/;
use Bio::EnsEMBL::Utils::Exception qw/throw/;
use Bio::EnsEMBL::Utils::Scalar qw/assert_ref/;
use Bio::EnsEMBL::Utils::Sequence qw/reverse_comp/;
use English qw/-no_match_vars/;
require bytes;
=head2 new
Arg [1] : FileFaidx; $faindex. A FileFaidx object or a compatible version
Arg [2] : Integer; $chunk_power. Size of the region to cache
Arg [3] : Integer; $cache_size. Number of regions to cache
Description : Builds an instance of the FastaSequenceAdaptor
=cut
sub new {
my ($class, $faindex, $chunk_power, $cache_size) = @_;
my $self = $class->SUPER::new($chunk_power, $cache_size);
$self->faindex($faindex);
return $self;
}
=head2 fetch_by_Slice_start_end_strand
Arg [1] : Bio::EnsEMBL::Slice; $slice. Slice to fetch sequence for
Arg [2] : Integer; $start. Start of region to retrieve relative to the Slice (defaults to 1)
Arg [3] : Integer; $end. End of region to retreive relative to the Slice (defaults to length)
Arg [4] : Integer; $strand. Strand to fetch (defaults to 1)
Description : Fetches sequence for the given slice. Unlike the normal SequenceAdaptor we assume
Sequence is held in a FASTA file under the Slice's seq_region_name.
Exception : Thrown if we are given a circular slice
=cut
sub fetch_by_Slice_start_end_strand {
my ( $self, $slice, $start, $end, $strand ) = @_;
# Input checks
assert_ref($slice, 'Bio::EnsEMBL::Slice', 'slice');
if(defined $end && $start > $end && $slice->is_circular()) {
throw "Currently we do not support circular requests";
}
#Get a new slice that spans the exact region to retrieve dna from.
#Then constrain to seq region if it's gone negative or over the end
$slice = $self->expand_Slice($slice, $start, $end, $strand);
$slice = $slice->constrain_to_seq_region();
# This call is likely to barf if we try to query using a chr name
# we do not understand. Use can_access_Slice() to make sure
my $seq_ref = $self->_fetch_seq($slice->seq_region_name(), $slice->start(), $slice->length());
reverse_comp($seq_ref) if $strand == -1;
return $seq_ref;
}
=head2 faindex
Description : Holds a reference to the Faindex object to use for sequence access
=cut
sub faindex {
my ($self, $faindex) = @_;
if(defined $faindex) {
assert_ref($faindex, 'Bio::EnsEMBL::Utils::IO::FileFaidx', 'faidx');
$self->{faindex} = $faindex;
}
return $self->{faindex};
}
=head2 can_access_Slice
Description : Checks the lookup to see if we have access to the Slice given (using
seq region name as the ID). We reject any Circular Slice
=cut
sub can_access_Slice {
my ($self, $slice) = @_;
return 0 if $slice->is_circular();
return $self->faindex()->can_access_id($slice->seq_region_name());
}
=head2 store
Description : Unsupported operation. Please use a FASTA serialiser
=cut
sub store {
throw "Unsupported operation. Cannot store sequence in a fasta file";
}
=head _fetch_raw_seq
Description : Provides access to the underlying faindex object and returns a sequence scalar ref
=cut
sub _fetch_raw_seq {
my ($self, $id, $start, $length) = @_;
my $seq_ref = $self->faindex()->fetch_seq($id, $start, $length);
return $seq_ref;
}
1;
......@@ -14,13 +14,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <dev@ensembl.org>.
developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.
<http://www.ensembl.org/Help/Contact>.
=cut
......@@ -45,21 +48,29 @@ An adaptor for the retrieval of DNA sequence from the EnsEMBL database
package Bio::EnsEMBL::DBSQL::SequenceAdaptor;
use vars qw(@ISA @EXPORT);
use strict;
use warnings;
use Bio::EnsEMBL::DBSQL::BaseAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw deprecate);
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
use Bio::EnsEMBL::Utils::Cache;
use Bio::EnsEMBL::Utils::Scalar qw( assert_ref );
use DBI qw/:sql_types/;
use base qw(Bio::EnsEMBL::DBSQL::BaseAdaptor Bio::EnsEMBL::DBSQL::BaseSequenceAdaptor);
our @EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
our $SEQ_CHUNK_PWR = 18; # 2^18 = approx. 250KB
our $SEQ_CACHE_SZ = 5;
our $SEQ_CACHE_MAX = (2 ** $SEQ_CHUNK_PWR) * $SEQ_CACHE_SZ;
@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
=head2 new
Arg [1] : none
Example : my $sa = $db_adaptor->get_SequenceAdaptor();
Description: Constructor. Calls superclass constructor and initialises
Description: Constructor. Calls superclass constructor and initialises
internal cache structure.
Returntype : Bio::EnsEMBL::DBSQL::SequenceAdaptor
Exceptions : none
......@@ -69,14 +80,64 @@ our @EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
=cut
sub new {
my ($caller, $db, $chunk_power, $cache_size) = shift;
my $caller = shift;
my $class = ref($caller) || $caller;
my $self = $class->SUPER::new(@_);
$self->_init_seq_instance($chunk_power, $cache_size);
$self->_populate_seq_region_edits();
# use an LRU cache to limit the size
my %seq_cache;
tie(%seq_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_CACHE_SZ);
$self->{'seq_cache'} = \%seq_cache;
#
# See if this has any seq_region_attrib of type "_rna_edit_cache" if so store these
# in a hash.
#
my $sth = $self->dbc->prepare('select sra.seq_region_id, sra.value from seq_region_attrib sra, attrib_type at where sra.attrib_type_id = at.attrib_type_id and code = "_rna_edit"');
$sth->execute();
my ($seq_region_id, $value);
$sth->bind_columns(\$seq_region_id, \$value);
my %edits;
my $count = 0;
while($sth->fetch()){
$count++;
my ($start, $end, $substring) = split (/\s+/, $value);