Commit 4a405d74 authored by Arne Stabenau's avatar Arne Stabenau
Browse files

centralized all seq_region caching

parent f2cb6af6
......@@ -69,6 +69,7 @@ use Bio::EnsEMBL::TopLevelAssemblyMapper;
use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache
use Bio::EnsEMBL::Utils::Exception qw(deprecate throw);
use Bio::EnsEMBL::Utils::SeqRegionCache;
use integer; #do proper arithmetic bitshifts
......@@ -79,8 +80,6 @@ my $CHUNKFACTOR = 20; # 2^20 = approx. 10^6
# if the mapper is bigger than that its flushed before registering new stuff:
my $MAX_PAIR_COUNT = 1000;
#number of seq regions to remember ids for
my $SEQ_REGION_CACHE_SIZE = 2500;
=head2 new
......@@ -101,10 +100,6 @@ sub new {
$self->{'_asm_mapper_cache'} = {};
my %cache;
tie(%cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_REGION_CACHE_SIZE);
$self->{'_sr_id_cache'} = \%cache;
return $self;
}
......@@ -319,6 +314,7 @@ sub register_assembled {
asm.cmp_end,
asm.cmp_seq_region_id,
sr.name,
sr.length,
asm.ori,
asm.asm_start,
asm.asm_end
......@@ -339,10 +335,11 @@ sub register_assembled {
$sth->execute($asm_seq_region_id, $region_start, $region_end, $cmp_cs_id);
my($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $ori,
$asm_start, $asm_end);
$asm_start, $asm_end, $cmp_seq_region_length);
$sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id,
\$cmp_seq_region, \$ori, \$asm_start, \$asm_end);
\$cmp_seq_region, \$cmp_seq_region_length, \$ori,
\$asm_start, \$asm_end);
#
# Load the unregistered regions of the mapper
......@@ -354,8 +351,12 @@ sub register_assembled {
$asm_seq_region, $asm_start, $asm_end,
$ori,
$cmp_seq_region, $cmp_start, $cmp_end);
$self->{'_sr_id_cache'}->{"$cmp_seq_region:$cmp_cs_id"} =
$cmp_seq_region_id;
my $arr = [ $cmp_seq_region_id, $cmp_seq_region, $cmp_cs_id, $cmp_seq_region_length ];
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$cmp_seq_region:$cmp_cs_id"} =
$arr;
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$cmp_seq_region_id"} =
$arr;
}
}
......@@ -372,15 +373,16 @@ sub _seq_region_name_to_id {
($sr_name && $cs_id) || throw('seq_region_name and coord_system_id args ' .
'are required');
my $sr_id = $self->{'_sr_id_cache'}->{"$sr_name:$cs_id"};
return $sr_id if($sr_id);
my $arr = $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$sr_name:$cs_id"};
if( $arr ) {
return $arr->[0];
}
# Get the seq_region_id via the name. This would be quicker if we just
# used internal ids instead, but then we would lose the ability to
# transform across databases with different internal ids
my $sth = $self->prepare("SELECT seq_region_id " .
my $sth = $self->prepare("SELECT seq_region_id, length " .
"FROM seq_region " .
"WHERE name = ? AND coord_system_id = ?");
......@@ -391,10 +393,15 @@ sub _seq_region_name_to_id {
"in coord system $cs_id");
}
($sr_id) = $sth->fetchrow_array();
my ($sr_id, $sr_length) = $sth->fetchrow_array();
$sth->finish();
$self->{'_sr_id_cache'}->{"$sr_name:$cs_id"} = $sr_id;
$arr = [ $sr_id, $sr_name, $cs_id, $sr_length ];
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$sr_name:$cs_id"} =
$arr;
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$sr_id"} =
$arr;
return $sr_id;
}
......@@ -442,7 +449,8 @@ sub register_component {
asm.asm_start,
asm.asm_end,
asm.asm_seq_region_id,
sr.name
sr.name,
sr.length
FROM
assembly asm, seq_region sr
WHERE
......@@ -466,10 +474,15 @@ sub register_component {
"component region cmp_seq_region_id=[$cmp_seq_region_id]");
}
my ($asm_start, $asm_end, $asm_seq_region_id, $asm_seq_region) =
my ($asm_start, $asm_end, $asm_seq_region_id, $asm_seq_region, $asm_seq_region_length) =
$sth->fetchrow_array();
$self->{'_sr_id_cache'}->{"$asm_seq_region:$asm_cs_id"} = $asm_seq_region_id;
my $arr = [ $asm_seq_region_id, $asm_seq_region, $asm_cs_id, $asm_seq_region_length ];
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$asm_seq_region:$asm_cs_id"} =
$arr;
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$asm_seq_region_id"} =
$arr;
$sth->finish();
......@@ -574,6 +587,7 @@ sub register_chained {
asm.cmp_end,
asm.cmp_seq_region_id,
sr.name,
sr.length,
asm.ori,
asm.asm_start,
asm.asm_end
......@@ -593,6 +607,7 @@ sub register_chained {
asm.asm_end,
asm.asm_seq_region_id,
sr.name,
sr.length,
asm.ori,
asm.cmp_start,
asm.cmp_end
......@@ -655,11 +670,11 @@ sub register_chained {
#load the start <-> mid mapper with the results and record the mid cs
#ranges we just added to the mapper
my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region,
my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
$ori, $start_start, $start_end);
$sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
\$mid_seq_region, \$ori, \$start_start,
\$mid_seq_region, \$mid_length, \$ori, \$start_start,
\$start_end);
while($sth->fetch()) {
......@@ -686,8 +701,12 @@ sub register_chained {
}
#update sr_name cache
$self->{'_sr_id_cache'}->{"$mid_seq_region:$mid_cs_id"} =
$mid_seq_region_id;
my $arr = [ $mid_seq_region_id, $mid_seq_region, $mid_cs_id, $mid_length ];
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$mid_seq_region:$mid_cs_id"} =
$arr;
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$mid_seq_region_id"} =
$arr;
push @mid_ranges,[$mid_seq_region_id,$mid_seq_region,
$mid_start,$mid_end];
......@@ -748,11 +767,11 @@ sub register_chained {
#load the end <-> mid mapper with the results and record the mid cs
#ranges we just added to the mapper
my ($end_start, $end_end, $end_seq_region_id, $end_seq_region,
my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, $end_length,
$ori, $mid_start, $mid_end);
$sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id,
\$end_seq_region, \$ori, \$mid_start,
\$end_seq_region, \$end_length, \$ori, \$mid_start,
\$mid_end);
while($sth->fetch()) {
......@@ -766,8 +785,12 @@ sub register_chained {
);
#update sr_name cache
$self->{'_sr_id_cache'}->{"$end_seq_region:$end_cs_id"} =
$end_seq_region_id;
my $arr = [ $end_seq_region_id, $end_seq_region, $end_cs_id, $end_length ];
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$end_seq_region:$end_cs_id"} =
$arr;
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$end_seq_region_id"} =
$arr;
#register this region on the end coord system
$end_registry->check_and_register($end_seq_region, $end_start, $end_end);
......@@ -852,7 +875,6 @@ sub register_chained {
# Drops this adaptor's assembly-mapper cache and then hands control to the
# parent class's deleteObj so it can do its own clean-up.
sub deleteObj {
my ($self) = @_;
# remove the cache entry before delegating to the superclass
delete $self->{'_asm_mapper_cache'};
return $self->SUPER::deleteObj();
}
......@@ -884,9 +906,12 @@ sub seq_regions_to_ids {
my @out;
foreach my $sr (@$seq_regions) {
my $id = $self->{'_sr_id_cache'}->{"$sr:$cs_id"};
$id = $self->_seq_region_name_to_id($sr,$cs_id) if(!$id);
push @out, $id;
my $arr = $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$sr:$cs_id"};
if( $arr ) {
push( @out, $arr->[0] );
} else {
push @out, $self->_seq_region_name_to_id($sr,$cs_id);
}
}
return \@out;
......
......@@ -95,10 +95,10 @@ use Bio::EnsEMBL::Mapper;
use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning);
use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache
use Bio::EnsEMBL::Utils::SeqRegionCache;
@ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor');
my $SEQ_REGION_CACHE_SIZE = 1000;
@ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor');
sub new {
my $caller = shift;
......@@ -107,14 +107,6 @@ sub new {
my $self = $class->SUPER::new(@_);
my %name_cache;
my %id_cache;
tie(%name_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_REGION_CACHE_SIZE);
tie(%id_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_REGION_CACHE_SIZE);
$self->{'_name_cache'} = \%name_cache;
$self->{'_id_cache'} = \%id_cache;
return $self;
}
......@@ -170,6 +162,10 @@ sub new {
=cut
#
# ARNE: This subroutine needs simplification!!
#
sub fetch_by_region {
my ($self, $coord_system_name, $seq_region_name,
$start, $end, $strand, $version) = @_;
......@@ -209,7 +205,7 @@ sub fetch_by_region {
$constraint = "sr.coord_system_id = ?";
$key = lc(join(':',$seq_region_name,$cs->name(), $cs->version));
$key = "$seq_region_name:".$cs->dbID();
} else {
$sql = "SELECT sr.name, sr.seq_region_id, sr.length, " .
" cs.coord_system_id " .
......@@ -225,10 +221,13 @@ sub fetch_by_region {
#check the cache so we only go to the db if necessary
my $length;
my $name_cache = $self->{'_name_cache'};
my $arr;
if( $key ) {
$arr = $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{$key};
}
if($key && exists($name_cache->{$key})) {
$length = $name_cache->{$key}->[1];
if( $arr ) {
$length = $arr->[3];
} else {
my $sth = $self->prepare($sql . " WHERE sr.name = ? AND " .
$constraint);
......@@ -261,9 +260,11 @@ sub fetch_by_region {
my $tmp_cs = ($cs) ? $cs : $csa->fetch_by_dbID($cs_id);
# cache values for future reference
$key = lc(join(':',$tmp_name,$tmp_cs->name(),$tmp_cs->version));
$name_cache->{$key} = [$id,$tmp_length];
$self->{'_id_cache'}->{$id} = [$tmp_name,$tmp_length,$high_cs];
my $arr = [ $id, $tmp_name, $cs_id, $tmp_length ];
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$tmp_name:$cs_id"} =
$arr;
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$id"} =
$arr;
my $tmp_ver = substr($tmp_name, $prefix_len);
......@@ -291,15 +292,15 @@ sub fetch_by_region {
my ($id, $cs_id);
($seq_region_name, $id, $length, $cs_id) = $sth->fetchrow_array();
$sth->finish();
if(!$cs) {
$cs = $csa->fetch_by_dbID($cs_id);
$key = lc(join(':',$seq_region_name,$cs->name(), $cs->version));
}
#cache results to speed up future queries
$name_cache->{$key} = [$id,$length];
$self->{'_id_cache'}->{$id} = [$seq_region_name, $length, $cs];
# cache to speed up future queries
my $arr = [ $id, $seq_region_name, $cs_id, $length ];
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$seq_region_name:$cs_id"} =
$arr;
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$id"} =
$arr;
$cs = $csa->fetch_by_dbID( $cs_id );
}
}
......@@ -384,12 +385,13 @@ sub fetch_by_name {
sub fetch_by_seq_region_id {
my ($self, $seq_region_id) = @_;
my $id_cache = $self->{'_id_cache'};
my $arr = $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{ $seq_region_id };
my ($name, $length, $cs);
if(exists $id_cache->{$seq_region_id}) {
($name, $length, $cs) = @{$id_cache->{$seq_region_id}};
if( $arr ) {
my $cs_id;
($name, $cs_id, $length ) = ( $arr->[1], $arr->[2], $arr->[3] );
$cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id);
} else {
my $sth = $self->prepare("SELECT name, length, coord_system_id " .
"FROM seq_region " .
......@@ -406,9 +408,12 @@ sub fetch_by_seq_region_id {
$cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id);
#cache results to speed up repeated queries
$id_cache->{$seq_region_id} = [$name, $length, $cs];
my $key = lc(join(':', $name, $cs->name, $cs->version));
$self->{'_name_cache'}->{$key} = [$seq_region_id, $length];
my $arr = [ $seq_region_id, $name, $cs_id, $length ];
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$name:$cs_id"} =
$arr;
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$seq_region_id"} =
$arr;
}
return Bio::EnsEMBL::Slice->new(-COORD_SYSTEM => $cs,
......@@ -445,44 +450,42 @@ sub get_seq_region_id {
if(!$slice || !ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) {
throw('Slice argument is required');
}
my $cs_name = $slice->coord_system->name();
my $cs_version = $slice->coord_system->version();
my $seq_region_name = $slice->seq_region_name();
my $key = $seq_region_name.":".$slice->coord_system->dbID();
my $arr = $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$key"};
my $key = lc(join(':', $seq_region_name,$cs_name,$cs_version));
my $name_cache = $self->{'_name_cache'};
if(exists($name_cache->{$key})) {
return $name_cache->{$key}->[0];
if( $arr ) {
return $arr->[0];
}
my $csa = $self->db()->get_CoordSystemAdaptor();
my $coord_system = $csa->fetch_by_name($cs_name,$cs_version);
my $cs_id = $slice->coord_system->dbID();
my $sth = $self->prepare("SELECT seq_region_id, length " .
"FROM seq_region " .
"WHERE name = ? AND coord_system_id = ?");
#force seq_region_name cast to string so mysql cannot treat as int
$sth->execute("$seq_region_name", $coord_system->dbID());
$sth->execute("$seq_region_name", $cs_id );
if($sth->rows() != 1) {
throw("Non existant or ambigous seq_region:\n" .
" coord_system=[$cs_name],\n" .
" name=[$seq_region_name],\n" .
" version=[$cs_version]");
" coord_system=[$cs_id],\n" .
" name=[$seq_region_name],\n");
}
my($seq_region_id, $length) = $sth->fetchrow_array();
$sth->finish();
#cache information for future requests
$name_cache->{$key} = [$seq_region_id, $length];
$self->{'_id_cache'}->{$seq_region_id} =
[$seq_region_name, $length, $coord_system];
$arr = [ $seq_region_id, $seq_region_name, $cs_id, $length ];
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$seq_region_name:$cs_id"} =
$arr;
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$seq_region_id"} =
$arr;
return $seq_region_id;
}
......@@ -592,8 +595,6 @@ sub fetch_all {
my ($seq_region_id, $name, $length, $cs_id);
$sth->bind_columns(\$seq_region_id, \$name, \$length, \$cs_id);
my $name_cache = $self->{'_name_cache'};
my $id_cache = $self->{'_id_cache'};
my $cache_count = 0;
my @out;
......@@ -605,14 +606,16 @@ sub fetch_all {
throw("seq_region $name references non-existent coord_system $cs_id.");
}
my $cs_key = lc($cs->name().':'.$cs_version);
#cache values for future reference, but stop adding to the cache once
#we know we have filled it up
if($cache_count < $SEQ_REGION_CACHE_SIZE) {
my $key = lc($name) . ':'. $cs_key;
$name_cache->{$key} = [$seq_region_id, $length];
$id_cache->{$seq_region_id} = [$name, $length, $cs];
if($cache_count < $Bio::EnsEMBL::Utils::SEQ_REGION_CACHE_SIZE) {
my $arr = [ $seq_region_id, $name, $cs_id, $length ];
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$name:$cs_id"} =
$arr;
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$seq_region_id"} =
$arr;
$cache_count++;
}
......@@ -655,17 +658,6 @@ sub fetch_all {
}
# Calls the parent class's deleteObj first, then replaces each of this
# adaptor's per-instance caches with a fresh empty hash.
sub deleteObj {
my ($self) = @_;
$self->SUPER::deleteObj();
# reset the id and name caches in the same order as before
foreach my $slot ('_id_cache', '_name_cache') {
$self->{$slot} = {};
}
# the last assignment is also the sub's (implicit) result
return $self->{'_exc_cache'} = {};
}
=head2 fetch_by_band
......
package Bio::EnsEMBL::Utils::SeqRegionCache;

#
# Central seq_region cache shared by the adaptors.
#
# The items to cache should be listrefs to
#   [ sr_id, sr_name, cs_id, sr_length ]
#
# The name cache (%sr_name_cache) is keyed on "sr_name:cs_id"
# The id cache   (%sr_id_cache)   is keyed on "sr_id"
#
# Both hashes are tied to Bio::EnsEMBL::Utils::Cache with a limit of
# $SEQ_REGION_CACHE_SIZE entries each.
#
# The caches and the size are accessed from other modules by their fully
# qualified names, e.g.
#   $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$name:$cs_id"}
#   $Bio::EnsEMBL::Utils::SeqRegionCache::SEQ_REGION_CACHE_SIZE
# so they have to be package variables ('our'); a 'my' lexical would be
# invisible under those names and callers would silently end up using
# separate, untied package hashes.
#

use strict;
use Bio::EnsEMBL::Utils::Cache;

our $SEQ_REGION_CACHE_SIZE = 4000;

our %sr_id_cache;
our %sr_name_cache;

# Tie *each* cache exactly once so that both are size limited.
tie(%sr_name_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_REGION_CACHE_SIZE);
tie(%sr_id_cache,   'Bio::EnsEMBL::Utils::Cache', $SEQ_REGION_CACHE_SIZE);

1;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment