diff --git a/modules/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm index bf09e81ec7d3912e67fcd3bd56d69cb23fbbf412..d823bd09632dc60330ae2991dd6036f39c2b7b0c 100644 --- a/modules/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm +++ b/modules/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm @@ -69,6 +69,7 @@ use Bio::EnsEMBL::TopLevelAssemblyMapper; use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache use Bio::EnsEMBL::Utils::Exception qw(deprecate throw); +use Bio::EnsEMBL::Utils::SeqRegionCache; use integer; #do proper arithmetic bitshifts @@ -79,8 +80,6 @@ my $CHUNKFACTOR = 20; # 2^20 = approx. 10^6 # if the mapper is bigger than that its flushed before registering new stuff: my $MAX_PAIR_COUNT = 1000; -#number of seq regions to remember ids fo -my $SEQ_REGION_CACHE_SIZE = 2500; =head2 new @@ -101,10 +100,6 @@ sub new { $self->{'_asm_mapper_cache'} = {}; - my %cache; - tie(%cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_REGION_CACHE_SIZE); - $self->{'_sr_id_cache'} = \%cache; - return $self; } @@ -319,6 +314,7 @@ sub register_assembled { asm.cmp_end, asm.cmp_seq_region_id, sr.name, + sr.length, asm.ori, asm.asm_start, asm.asm_end @@ -339,10 +335,11 @@ sub register_assembled { $sth->execute($asm_seq_region_id, $region_start, $region_end, $cmp_cs_id); my($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $ori, - $asm_start, $asm_end); + $asm_start, $asm_end, $cmp_seq_region_length); $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id, - \$cmp_seq_region, \$ori, \$asm_start, \$asm_end); + \$cmp_seq_region, \$cmp_seq_region_length, \$ori, + \$asm_start, \$asm_end); # # Load the unregistered regions of the mapper @@ -354,8 +351,12 @@ sub register_assembled { $asm_seq_region, $asm_start, $asm_end, $ori, $cmp_seq_region, $cmp_start, $cmp_end); - $self->{'_sr_id_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = - $cmp_seq_region_id; + my $arr = [ $cmp_seq_region_id, $cmp_seq_region, $cmp_cs_id, $cmp_seq_region_length ]; + + 
$Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$cmp_seq_region:$cmp_cs_id"} = + $arr; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$cmp_seq_region_id"} = + $arr; } } @@ -372,15 +373,16 @@ sub _seq_region_name_to_id { ($sr_name && $cs_id) || throw('seq_region_name and coord_system_id args ' . 'are required'); - my $sr_id = $self->{'_sr_id_cache'}->{"$sr_name:$cs_id"}; - - return $sr_id if($sr_id); + my $arr = $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$sr_name:$cs_id"}; + if( $arr ) { + return $arr->[0]; + } # Get the seq_region_id via the name. This would be quicker if we just # used internal ids instead but stored but then we lose the ability # the transform accross databases with different internal ids - my $sth = $self->prepare("SELECT seq_region_id " . + my $sth = $self->prepare("SELECT seq_region_id, length " . "FROM seq_region " . "WHERE name = ? AND coord_system_id = ?"); @@ -391,10 +393,15 @@ sub _seq_region_name_to_id { "in coord system $cs_id"); } - ($sr_id) = $sth->fetchrow_array(); + my ($sr_id, $sr_length) = $sth->fetchrow_array(); $sth->finish(); - $self->{'_sr_id_cache'}->{"$sr_name:$cs_id"} = $sr_id; + $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ]; + + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$sr_name:$cs_id"} = + $arr; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$sr_id"} = + $arr; return $sr_id; } @@ -442,7 +449,8 @@ sub register_component { asm.asm_start, asm.asm_end, asm.asm_seq_region_id, - sr.name + sr.name, + sr.length FROM assembly asm, seq_region sr WHERE @@ -466,10 +474,15 @@ sub register_component { "component region cmp_seq_region_id=[$cmp_seq_region_id]"); } - my ($asm_start, $asm_end, $asm_seq_region_id, $asm_seq_region) = + my ($asm_start, $asm_end, $asm_seq_region_id, $asm_seq_region, $asm_seq_region_length) = $sth->fetchrow_array(); - $self->{'_sr_id_cache'}->{"$asm_seq_region:$asm_cs_id"} = $asm_seq_region_id; + my $arr = [ $asm_seq_region_id, $asm_seq_region, $asm_cs_id, 
$asm_seq_region_length ]; + + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$asm_seq_region:$asm_cs_id"} = + $arr; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$asm_seq_region_id"} = + $arr; $sth->finish(); @@ -574,6 +587,7 @@ sub register_chained { asm.cmp_end, asm.cmp_seq_region_id, sr.name, + sr.length, asm.ori, asm.asm_start, asm.asm_end @@ -593,6 +607,7 @@ sub register_chained { asm.asm_end, asm.asm_seq_region_id, sr.name, + sr.length, asm.ori, asm.cmp_start, asm.cmp_end @@ -655,11 +670,11 @@ sub register_chained { #load the start <-> mid mapper with the results and record the mid cs #ranges we just added to the mapper - my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, + my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length, $ori, $start_start, $start_end); $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id, - \$mid_seq_region, \$ori, \$start_start, + \$mid_seq_region, \$mid_length, \$ori, \$start_start, \$start_end); while($sth->fetch()) { @@ -686,8 +701,12 @@ sub register_chained { } #update sr_name cache - $self->{'_sr_id_cache'}->{"$mid_seq_region:$mid_cs_id"} = - $mid_seq_region_id; + my $arr = [ $mid_seq_region_id, $mid_seq_region, $mid_cs_id, $mid_length ]; + + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$mid_seq_region:$mid_cs_id"} = + $arr; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$mid_seq_region_id"} = + $arr; push @mid_ranges,[$mid_seq_region_id,$mid_seq_region, $mid_start,$mid_end]; @@ -748,11 +767,11 @@ sub register_chained { #load the end <-> mid mapper with the results and record the mid cs #ranges we just added to the mapper - my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, + my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, $end_length, $ori, $mid_start, $mid_end); $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id, - \$end_seq_region, \$ori, \$mid_start, + \$end_seq_region, \$end_length, \$ori, \$mid_start, \$mid_end); 
while($sth->fetch()) { @@ -766,8 +785,12 @@ sub register_chained { ); #update sr_name cache - $self->{'_sr_id_cache'}->{"$end_seq_region:$end_cs_id"} = - $end_seq_region_id; + my $arr = [ $end_seq_region_id, $end_seq_region, $end_cs_id, $end_length ]; + + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$end_seq_region:$end_cs_id"} = + $arr; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$end_seq_region_id"} = + $arr; #register this region on the end coord system $end_registry->check_and_register($end_seq_region, $end_start, $end_end); @@ -852,7 +875,6 @@ sub register_chained { sub deleteObj { my $self = shift; - delete $self->{'_asm_mapper_cache'}; $self->SUPER::deleteObj(); } @@ -884,9 +906,12 @@ sub seq_regions_to_ids { my @out; foreach my $sr (@$seq_regions) { - my $id = $self->{'_sr_id_cache'}->{"$sr:$cs_id"}; - $id = $self->_seq_region_name_to_id($sr,$cs_id) if(!$id); - push @out, $id; + my $arr = $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$sr:$cs_id"}; + if( $arr ) { + push( @out, $arr->[0] ); + } else { + push @out, $self->_seq_region_name_to_id($sr,$cs_id); + } } return \@out; diff --git a/modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm index 2da06e01fb329f3c0ef324c1f97fd08055da9efd..d313fc02ac25c816795afda53b587cd96d9b8f77 100644 --- a/modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm +++ b/modules/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm @@ -95,10 +95,10 @@ use Bio::EnsEMBL::Mapper; use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache +use Bio::EnsEMBL::Utils::SeqRegionCache; -@ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor'); -my $SEQ_REGION_CACHE_SIZE = 1000; +@ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor'); sub new { my $caller = shift; @@ -107,14 +107,6 @@ sub new { my $self = $class->SUPER::new(@_); - my %name_cache; - my %id_cache; - - tie(%name_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_REGION_CACHE_SIZE); - tie(%id_cache, 'Bio::EnsEMBL::Utils::Cache', 
$SEQ_REGION_CACHE_SIZE); - - $self->{'_name_cache'} = \%name_cache; - $self->{'_id_cache'} = \%id_cache; return $self; } @@ -170,6 +162,10 @@ sub new { =cut + +# +# ARNE: This subroutine needs simplification!! +# sub fetch_by_region { my ($self, $coord_system_name, $seq_region_name, $start, $end, $strand, $version) = @_; @@ -209,7 +205,7 @@ sub fetch_by_region { $constraint = "sr.coord_system_id = ?"; - $key = lc(join(':',$seq_region_name,$cs->name(), $cs->version)); + $key = "$seq_region_name:".$cs->dbID(); } else { $sql = "SELECT sr.name, sr.seq_region_id, sr.length, " . " cs.coord_system_id " . @@ -225,10 +221,13 @@ sub fetch_by_region { #check the cache so we only go to the db if necessary my $length; - my $name_cache = $self->{'_name_cache'}; + my $arr; + if( $key ) { + $arr = $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{$key}; + } - if($key && exists($name_cache->{$key})) { - $length = $name_cache->{$key}->[1]; + if( $arr ) { + $length = $arr->[3]; } else { my $sth = $self->prepare($sql . " WHERE sr.name = ? AND " . $constraint); @@ -261,9 +260,11 @@ sub fetch_by_region { my $tmp_cs = ($cs) ? 
$cs : $csa->fetch_by_dbID($cs_id); # cache values for future reference - $key = lc(join(':',$tmp_name,$tmp_cs->name(),$tmp_cs->version)); - $name_cache->{$key} = [$id,$tmp_length]; - $self->{'_id_cache'}->{$id} = [$tmp_name,$tmp_length,$high_cs]; + my $arr = [ $id, $tmp_name, $cs_id, $tmp_length ]; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$tmp_name:$cs_id"} = + $arr; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$id"} = + $arr; my $tmp_ver = substr($tmp_name, $prefix_len); @@ -291,15 +292,15 @@ sub fetch_by_region { my ($id, $cs_id); ($seq_region_name, $id, $length, $cs_id) = $sth->fetchrow_array(); $sth->finish(); - - if(!$cs) { - $cs = $csa->fetch_by_dbID($cs_id); - $key = lc(join(':',$seq_region_name,$cs->name(), $cs->version)); - } - - #cache results to speed up future queries - $name_cache->{$key} = [$id,$length]; - $self->{'_id_cache'}->{$id} = [$seq_region_name, $length, $cs]; + + # cahce to speed up for future queries + my $arr = [ $id, $seq_region_name, $cs_id, $length ]; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$seq_region_name:$cs_id"} = + $arr; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$id"} = + $arr; + + $cs = $csa->fetch_by_dbID( $cs_id ); } } @@ -384,12 +385,13 @@ sub fetch_by_name { sub fetch_by_seq_region_id { my ($self, $seq_region_id) = @_; - my $id_cache = $self->{'_id_cache'}; - + my $arr = $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{ $seq_region_id }; my ($name, $length, $cs); - if(exists $id_cache->{$seq_region_id}) { - ($name, $length, $cs) = @{$id_cache->{$seq_region_id}}; + if( $arr ) { + my $cs_id; + ($name, $cs_id, $length ) = ( $arr->[1], $arr->[2], $arr->[3] ); + $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id); } else { my $sth = $self->prepare("SELECT name, length, coord_system_id " . "FROM seq_region " . 
@@ -406,9 +408,12 @@ sub fetch_by_seq_region_id { $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id); #cache results to speed up repeated queries - $id_cache->{$seq_region_id} = [$name, $length, $cs]; - my $key = lc(join(':', $name, $cs->name, $cs->version)); - $self->{'_name_cache'}->{$key} = [$seq_region_id, $length]; + my $arr = [ $seq_region_id, $name, $cs_id, $length ]; + + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$name:$cs_id"} = + $arr; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$seq_region_id"} = + $arr; } return Bio::EnsEMBL::Slice->new(-COORD_SYSTEM => $cs, @@ -445,44 +450,42 @@ sub get_seq_region_id { if(!$slice || !ref($slice) || !$slice->isa('Bio::EnsEMBL::Slice')) { throw('Slice argument is required'); } - - my $cs_name = $slice->coord_system->name(); - my $cs_version = $slice->coord_system->version(); + my $seq_region_name = $slice->seq_region_name(); + my $key = $seq_region_name.":".$slice->coord_system->dbID(); + my $arr = $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$key"}; - my $key = lc(join(':', $seq_region_name,$cs_name,$cs_version)); - - my $name_cache = $self->{'_name_cache'}; - - if(exists($name_cache->{$key})) { - return $name_cache->{$key}->[0]; + if( $arr ) { + return $arr->[0]; } - my $csa = $self->db()->get_CoordSystemAdaptor(); - my $coord_system = $csa->fetch_by_name($cs_name,$cs_version); + my $cs_id = $slice->coord_system->dbID(); my $sth = $self->prepare("SELECT seq_region_id, length " . "FROM seq_region " . "WHERE name = ? AND coord_system_id = ?"); #force seq_region_name cast to string so mysql cannot treat as int - $sth->execute("$seq_region_name", $coord_system->dbID()); + $sth->execute("$seq_region_name", $cs_id ); if($sth->rows() != 1) { throw("Non existant or ambigous seq_region:\n" . - " coord_system=[$cs_name],\n" . - " name=[$seq_region_name],\n" . - " version=[$cs_version]"); + " coord_system=[$cs_id],\n" . 
+ " name=[$seq_region_name],\n"); + } my($seq_region_id, $length) = $sth->fetchrow_array(); $sth->finish(); #cache information for future requests - $name_cache->{$key} = [$seq_region_id, $length]; - $self->{'_id_cache'}->{$seq_region_id} = - [$seq_region_name, $length, $coord_system]; - + $arr = [ $seq_region_id, $seq_region_name, $cs_id, $length ]; + + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$seq_region_name:$cs_id"} = + $arr; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$seq_region_id"} = + $arr; + return $seq_region_id; } @@ -592,8 +595,6 @@ sub fetch_all { my ($seq_region_id, $name, $length, $cs_id); $sth->bind_columns(\$seq_region_id, \$name, \$length, \$cs_id); - my $name_cache = $self->{'_name_cache'}; - my $id_cache = $self->{'_id_cache'}; my $cache_count = 0; my @out; @@ -605,14 +606,16 @@ sub fetch_all { throw("seq_region $name references non-existent coord_system $cs_id."); } - my $cs_key = lc($cs->name().':'.$cs_version); - #cache values for future reference, but stop adding to the cache once we #we know we have filled it up - if($cache_count < $SEQ_REGION_CACHE_SIZE) { - my $key = lc($name) . ':'. 
$cs_key; - $name_cache->{$key} = [$seq_region_id, $length]; - $id_cache->{$seq_region_id} = [$name, $length, $cs]; + if($cache_count < $Bio::EnsEMBL::Utils::SeqRegionCache::SEQ_REGION_CACHE_SIZE) { + my $arr = [ $seq_region_id, $name, $cs_id, $length ]; + + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_name_cache{"$name:$cs_id"} = + $arr; + $Bio::EnsEMBL::Utils::SeqRegionCache::sr_id_cache{"$seq_region_id"} = + $arr; + $cache_count++; } @@ -655,17 +658,6 @@ sub fetch_all { } -sub deleteObj { - my $self = shift; - - $self->SUPER::deleteObj; - - $self->{'_id_cache'} = {}; - $self->{'_name_cache'} = {}; - $self->{'_exc_cache'} = {}; -} - - =head2 fetch_by_band diff --git a/modules/Bio/EnsEMBL/Utils/SeqRegionCache.pm b/modules/Bio/EnsEMBL/Utils/SeqRegionCache.pm new file mode 100644 index 0000000000000000000000000000000000000000..703af8507fa6e4e865f7d20ac0f3004ea89720cc --- /dev/null +++ b/modules/Bio/EnsEMBL/Utils/SeqRegionCache.pm @@ -0,0 +1,27 @@ +use strict; +use Bio::EnsEMBL::Utils::Cache; + + +package Bio::EnsEMBL::Utils::SeqRegionCache; + +# Package variables ('our', not 'my'): the adaptors read and write these +# through their fully qualified names, which never see file lexicals. +our $SEQ_REGION_CACHE_SIZE = 4000; + +our %sr_id_cache; +our %sr_name_cache; + +tie(%sr_name_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_REGION_CACHE_SIZE); +tie(%sr_id_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_REGION_CACHE_SIZE); + +1; + + +# +# the items to cache should be listrefs to +# [ sr_id, sr_name, cs_id, sr_length ] +# +# The name cache key is "sr_name:cs_id" +# The id cache is keyed on "sr_id" +# +