Commit 1b0e7c46 authored by Graham McVicker's avatar Graham McVicker
Browse files

Improved speed of feature retrieval by:

-'Flattening' the FeaturePair object so that it no longer contains 2 SeqFeatures; it now isa SeqFeature with some additional Hit attributes (the object still behaves as if it contained two features)
-Adding 'fast' constructors to DnaDnaAlignFeatures and PepDnaAlignFeatures
-Some other minor tweaks such as using bind columns instead of fetchrow_hashref
parent 0b63c72a
......@@ -88,13 +88,11 @@ use Bio::EnsEMBL::SeqFeature;
use vars qw(@ISA);
use strict;
@ISA = qw(Bio::EnsEMBL::FeaturePair Bio::EnsEMBL::Root);
@ISA = qw(Bio::EnsEMBL::FeaturePair);
sub new {
my ($class,@args) = @_;
my ($caller, $pkg, $line) = caller;
my $self = $class->SUPER::new(@args);
#print "calling new with @args\n";
my ($cigar_string,$features)
......
......@@ -146,7 +146,8 @@ sub fetch_by_Slice_and_pid {
and with a percentage identity greater than $pid. If
$logic_name is defined, only features with an analysis of type
$logic_name will be returned.
Returntype : list of Bio::EnsEMBL::*AlignFeature in chromosomal coordinates
Returntype : listref of
Bio::EnsEMBL::*AlignFeature in chromosomal coordinates
Exceptions : thrown if $pid is not defined
Caller : general
......
......@@ -48,9 +48,37 @@ use strict;
# Object preamble - inherits from Bio::EnsEMBL::Root
use Bio::EnsEMBL::DBSQL::BaseAdaptor;
use Bio::EnsEMBL::Utils::Cache;
@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
my $SLICE_FEATURE_CACHE_SIZE = 12;
=head2 new
Arg [1] : list of args @args
Superclass constructor arguments
Example : none
Description: Constructor which just initializes internal cache structures
Returntype : Bio::EnsEMBL::BaseFeatureAdaptor
Exceptions : none
Caller : implementing subclass constructors
=cut
sub new {
  my $class = shift;

  # Construction proper is delegated to the superclass; every remaining
  # argument is handed through untouched.
  my $self = $class->SUPER::new(@_);

  # Set up the slice feature cache: the hash stored under
  # '_slice_feature_cache' is tied to Bio::EnsEMBL::Utils::Cache, which is
  # given $SLICE_FEATURE_CACHE_SIZE as its constructor argument.
  tie %{ $self->{'_slice_feature_cache'} },
    'Bio::EnsEMBL::Utils::Cache', $SLICE_FEATURE_CACHE_SIZE;

  return $self;
}
=head2 generic_fetch
......@@ -100,16 +128,12 @@ sub generic_fetch {
}
my $sth = $self->prepare($sql);
$sth->execute();
my $hashref;
my @out;
while($hashref = $sth->fetchrow_hashref()) {
push @out, $self->_obj_from_hashref($hashref);
}
#print STDERR "SQL START\n\n";
$sth->execute();
#print STDERR "SQL END\n";
return @out;
return $self->_objs_from_sth($sth);
}
......@@ -137,7 +161,7 @@ sub fetch_by_dbID{
my $constraint = "${tablename}_id = $id";
#return first element of _generic_fetch list
my ($feat) = $self->generic_fetch($constraint);
my ($feat) = @{$self->generic_fetch($constraint)};
return $feat;
}
......@@ -239,10 +263,7 @@ sub fetch_by_Contig_and_score{
$constraint = "score > $score";
}
my @features =
$self->fetch_by_Contig_constraint($contig, $constraint, $logic_name);
return @features;
return $self->fetch_by_Contig_constraint($contig, $constraint, $logic_name);
}
......@@ -269,6 +290,13 @@ sub fetch_by_Contig_and_score{
sub fetch_by_Slice_constraint {
my($self, $slice, $constraint, $logic_name) = @_;
#validate the slice before dereferencing it: building the cache key calls
#$slice->name, which would die on an undefined slice before the intended
#exception below could ever be thrown
if(!$slice){
  $self->throw("need a slice to work\n");
}

#the first argument of join() is the separator, not a list element, so
#use an explicit ':' and pass the slice name as part of the list.
#undefined optional arguments contribute an empty string to the key.
my $key = join(':',
               $slice->name,
               defined($constraint) ? $constraint : '',
               defined($logic_name) ? $logic_name : '');

#check the cache; a hit returns the cached features as a list
if($self->{'_slice_feature_cache'}{$key}) {
  return @{$self->{'_slice_feature_cache'}{$key}};
}
......@@ -276,7 +304,7 @@ sub fetch_by_Slice_constraint {
$self->throw("$slice isn't a slice");
}
my @features =
my $features =
$self->fetch_by_assembly_location_constraint($slice->chr_start,
$slice->chr_end,
$slice->chr_name,
......@@ -285,7 +313,7 @@ sub fetch_by_Slice_constraint {
$logic_name);
#convert from chromosomal coordinates to slice coordinates
foreach my $f (@features){
foreach my $f (@$features){
my $start = ($f->start - ($slice->chr_start - 1));
my $end = ($f->end - ($slice->chr_start - 1));
......@@ -294,7 +322,10 @@ sub fetch_by_Slice_constraint {
$f->attach_seq($slice);
}
return @features;
#update the cache
$self->{'_slice_feature_cache'}{$key} = $features;
return @$features;
}
......@@ -372,7 +403,7 @@ sub fetch_by_Slice_and_score {
Arg [5] : (optional) string $logic_name
the logic name of the type of features to obtain
Example : @feats = $adaptor->fetch_by_assembly_location(1, 10000, '9', 'NCBI30');
Description: Returns a list of features created from the database which are
Description: Returns a listref of features created from the database which
are in the assembly region defined by $start, $end, and $chr.
If $logic_name is defined, only features with an analysis
of type $logic_name will be returned.
......@@ -413,7 +444,8 @@ sub fetch_by_assembly_location{
and with a percentage identity greater than $pid. If
$logic_name is defined, only features with an analysis of type
$logic_name will be returned.
Returntype : list of Bio::EnsEMBL::*AlignFeature in chromosomal coordinates
Returntype : listref of Bio::EnsEMBL::*AlignFeature in
chromosomal coordinates
Exceptions : thrown if $score is not defined
Caller : general
......@@ -456,7 +488,8 @@ sub fetch_by_assembly_location_and_score{
and with a percentage identity greater than $pid. If
$logic_name is defined, only features with an analysis of type
$logic_name will be returned.
Returntype : list of Bio::EnsEMBL::*AlignFeature in chromosomal coordinates
Returntype : listref of Bio::EnsEMBL::*AlignFeature in chromosomal
coordinates
Exceptions : thrown if $score is not defined
Caller : BaseFeatureAdaptor
......@@ -491,12 +524,12 @@ sub fetch_by_assembly_location_constraint {
$constraint = "contig_id IN ($cid_list)";
}
my @features = $self->generic_fetch($constraint, $logic_name);
my $features = $self->generic_fetch($constraint, $logic_name);
my @out;
#convert the features to assembly coordinates from raw contig coordinates
foreach my $f (@features) {
while(my $f = shift @$features) {
#since feats were obtained in contig coords, attached seq is a contig
my $contig_id = $f->entire_seq->dbID();
my @coord_list =
......@@ -526,14 +559,10 @@ sub fetch_by_assembly_location_constraint {
$f->strand($coord->strand());
#$f->seqname($coord->id());
#
# Should we attach a slice of the entire chromosome here? (mcvicker)
#
push(@out,$f);
}
return @out;
return \@out;
}
......@@ -621,6 +650,26 @@ sub _obj_from_hashref {
. " subclass of AlignFeatureAdaptor");
}
=head2 deleteObj
Arg [1] : none
Example : none
Description: Cleans up internal caches and references to other objects so
that correct garbage collection may occur.
Returntype : none
Exceptions : none
Caller : Bio::EnsEMBL::DBConnection::deleteObj
=cut
sub deleteObj {
  my ($self) = @_;

  # Flush the slice feature cache by emptying the hash in place, so that
  # the cached feature lists are no longer referenced from this adaptor.
  %{ $self->{'_slice_feature_cache'} } = ();
}
1;
......@@ -93,6 +93,7 @@ sub _tablename {
sub _columns {
my $self = shift;
#warning, implementation of _objs_from_sth method depends on order of list
return qw(dna_align_feature_id contig_id analysis_id contig_start
contig_end contig_strand hit_start hit_name hit_strand
cigar_line evalue perc_ident score);
......@@ -161,68 +162,70 @@ sub store {
}
=head2 _obj_from_hashref
=head2 _objs_from_sth
Arg [1] : DBI array hash reference
the DBI hashref generated by selecting the columns specified
by _columns() from the table specified by _table()
Arg [1] : DBI statement handle $sth
an executed DBI statement handle generated by selecting
the columns specified by _columns() from the table specified
by _tablename()
Example : $dna_dna_align_feats = $self->_objs_from_sth($sth);
Description: PROTECTED implementation of superclass abstract method.
Creates DnaDnaAlignFeature objects from an executed DBI statement handle
Returntype : list of Bio::EnsEMBL::DnaDnaAlignFeatures
Returntype : listref of Bio::EnsEMBL::DnaDnaAlignFeatures
Exceptions : none
Caller : Bio::EnsEMBL::BaseFeatureAdaptor::generic_fetch
=cut
# PROTECTED: turns every row of an executed statement handle into a
# Bio::EnsEMBL::DnaDnaAlignFeature and returns a listref of them.
#
# Arg [1]    : DBI statement handle $sth, already executed, selecting the
#              columns returned by _columns() in that exact order
# Returntype : listref of Bio::EnsEMBL::DnaDnaAlignFeature
sub _objs_from_sth {
  my ($self, $sth) = @_;

  my $rca = $self->db()->get_RawContigAdaptor();
  my $aa  = $self->db()->get_AnalysisAdaptor();

  my ($dna_align_feature_id, $contig_id, $analysis_id, $contig_start,
      $contig_end, $contig_strand, $hit_start, $hit_name, $hit_strand,
      $cigar_line, $evalue, $perc_ident, $score);

  #warning: the bind order must match the column order of _columns();
  #each $sth->fetch call then refreshes these lexicals in place
  $sth->bind_columns(\$dna_align_feature_id, \$contig_id, \$analysis_id,
                     \$contig_start, \$contig_end, \$contig_strand,
                     \$hit_start, \$hit_name, \$hit_strand, \$cigar_line,
                     \$evalue, \$perc_ident, \$score);

  my @features = ();
  my ($analysis, $contig);

  while($sth->fetch) {
    $analysis = $aa->fetch_by_dbID($analysis_id);
    $contig   = $rca->fetch_by_dbID($contig_id);

    #use a very fast (hack) constructor since we may be creating over 10000
    #features at a time and normal object construction is too slow.
    push @features, Bio::EnsEMBL::DnaDnaAlignFeature->new_fast(
                  {'_gsf_tag_hash'  => {},
                   '_gsf_sub_array' => [],
                   '_parse_h'       => {},
                   '_analysis'      => $analysis,
                   '_gsf_start'     => $contig_start,
                   '_gsf_end'       => $contig_end,
                   '_gsf_strand'    => $contig_strand,
                   #the score column, not the hit start
                   '_gsf_score'     => $score,
                   '_seqname'       => $contig->name,
                   '_percent_id'    => $perc_ident,
                   '_p_value'       => $evalue,
                   '_hstart'        => $hit_start,
                   #NOTE(review): no hit_end column is bound above, so the
                   #hit end is unknown here. It is left undefined rather
                   #than filled with the hit name. To populate it, add
                   #hit_end to _columns() and to the bind list together.
                   '_hend'          => undef,
                   '_hstrand'       => $hit_strand,
                   '_hseqname'      => $hit_name,
                   '_gsf_seq'       => $contig,
                   '_cigar_string'  => $cigar_line,
                   #the 'id' of the feature is the hit name
                   '_id'            => $hit_name,
                   '_database_id'   => $dna_align_feature_id});
  }

  return \@features;
}
1;
......
......@@ -57,7 +57,7 @@ use strict;
# Object preamble - inherits from Bio::EnsEMBL::Root
use Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor;
use Bio::EnsEMBL::DnaPepAlignFeature;
use Bio::EnsEMBL::PepDnaAlignFeature;
use Bio::EnsEMBL::SeqFeature;
@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAlignFeatureAdaptor);
......@@ -122,58 +122,70 @@ sub store{
}
#
# Internal functions not to called be anyone else
#
sub _obj_from_hashref {
my ($self, $hashref) = @_;
my $rca = $self->db()->get_RawContigAdaptor();
my $contig = $rca->fetch_by_dbID($hashref->{'contig_id'});
my $aa = $self->db()->get_AnalysisAdaptor();
my $analysis = $aa->fetch_by_dbID($hashref->{'analysis_id'});
my $f1 = Bio::EnsEMBL::SeqFeature->new();
my $f2 = Bio::EnsEMBL::SeqFeature->new();
$f1->start($hashref->{'contig_start'});
$f1->end($hashref->{'contig_end'});
$f1->strand($hashref->{'contig_strand'});
$f2->start($hashref->{'hit_start'});
$f2->end($hashref->{'hit_end'});
$f2->strand(1);
$f2->seqname($hashref->{'hit_name'});
$f1->score($hashref->{'score'});
$f1->p_value($hashref->{'evalue'});
$f1->percent_id($hashref->{'perc_ident'});
$f2->score($hashref->{'score'});
$f2->p_value($hashref->{'evalue'});
$f2->percent_id($hashref->{'perc_ident'});
=head2 _objs_from_sth
my $cigar = $hashref->{'cigar_line'};
Arg [1] : DBI statement handle $sth
an executed DBI statement handle generated by selecting
the columns specified by _columns() from the table specified
by _tablename()
Example : $pep_dna_align_feats = $self->_objs_from_sth($sth);
Description: PROTECTED implementation of superclass abstract method.
Creates PepDnaAlignFeature objects from an executed DBI statement handle
Returntype : listref of Bio::EnsEMBL::PepDnaAlignFeatures
Exceptions : none
Caller : Bio::EnsEMBL::BaseFeatureAdaptor::generic_fetch
my $dnapep = Bio::EnsEMBL::DnaPepAlignFeature->new(-feature1 => $f1,
-feature2 => $f2,
-cigar_string => $cigar);
=cut
$dnapep->analysis($analysis);
$dnapep->attach_seq($contig);
#set the 'id' of the feature to the hit name
$dnapep->id($hashref->{'hit_name'});
# PROTECTED: turns every row of an executed statement handle into a
# Bio::EnsEMBL::PepDnaAlignFeature and returns a listref of them.
#
# Arg [1]    : DBI statement handle $sth, already executed, selecting the
#              columns returned by _columns() in that exact order
# Returntype : listref of Bio::EnsEMBL::PepDnaAlignFeature
sub _objs_from_sth {
  my ($self, $sth) = @_;

  my $rca = $self->db()->get_RawContigAdaptor();
  my $aa  = $self->db()->get_AnalysisAdaptor();

  my ($protein_align_feature_id, $contig_id, $contig_start, $contig_end,
      $analysis_id, $contig_strand, $hit_start, $hit_end, $hit_name,
      $cigar_line, $evalue, $perc_ident, $score);

  #warning: the bind order must match the column order of _columns();
  #each $sth->fetch call then refreshes these lexicals in place
  $sth->bind_columns(\$protein_align_feature_id, \$contig_id, \$contig_start,
                     \$contig_end, \$analysis_id, \$contig_strand, \$hit_start,
                     \$hit_end, \$hit_name, \$cigar_line, \$evalue,
                     \$perc_ident, \$score);

  my @features = ();
  my ($analysis, $contig);

  while($sth->fetch) {
    $analysis = $aa->fetch_by_dbID($analysis_id);
    $contig   = $rca->fetch_by_dbID($contig_id);

    #use a very fast (hack) constructor - normal object construction is too
    #slow for the number of features we are potentially dealing with
    push @features, Bio::EnsEMBL::PepDnaAlignFeature->new_fast(
                  {'_gsf_tag_hash'  => {},
                   '_gsf_sub_array' => [],
                   '_parse_h'       => {},
                   '_analysis'      => $analysis,
                   '_gsf_start'     => $contig_start,
                   '_gsf_end'       => $contig_end,
                   '_gsf_strand'    => $contig_strand,
                   '_gsf_score'     => $score,
                   '_seqname'       => $contig->name,
                   '_percent_id'    => $perc_ident,
                   '_p_value'       => $evalue,
                   '_hstart'        => $hit_start,
                   '_hend'          => $hit_end,
                   #no hit strand column is selected for protein hits; the
                   #constructor this replaces always set the hit strand to
                   #1, so keep doing that instead of leaving it undefined
                   '_hstrand'       => 1,
                   '_hseqname'      => $hit_name,
                   '_gsf_seq'       => $contig,
                   '_cigar_string'  => $cigar_line,
                   #the 'id' of the feature is the hit name
                   '_id'            => $hit_name,
                   '_database_id'   => $protein_align_feature_id});
  }

  return \@features;
}
sub _tablename {
my $self = shift;
......@@ -182,7 +194,8 @@ sub _tablename {
sub _columns {
my $self = shift;
#warning: the _objs_from_sth method depends on the ordering of this list
return qw( protein_align_feature_id contig_id contig_start contig_end
analysis_id contig_strand hit_start hit_end hit_name cigar_line
evalue perc_ident score );
......
......@@ -79,8 +79,7 @@ sub new {
#Initialize caching data structures
tie(%{$self->{_raw_contig_cache}},
'Bio::EnsEMBL::Utils::Cache',
$RAW_CONTIG_CACHE_SIZE,
{Debug =>0});
$RAW_CONTIG_CACHE_SIZE);
return $self;
}
......
......@@ -144,45 +144,52 @@ sub generic_fetch {
=cut
# Builds one Bio::EnsEMBL::RepeatFeature (with its RepeatConsensus, analysis
# and contig attached) per row of an executed statement handle.
#
# Arg [1]    : DBI statement handle $sth, already executed
# Returntype : listref of Bio::EnsEMBL::RepeatFeature
#
# NOTE(review): this sub is named _obj_from_sth, while
# BaseFeatureAdaptor::generic_fetch calls _objs_from_sth. Confirm that the
# generic_fetch defined in this adaptor calls this name.
sub _obj_from_sth {
  my ($self, $sth) = @_;

  #adaptors are looked up once, outside the row loop
  my $rca = $self->db()->get_RepeatConsensusAdaptor();
  my $ca  = $self->db()->get_RawContigAdaptor();
  my $aa  = $self->db->get_AnalysisAdaptor();

  my @features = ();

  while(my $hashref = $sth->fetchrow_hashref()) {
    #create a repeat consensus object
    my $rc = Bio::EnsEMBL::RepeatConsensus->new;
    $rc->dbID($hashref->{'repeat_consensus_id'});
    $rc->repeat_class($hashref->{'repeat_class'});
    $rc->name($hashref->{'repeat_name'});
    $rc->repeat_consensus($hashref->{'repeat_consensus'});
    $rc->adaptor($rca);

    #get the analysis object for this repeat
    my $analysis = $aa->fetch_by_dbID($hashref->{'analysis_id'});

    #create the new repeat feature
    my $r = Bio::EnsEMBL::RepeatFeature->new;
    $r->dbID($hashref->{'repeat_feature_id'});
    $r->start($hashref->{'contig_start'});
    $r->end($hashref->{'contig_end'});
    $r->score($hashref->{'score'});
    $r->strand( $hashref->{'contig_strand'} );
    $r->hstart( $hashref->{'repeat_start'} );
    $r->hend( $hashref->{'repeat_end'} );

    $r->analysis($analysis);
    $r->repeat_consensus($rc);
    $r->adaptor($self);

    #attach the appropriate contig to this sequence
    my $contig = $ca->fetch_by_dbID($hashref->{'contig_id'});
    $r->attach_seq($contig);

    push @features, $r;
  }

  return \@features;
}
......
......@@ -141,43 +141,49 @@ sub _columns {
}
=head2 _obj_from_hashref
=head2 _objs_from_sth
Arg [1] : hash reference $hashref
Example : none