Skip to content
Snippets Groups Projects
Commit 3c8ea973 authored by Kieron Taylor's avatar Kieron Taylor :angry:
Browse files

Revised fetch_all_by_Slice to allow for a proper feature count method that...

Revised fetch_all_by_Slice to allow for a proper feature count method that includes overlaps and clones. Response to lack of performance in generic_count for Variation (request from Laurent Gil)
parent 381d9ff8
No related branches found
No related tags found
No related merge requests found
...@@ -590,200 +590,200 @@ sub _create_feature_fast { ...@@ -590,200 +590,200 @@ sub _create_feature_fast {
return $feature_type->new_fast($args); return $feature_type->new_fast($args);
} }
# =head2 count_by_Slice
# helper function used by fetch_all_by_Slice_constraint method
#
sub _slice_fetch {
my ( $self, $slice, $orig_constraint ) = @_;
my $slice_start = $slice->start(); Arg [0] : Bio::EnsEMBL::Slice
my $slice_end = $slice->end(); Arg [1] : Custom SQL constraint
my $slice_strand = $slice->strand(); Description : Finds all features with at least partial overlap to the given
my $slice_cs = $slice->coord_system(); slice and sums them up
my $slice_seq_region = $slice->seq_region_name(); Returntype : Integer
my $slice_seq_region_id = $slice->get_seq_region_id(); =cut
#get the synonym and name of the primary_table sub count_by_Slice {
my @tabs = $self->_tables; my $self = shift;
my ( $tab_name, $tab_syn ) = @{ $tabs[0] }; my ($slice, $constraint) = @_;
my $count = 0;
#find out what coordinate systems the features are in
my $mcc = $self->db->get_MetaCoordContainer(); my $count_array = $self->_get_by_Slice($slice,$constraint,'count');
my @feat_css = (); foreach (@$count_array) { print "$_,";}
$count += $_ for @$count_array;
my $mca = $self->db->get_MetaContainer(); return $count;
my $value_list = $mca->list_value_by_key( $tab_name . "build.level" ); }
if ( @$value_list and $slice->is_toplevel() ) {
push @feat_css, $slice_cs;
} else {
@feat_css =
@{ $mcc->fetch_all_CoordSystems_by_feature_type($tab_name) };
}
my $asma = $self->db->get_AssemblyMapperAdaptor(); =head2 _get_by_Slice
my @features; Arg [0] : Bio::EnsEMBL::Slice to find all the features within
Arg [1] : SQL constraint string
Arg [2] : Type of query to run. Default behaviour is to select, but
'count' is also valid
Description: Abstracted logic from _slice_fetch
Returntype : Listref of Bio::EnsEMBL::Feature, or integers for counting mode
=cut
# fetch the features from each coordinate system they are stored in sub _get_by_Slice {
COORD_SYSTEM: foreach my $feat_cs (@feat_css) { my $self = shift;
my ($slice, $orig_constraint, $query_type) = @_;
# features can be scattered across multiple coordinate systems
my @tables = $self->_tables;
my ($table_name, $table_synonym) = @{ $tables[0] };
my $mapper; my $mapper;
my @coords; my @feature_coord_systems;
my @ids;
my $meta_values = $self->db->get_MetaContainer->list_value_by_key( $table_name."build.level");
if ( $feat_cs->equals($slice_cs) ) { if ( @$meta_values and $slice->is_toplevel() ) {
# no mapping is required if this is the same coord system push @feature_coord_systems, $slice->coord_system();
my $max_len = $self->_max_feature_length()
|| $mcc->fetch_max_length_by_CoordSystem_feature_type( $feat_cs,
$tab_name );
my $constraint = $orig_constraint;
my $sr_id;
if ( $slice->adaptor() ) {
$sr_id = $slice->adaptor()->get_seq_region_id($slice);
} else {
$sr_id =
$self->db()->get_SliceAdaptor()->get_seq_region_id($slice);
}
# If there is mapping information, use the external_seq_region_id
# to get features.
my @sr_ids = ($sr_id);
while (1) {
my $ext_sr_id = $self->get_seq_region_id_external($sr_id);
if ( $ext_sr_id == $sr_id ) { last }
push( @sr_ids, $ext_sr_id );
$sr_id = $ext_sr_id;
}
$constraint .= " AND " if ($constraint);
$constraint .= "${tab_syn}.seq_region_id IN ("
. join( ',', @sr_ids ) . ") AND";
#faster query for 1bp slices where SNP data is not compressed
if ( $self->start_equals_end && $slice_start == $slice_end ) {
$constraint .=
" AND ${tab_syn}.seq_region_start = $slice_end" .
" AND ${tab_syn}.seq_region_end = $slice_start";
} else {
if ( !$slice->is_circular() ) {
# Deal with the default case of a non-circular chromosome.
$constraint .= " ${tab_syn}.seq_region_start <= $slice_end AND "
. "${tab_syn}.seq_region_end >= $slice_start";
if ( $max_len ) {
my $min_start = $slice_start - $max_len;
$constraint .= " AND ${tab_syn}.seq_region_start >= $min_start";
}
} else {
# Deal with the case of a circular chromosome.
if ( $slice_start > $slice_end ) {
$constraint .= " ( ${tab_syn}.seq_region_start >= $slice_start "
. "OR ${tab_syn}.seq_region_start <= $slice_end "
. "OR ${tab_syn}.seq_region_end >= $slice_start "
. "OR ${tab_syn}.seq_region_end <= $slice_end "
. "OR ${tab_syn}.seq_region_start > ${tab_syn}.seq_region_end)";
} else {
$constraint .= " ((${tab_syn}.seq_region_start <= $slice_end "
. "AND ${tab_syn}.seq_region_end >= $slice_start) "
. "OR (${tab_syn}.seq_region_start > ${tab_syn}.seq_region_end "
. "AND (${tab_syn}.seq_region_start <= $slice_end "
. "OR ${tab_syn}.seq_region_end >= $slice_start)))";
}
}
}
my $fs = $self->generic_fetch( $constraint, undef, $slice );
# features may still have to have coordinates made relative to slice
# start
$fs = $self->_remap( $fs, $mapper, $slice );
push @features, @$fs;
} else { } else {
$mapper = $asma->fetch_by_CoordSystems( $slice_cs, $feat_cs ); @feature_coord_systems = @{ $self->db->get_MetaCoordContainer->fetch_all_CoordSystems_by_feature_type($table_name)};
}
next unless defined $mapper;
my $assembly_mapper_adaptor = $self->db->get_AssemblyMapperAdaptor();
# Get list of coordinates and corresponding internal ids for my @pan_coord_features;
# regions the slice spans
@coords =
$mapper->map( $slice_seq_region, $slice_start, $slice_end, COORD_SYSTEM: foreach my $coord_system (@feature_coord_systems) {
$slice_strand, $slice_cs ); my @query_accumulator;
# Build up a combination of query constraints that will quickly establish the result set
@coords = grep { !$_->isa('Bio::EnsEMBL::Mapper::Gap') } @coords;
next COORD_SYSTEM if ( !@coords );
@ids = map { $_->id() } @coords;
#coords are now id rather than name
# @ids = @{$asma->seq_regions_to_ids($feat_cs, \@ids)};
# When regions are large and only partially spanned
# by slice it is faster to to limit the query with
# start and end constraints. Take simple approach:
# use regional constraints if there are less than a
# specific number of regions covered.
if ( @coords > $MAX_SPLIT_QUERY_SEQ_REGIONS && ! $slice->isa('Bio::EnsEMBL::LRGSlice') && $slice_cs->name() ne 'lrg') {
my $constraint = $orig_constraint; my $constraint = $orig_constraint;
my $id_str = join( ',', @ids ); if ( $coord_system->equals( $slice->coord_system ) ) {
$constraint .= " AND " if ($constraint); my $max_len = $self->_max_feature_length
$constraint .= "${tab_syn}.seq_region_id IN ($id_str)"; || $self->db->get_MetaCoordContainer
my $fs = $self->generic_fetch( $constraint, $mapper, $slice ); ->fetch_max_length_by_CoordSystem_feature_type( $coord_system,$table_name );
$fs = $self->_remap( $fs, $mapper, $slice ); my $seq_region_id;
if ( $slice->adaptor ) {
push @features, @$fs; $seq_region_id = $slice->adaptor->get_seq_region_id($slice);
} else {
} else { $seq_region_id = $self->db->get_SliceAdaptor->get_seq_region_id($slice);
# do multiple split queries using start / end constraints }
my $max_len = ( my @seq_region_ids = ($seq_region_id);
$self->_max_feature_length() while (1) {
|| $mcc->fetch_max_length_by_CoordSystem_feature_type( my $ext_seq_region_id = $self->get_seq_region_id_external($seq_region_id);
$feat_cs, $tab_name
) ); if ( $ext_seq_region_id == $seq_region_id ) { last }
my $len = @coords; push( @seq_region_ids, $ext_seq_region_id );
for ( my $i = 0; $i < $len; $i++ ) { $seq_region_id = $ext_seq_region_id;
my $constraint = $orig_constraint; }
$constraint .= " AND " if ($constraint);
$constraint .= $constraint .= " AND " if ($constraint);
"${tab_syn}.seq_region_id = "
. $ids[$i] . " AND " $constraint .= ${table_synonym}.".seq_region_id IN (". join( ',', @seq_region_ids ) . ") AND ";
. "${tab_syn}.seq_region_start <= "
. $coords[$i]->end() . " AND " #faster query for 1bp slices where SNP data is not compressed
. "${tab_syn}.seq_region_end >= " if ( $self->start_equals_end && $slice->start == $slice->end ) {
. $coords[$i]->start(); $constraint .= " AND ".$table_synonym.".seq_region_start = ".$slice->end .
" AND ".$table_synonym.".seq_region_end = ".$slice->start;
if ($max_len) {
my $min_start = $coords[$i]->start() - $max_len; } else {
$constraint .= if ( !$slice->is_circular() ) {
" AND ${tab_syn}.seq_region_start >= $min_start"; # Deal with the default case of a non-circular chromosome.
} $constraint .= $table_synonym.".seq_region_start <= ".$slice->end." AND "
my $fs = $self->generic_fetch( $constraint, $mapper, $slice ); .$table_synonym.".seq_region_end >= ".$slice->start;
$fs = $self->_remap( $fs, $mapper, $slice ); if ( $max_len ) {
my $min_start = $slice->start - $max_len;
$constraint .= " AND ".$table_synonym.".seq_region_start >= ".$min_start;
}
} else {
# Deal with the case of a circular chromosome.
if ( $slice->start > $slice->end ) {
$constraint .= " ( ".$table_synonym.".seq_region_start >= ".$slice->start
. " OR ".$table_synonym.".seq_region_start <= ".$slice->end
. " OR ".$table_synonym.".seq_region_end >= ".$slice->start
. " OR ".$table_synonym.".seq_region_end <= ".$slice->end
. " OR ".$table_synonym.".seq_region_start > ".$table_synonym.".seq_region_end";
} else {
$constraint .= " ((".$table_synonym.".seq_region_start <= ".$slice->end
. " AND ".$table_synonym.".seq_region_end >= ".$slice->start.") "
. "OR (".$table_synonym.".seq_region_start > ".$table_synonym.".seq_region_end"
. " AND (".$table_synonym.".seq_region_start <= ".$slice->end
. " OR ".$table_synonym.".seq_region_end >= ".$slice->start.")))";
}
}
}
push @query_accumulator, [$constraint,undef,$slice]; # $mapper intentionally absent here.
} else { #coordinate systems do not match
$mapper = $assembly_mapper_adaptor->fetch_by_CoordSystems( $slice->coord_system(), $coord_system );
next unless defined $mapper;
# Get list of coordinates and corresponding internal ids for
# regions the slice spans
my @coords = $mapper->map( $slice->seq_region_name, $slice->start, $slice->end,
$slice->strand, $slice->coord_system );
@coords = grep { !$_->isa('Bio::EnsEMBL::Mapper::Gap') } @coords;
next COORD_SYSTEM if ( !@coords );
my @ids = map { $_->id() } @coords;
#coords are now id rather than name
if ( @coords > $MAX_SPLIT_QUERY_SEQ_REGIONS && ! $slice->isa('Bio::EnsEMBL::LRGSlice')
&& $slice->coord_system->name() ne 'lrg') {
$constraint = $orig_constraint;
my $id_str = join( ',', @ids );
$constraint .= " AND " if ($constraint);
$constraint .= $table_synonym.".seq_region_id IN ($id_str)";
push @query_accumulator, [$constraint,$mapper,$slice];
} else {
my $max_len = (
$self->_max_feature_length()
|| $self->db->get_MetaCoordContainer
->fetch_max_length_by_CoordSystem_feature_type($coord_system, $table_name)
);
my $length = @coords;
for ( my $i = 0; $i < $length; $i++ ) {
$constraint = $orig_constraint;
$constraint .= " AND " if ($constraint);
$constraint .= $table_synonym.".seq_region_id = "
. $ids[$i] . " AND "
. $table_synonym.".seq_region_start <= "
. $coords[$i]->end() . " AND "
. $table_synonym.".seq_region_end >= "
. $coords[$i]->start();
if ($max_len) {
my $min_start = $coords[$i]->start() - $max_len;
$constraint .= " AND ".$table_synonym.".seq_region_start >= ".$min_start;
}
push @query_accumulator, [$constraint,$mapper,$slice];
} # end multi-query cycle
} # end else
} # end else (coord sytems not matching)
foreach my $query (@query_accumulator) {
my ($local_constraint,$local_mapper,$local_slice) = @$query;
if ($query_type and $query_type eq 'count') {
push @pan_coord_features, $self->generic_count($local_constraint);
} else {
my $features = $self->generic_fetch( $local_constraint, $local_mapper, $local_slice );
$features = $self->_remap( $features, $local_mapper, $local_slice );
push @pan_coord_features, @$features;
}
}
$mapper = undef;
} # End foreach
return \@pan_coord_features;
}
#
# helper function used by fetch_all_by_Slice_constraint method
#
sub _slice_fetch {
my ( $self, $slice, $orig_constraint ) = @_;
push @features, @$fs; my $features = $self->_get_by_Slice($slice,$orig_constraint);
}
} ## end else [ if ( @coords > $MAX_SPLIT_QUERY_SEQ_REGIONS && ! $slice->isa('Bio::EnsEMBL::LRGSlice') && $slice_cs->name() ne 'lrg')]
} ## end else [ if ( $feat_cs->equals(...))]
} ## end foreach my $feat_cs (@feat_css)
return \@features; return $features;
} ## end sub _slice_fetch } ## end sub _slice_fetch
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment