From 3c8ea973f5656f19cc49ef6b13fb3e26355bc95c Mon Sep 17 00:00:00 2001 From: Kieron Taylor <ktaylor@ebi.ac.uk> Date: Thu, 4 Oct 2012 14:38:39 +0000 Subject: [PATCH] Revised fetch_all_by_Slice to allow for a proper feature count method that includes overlaps and clones. Response to lack of performance in generic_count for Variation (request from Laurent Gil) --- .../Bio/EnsEMBL/DBSQL/BaseFeatureAdaptor.pm | 370 +++++++++--------- 1 file changed, 185 insertions(+), 185 deletions(-) diff --git a/modules/Bio/EnsEMBL/DBSQL/BaseFeatureAdaptor.pm b/modules/Bio/EnsEMBL/DBSQL/BaseFeatureAdaptor.pm index 6d5b9d080b..fcaf131b15 100644 --- a/modules/Bio/EnsEMBL/DBSQL/BaseFeatureAdaptor.pm +++ b/modules/Bio/EnsEMBL/DBSQL/BaseFeatureAdaptor.pm @@ -590,200 +590,200 @@ sub _create_feature_fast { return $feature_type->new_fast($args); } -# -# helper function used by fetch_all_by_Slice_constraint method -# -sub _slice_fetch { - my ( $self, $slice, $orig_constraint ) = @_; +=head2 count_by_Slice - my $slice_start = $slice->start(); - my $slice_end = $slice->end(); - my $slice_strand = $slice->strand(); - my $slice_cs = $slice->coord_system(); - my $slice_seq_region = $slice->seq_region_name(); - my $slice_seq_region_id = $slice->get_seq_region_id(); + Arg [0] : Bio::EnsEMBL::Slice + Arg [1] : Custom SQL constraint + Description : Finds all features with at least partial overlap to the given + slice and sums them up + Returntype : Integer +=cut - #get the synonym and name of the primary_table - my @tabs = $self->_tables; - my ( $tab_name, $tab_syn ) = @{ $tabs[0] }; - - #find out what coordinate systems the features are in - my $mcc = $self->db->get_MetaCoordContainer(); - my @feat_css = (); - - my $mca = $self->db->get_MetaContainer(); - my $value_list = $mca->list_value_by_key( $tab_name . "build.level" ); - if ( @$value_list and $slice->is_toplevel() ) { - push @feat_css, $slice_cs; - } else { - @feat_css = - @{ $mcc->fetch_all_CoordSystems_by_feature_type($tab_name) }; - } +sub count_by_Slice { + my $self = shift; + my ($slice, $constraint) = @_; + my $count = 0; + + my $count_array = $self->_get_by_Slice($slice,$constraint,'count'); + foreach (@$count_array) { print "$_,";} + $count += $_ for @$count_array; + return $count; +} - my $asma = $self->db->get_AssemblyMapperAdaptor(); - my @features; +=head2 _get_by_Slice + Arg [0] : Bio::EnsEMBL::Slice to find all the features within + Arg [1] : SQL constraint string + Arg [2] : Type of query to run. Default behaviour is to select, but + 'count' is also valid + Description: Abstracted logic from _slice_fetch + Returntype : Listref of Bio::EnsEMBL::Feature, or integers for counting mode +=cut - # fetch the features from each coordinate system they are stored in -COORD_SYSTEM: foreach my $feat_cs (@feat_css) { +sub _get_by_Slice { + my $self = shift; + my ($slice, $orig_constraint, $query_type) = @_; + + # features can be scattered across multiple coordinate systems + my @tables = $self->_tables; + my ($table_name, $table_synonym) = @{ $tables[0] }; my $mapper; - my @coords; - my @ids; - - if ( $feat_cs->equals($slice_cs) ) { - # no mapping is required if this is the same coord system - - my $max_len = $self->_max_feature_length() - || $mcc->fetch_max_length_by_CoordSystem_feature_type( $feat_cs, - $tab_name ); - - my $constraint = $orig_constraint; - - my $sr_id; - if ( $slice->adaptor() ) { - $sr_id = $slice->adaptor()->get_seq_region_id($slice); - } else { - $sr_id = - $self->db()->get_SliceAdaptor()->get_seq_region_id($slice); - } - - # If there is mapping information, use the external_seq_region_id - # to get features. - - my @sr_ids = ($sr_id); - - while (1) { - my $ext_sr_id = $self->get_seq_region_id_external($sr_id); - - if ( $ext_sr_id == $sr_id ) { last } - - push( @sr_ids, $ext_sr_id ); - $sr_id = $ext_sr_id; - } - - $constraint .= " AND " if ($constraint); - - - $constraint .= "${tab_syn}.seq_region_id IN (" - . join( ',', @sr_ids ) . ") AND"; - - #faster query for 1bp slices where SNP data is not compressed - if ( $self->start_equals_end && $slice_start == $slice_end ) { - $constraint .= - " AND ${tab_syn}.seq_region_start = $slice_end" . - " AND ${tab_syn}.seq_region_end = $slice_start"; - - } else { - - if ( !$slice->is_circular() ) { - # Deal with the default case of a non-circular chromosome. - $constraint .= " ${tab_syn}.seq_region_start <= $slice_end AND " - . "${tab_syn}.seq_region_end >= $slice_start"; - - if ( $max_len ) { - my $min_start = $slice_start - $max_len; - $constraint .= " AND ${tab_syn}.seq_region_start >= $min_start"; - } - - } else { - # Deal with the case of a circular chromosome. - if ( $slice_start > $slice_end ) { - $constraint .= " ( ${tab_syn}.seq_region_start >= $slice_start " - . "OR ${tab_syn}.seq_region_start <= $slice_end " - . "OR ${tab_syn}.seq_region_end >= $slice_start " - . "OR ${tab_syn}.seq_region_end <= $slice_end " - . "OR ${tab_syn}.seq_region_start > ${tab_syn}.seq_region_end)"; - - } else { - $constraint .= " ((${tab_syn}.seq_region_start <= $slice_end " - . "AND ${tab_syn}.seq_region_end >= $slice_start) " - . "OR (${tab_syn}.seq_region_start > ${tab_syn}.seq_region_end " - . "AND (${tab_syn}.seq_region_start <= $slice_end " - . "OR ${tab_syn}.seq_region_end >= $slice_start)))"; - } - } - - } - - my $fs = $self->generic_fetch( $constraint, undef, $slice ); - - # features may still have to have coordinates made relative to slice - # start - $fs = $self->_remap( $fs, $mapper, $slice ); - - push @features, @$fs; + my @feature_coord_systems; + + my $meta_values = $self->db->get_MetaContainer->list_value_by_key( $table_name."build.level"); + if ( @$meta_values and $slice->is_toplevel() ) { + push @feature_coord_systems, $slice->coord_system(); } else { - $mapper = $asma->fetch_by_CoordSystems( $slice_cs, $feat_cs ); - - next unless defined $mapper; - - # Get list of coordinates and corresponding internal ids for - # regions the slice spans - @coords = - $mapper->map( $slice_seq_region, $slice_start, $slice_end, - $slice_strand, $slice_cs ); - - @coords = grep { !$_->isa('Bio::EnsEMBL::Mapper::Gap') } @coords; - - next COORD_SYSTEM if ( !@coords ); - - @ids = map { $_->id() } @coords; - #coords are now id rather than name - # @ids = @{$asma->seq_regions_to_ids($feat_cs, \@ids)}; - - # When regions are large and only partially spanned - # by slice it is faster to to limit the query with - # start and end constraints. Take simple approach: - # use regional constraints if there are less than a - # specific number of regions covered. - - if ( @coords > $MAX_SPLIT_QUERY_SEQ_REGIONS && ! $slice->isa('Bio::EnsEMBL::LRGSlice') && $slice_cs->name() ne 'lrg') { + @feature_coord_systems = @{ $self->db->get_MetaCoordContainer->fetch_all_CoordSystems_by_feature_type($table_name)}; + } + + my $assembly_mapper_adaptor = $self->db->get_AssemblyMapperAdaptor(); + my @pan_coord_features; + + +COORD_SYSTEM: foreach my $coord_system (@feature_coord_systems) { + my @query_accumulator; + # Build up a combination of query constraints that will quickly establish the result set my $constraint = $orig_constraint; - my $id_str = join( ',', @ids ); - $constraint .= " AND " if ($constraint); - $constraint .= "${tab_syn}.seq_region_id IN ($id_str)"; - my $fs = $self->generic_fetch( $constraint, $mapper, $slice ); - - $fs = $self->_remap( $fs, $mapper, $slice ); - - push @features, @$fs; - - } else { - # do multiple split queries using start / end constraints - - my $max_len = ( - $self->_max_feature_length() - || $mcc->fetch_max_length_by_CoordSystem_feature_type( - $feat_cs, $tab_name - ) ); - - my $len = @coords; - for ( my $i = 0; $i < $len; $i++ ) { - my $constraint = $orig_constraint; - $constraint .= " AND " if ($constraint); - $constraint .= - "${tab_syn}.seq_region_id = " - . $ids[$i] . " AND " - . "${tab_syn}.seq_region_start <= " - . $coords[$i]->end() . " AND " - . "${tab_syn}.seq_region_end >= " - . $coords[$i]->start(); - - if ($max_len) { - my $min_start = $coords[$i]->start() - $max_len; - $constraint .= - " AND ${tab_syn}.seq_region_start >= $min_start"; - } - my $fs = $self->generic_fetch( $constraint, $mapper, $slice ); - - $fs = $self->_remap( $fs, $mapper, $slice ); + if ( $coord_system->equals( $slice->coord_system ) ) { + my $max_len = $self->_max_feature_length + || $self->db->get_MetaCoordContainer + ->fetch_max_length_by_CoordSystem_feature_type( $coord_system,$table_name ); + + my $seq_region_id; + if ( $slice->adaptor ) { + $seq_region_id = $slice->adaptor->get_seq_region_id($slice); + } else { + $seq_region_id = $self->db->get_SliceAdaptor->get_seq_region_id($slice); + } + + my @seq_region_ids = ($seq_region_id); + while (1) { + my $ext_seq_region_id = $self->get_seq_region_id_external($seq_region_id); + + if ( $ext_seq_region_id == $seq_region_id ) { last } + + push( @seq_region_ids, $ext_seq_region_id ); + $seq_region_id = $ext_seq_region_id; + } + + $constraint .= " AND " if ($constraint); + + $constraint .= ${table_synonym}.".seq_region_id IN (". join( ',', @seq_region_ids ) . ") AND "; + + #faster query for 1bp slices where SNP data is not compressed + if ( $self->start_equals_end && $slice->start == $slice->end ) { + $constraint .= " AND ".$table_synonym.".seq_region_start = ".$slice->end . + " AND ".$table_synonym.".seq_region_end = ".$slice->start; + + } else { + if ( !$slice->is_circular() ) { + # Deal with the default case of a non-circular chromosome. + $constraint .= $table_synonym.".seq_region_start <= ".$slice->end." AND " + .$table_synonym.".seq_region_end >= ".$slice->start; + + if ( $max_len ) { + my $min_start = $slice->start - $max_len; + $constraint .= " AND ".$table_synonym.".seq_region_start >= ".$min_start; + } + + } else { + # Deal with the case of a circular chromosome. + if ( $slice->start > $slice->end ) { + $constraint .= " ( ".$table_synonym.".seq_region_start >= ".$slice->start + . " OR ".$table_synonym.".seq_region_start <= ".$slice->end + . " OR ".$table_synonym.".seq_region_end >= ".$slice->start + . " OR ".$table_synonym.".seq_region_end <= ".$slice->end + . " OR ".$table_synonym.".seq_region_start > ".$table_synonym.".seq_region_end"; + } else { + $constraint .= " ((".$table_synonym.".seq_region_start <= ".$slice->end + . " AND ".$table_synonym.".seq_region_end >= ".$slice->start.") " + . "OR (".$table_synonym.".seq_region_start > ".$table_synonym.".seq_region_end" + . " AND (".$table_synonym.".seq_region_start <= ".$slice->end + . " OR ".$table_synonym.".seq_region_end >= ".$slice->start.")))"; + } + } + } + + push @query_accumulator, [$constraint,undef,$slice]; # $mapper intentionally absent here. + + } else { #Ćcoordinate systems do not match + $mapper = $assembly_mapper_adaptor->fetch_by_CoordSystems( $slice->coord_system(), $coord_system ); + next unless defined $mapper; + + # Get list of coordinates and corresponding internal ids for + # regions the slice spans + my @coords = $mapper->map( $slice->seq_region_name, $slice->start, $slice->end, + $slice->strand, $slice->coord_system ); + + @coords = grep { !$_->isa('Bio::EnsEMBL::Mapper::Gap') } @coords; + + next COORD_SYSTEM if ( !@coords ); + + my @ids = map { $_->id() } @coords; + #coords are now id rather than name + + if ( @coords > $MAX_SPLIT_QUERY_SEQ_REGIONS && ! $slice->isa('Bio::EnsEMBL::LRGSlice') + && $slice->coord_system->name() ne 'lrg') { + $constraint = $orig_constraint; + my $id_str = join( ',', @ids ); + $constraint .= " AND " if ($constraint); + $constraint .= $table_synonym.".seq_region_id IN ($id_str)"; + + push @query_accumulator, [$constraint,$mapper,$slice]; + } else { + my $max_len = ( + $self->_max_feature_length() + || $self->db->get_MetaCoordContainer + ->fetch_max_length_by_CoordSystem_feature_type($coord_system, $table_name) + ); + + my $length = @coords; + for ( my $i = 0; $i < $length; $i++ ) { + $constraint = $orig_constraint; + $constraint .= " AND " if ($constraint); + $constraint .= $table_synonym.".seq_region_id = " + . $ids[$i] . " AND " + . $table_synonym.".seq_region_start <= " + . $coords[$i]->end() . " AND " + . $table_synonym.".seq_region_end >= " + . $coords[$i]->start(); + + if ($max_len) { + my $min_start = $coords[$i]->start() - $max_len; + $constraint .= " AND ".$table_synonym.".seq_region_start >= ".$min_start; + } + + push @query_accumulator, [$constraint,$mapper,$slice]; + } # end multi-query cycle + } # end else + + } # end else (coord sytems not matching) + + foreach my $query (@query_accumulator) { + my ($local_constraint,$local_mapper,$local_slice) = @$query; + if ($query_type and $query_type eq 'count') { + push @pan_coord_features, $self->generic_count($local_constraint); + } else { + + my $features = $self->generic_fetch( $local_constraint, $local_mapper, $local_slice ); + $features = $self->_remap( $features, $local_mapper, $local_slice ); + push @pan_coord_features, @$features; + } + } + + $mapper = undef; + } # End foreach + return \@pan_coord_features; +} +# +# helper function used by fetch_all_by_Slice_constraint method +# +sub _slice_fetch { + my ( $self, $slice, $orig_constraint ) = @_; - push @features, @$fs; - } - } ## end else [ if ( @coords > $MAX_SPLIT_QUERY_SEQ_REGIONS && ! $slice->isa('Bio::EnsEMBL::LRGSlice') && $slice_cs->name() ne 'lrg')] - } ## end else [ if ( $feat_cs->equals(...))] - } ## end foreach my $feat_cs (@feat_css) + my $features = $self->_get_by_Slice($slice,$orig_constraint); - return \@features; + return $features; } ## end sub _slice_fetch -- GitLab