Commit a610ecc5 authored by Kieron Taylor 😠
Browse files

Added efficiency boost to fetch_all_by_external_name and sub-methods....

Added efficiency boost to fetch_all_by_external_name and sub-methods. Wildcards now escaped in positions 1 and 3 unless user supplies override flag.

POD updated accordingly.
parent faf7a63e
......@@ -1418,6 +1418,7 @@ sub list_gene_ids_by_external_db_id{
Arg [1] : string $external_name
Arg [2] : (optional) string $external_db_name
Arg [3] : Boolean override, see _type_by_external_id
Example : @gene_ids = $dbea->list_gene_ids_by_extids('CDPX');
Description: Retrieve a list of gene ids by an external identifier that is
linked to any of the genes transcripts, translations or the
......@@ -1430,15 +1431,15 @@ sub list_gene_ids_by_external_db_id{
=cut
sub list_gene_ids_by_extids {
my ( $self, $external_name, $external_db_name ) = @_;
my ( $self, $external_name, $external_db_name, $override ) = @_;
my %T = map { ( $_, 1 ) }
$self->_type_by_external_id( $external_name, 'Translation', 'gene',
$external_db_name ),
$external_db_name, $override ),
$self->_type_by_external_id( $external_name, 'Transcript', 'gene',
$external_db_name ),
$external_db_name, $override ),
$self->_type_by_external_id( $external_name, 'Gene', undef,
$external_db_name );
$external_db_name, $override );
return keys %T;
}
......@@ -1448,6 +1449,7 @@ sub list_gene_ids_by_extids {
Arg [1] : string $external_name
Arg [2] : (optional) string $external_db_name
Arg [3] : Boolean override, see _type_by_external_id
Example : @tr_ids = $dbea->list_transcript_ids_by_extids('BCRA2');
Description: Retrieve a list of transcript ids by an external identifier that
is linked to any of the genes transcripts, translations or the
......@@ -1460,14 +1462,14 @@ sub list_gene_ids_by_extids {
=cut
sub list_transcript_ids_by_extids {
my ( $self, $external_name, $external_db_name ) = @_;
my ( $self, $external_name, $external_db_name, $override ) = @_;
my %T = map { ( $_, 1 ) }
$self->_type_by_external_id( $external_name, 'Translation',
'transcript', $external_db_name
'transcript', $external_db_name, $override
),
$self->_type_by_external_id( $external_name, 'Transcript', undef,
$external_db_name );
$external_db_name, $override );
return keys %T;
}
......@@ -1477,6 +1479,7 @@ sub list_transcript_ids_by_extids {
Arg [1] : string $external_name
Arg [2] : (optional) string $external_db_name
Arg [3] : Boolean override, see _type_by_external_id
Example : @tr_ids = $dbea->list_translation_ids_by_extids('GO:0004835');
Description: Gets a list of translation IDs by external display IDs
Returntype : list of Ints
......@@ -1487,11 +1490,11 @@ sub list_transcript_ids_by_extids {
=cut
sub list_translation_ids_by_extids {
my ( $self, $external_name, $external_db_name ) = @_;
my ( $self, $external_name, $external_db_name, $override ) = @_;
return
$self->_type_by_external_id( $external_name, 'Translation', undef,
$external_db_name );
$external_db_name, $override );
}
=head2 _type_by_external_id
......@@ -1500,7 +1503,10 @@ sub list_translation_ids_by_extids {
Arg [2] : string $ensType - ensembl_object_type
Arg [3] : (optional) string $extraType
Arg [4] : (optional) string $external_db_name
other object type to be returned
other object type to be returned
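Arg [5] : Boolean override to force _ to be treated as the SQL
single-character wildcard. By default a _ in the third
character position is escaped for query speed, due to
large numbers of names like NM_00..., and names with %
in the first two characters are rejected.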
Example : $self->_type_by_external_id($name, 'Translation');
NOTE: In a multi-species database, this method will
return all the entries matching the search criteria, not
......@@ -1512,23 +1518,28 @@ sub list_translation_ids_by_extids {
Exceptions : none
Caller : list_translation_ids_by_extids
translationids_by_extids
geneids_by_extids
geneids_by_extids
Status : Stable
=cut
sub _type_by_external_id {
my ( $self, $name, $ensType, $extraType, $external_db_name ) = @_;
my ( $self, $name, $ensType, $extraType, $external_db_name, $override ) = @_;
# $name has SQL wildcard support
# = or LIKE put into SQL statement, and open queries like % or A% are rejected.
my $comparison_operator;
if ($name =~ /[_%\[]/) {
if ($name =~ /[_%\[]/ ) {
$comparison_operator = "LIKE";
if ($name =~ /^.?%/) {
if ($name =~ /^.?%/ && !$override) {
warn "External $ensType name $name is too vague and will monopolise database resources. Please use a more specific $ensType name.\n";
return;
}
elsif ($name =~ /^\w\w_/ && !$override) {
# For entries such as NM_00000065, escape the _ so that SQL LIKE does not have to scan entire table
# Escape only the _ in the third character position
$name =~ s/(?<=\w\w)(?=_)/\\/;
}
}
else {
$comparison_operator = "=";
......
......@@ -800,6 +800,8 @@ sub fetch_by_translation_stable_id {
Arg [2] : (optional) String $external_db_name
The name of the external database from which the
identifier originates.
Arg [3] : Boolean override. Force SQL wildcard (LIKE) matching for
users who really do want to find all 'NM%'
Example : @genes = @{$gene_adaptor->fetch_all_by_external_name('BRCA2')}
@many_genes = @{$gene_adaptor->fetch_all_by_external_name('BRCA%')}
Description: Retrieves a list of genes with an external database
......@@ -808,7 +810,10 @@ sub fetch_by_translation_stable_id {
system they are stored in the database in. If another
coordinate system is required then the Gene::transfer or
Gene::transform method can be used.
SQL wildcards % and _ are supported in the $external_name
SQL wildcards % and _ are supported in the $external_name,
but their use is somewhat restricted for performance reasons.
Users who really do want % and _ treated as wildcards in the
first three characters should set argument 3 to disable these
optimisations.
Returntype : listref of Bio::EnsEMBL::Gene
Exceptions : none
Caller : goview, general
......@@ -817,13 +822,13 @@ sub fetch_by_translation_stable_id {
=cut
sub fetch_all_by_external_name {
my ( $self, $external_name, $external_db_name ) = @_;
my ( $self, $external_name, $external_db_name, $override ) = @_;
my $entryAdaptor = $self->db->get_DBEntryAdaptor();
my @ids =
$entryAdaptor->list_gene_ids_by_extids( $external_name,
$external_db_name );
$external_db_name, $override );
my %genes_by_dbIDs =
map { $_->dbID(), $_ } @{ $self->fetch_all_by_dbID_list( \@ids ) };
......
......@@ -482,6 +482,8 @@ sub fetch_all_by_Slice {
Arg [2] : (optional) String $external_db_name
The name of the external database from which the
identifier originates.
Arg [3] : Boolean override. Force SQL wildcard (LIKE) matching for
users who really do want to find all 'NM%'
Example : my @transcripts =
@{ $tr_adaptor->fetch_all_by_external_name( 'NP_065811.1') };
my @more_transcripts =
......@@ -498,6 +500,9 @@ sub fetch_all_by_Slice {
If no transcripts with the external identifier are found,
a reference to an empty list is returned.
SQL wildcards % and _ are supported in the $external_name
but their use is somewhat restricted for performance reasons.
Users who really do want % and _ treated as wildcards in the
first three characters should set argument 3 to disable these
optimisations.
Returntype : listref of Bio::EnsEMBL::Transcript
Exceptions : none
Caller : general
......@@ -506,13 +511,13 @@ sub fetch_all_by_Slice {
=cut
sub fetch_all_by_external_name {
my ( $self, $external_name, $external_db_name ) = @_;
my ( $self, $external_name, $external_db_name, $override) = @_;
my $entryAdaptor = $self->db->get_DBEntryAdaptor();
my @ids =
$entryAdaptor->list_transcript_ids_by_extids( $external_name,
$external_db_name );
$external_db_name, $override );
return $self->fetch_all_by_dbID_list( \@ids );
}
......
......@@ -278,6 +278,8 @@ sub fetch_by_Transcript {
Arg [2] : (optional) string $external_db_name
The name of the external database from which the
identifier originates.
Arg [3] : Boolean override. Force SQL wildcard (LIKE) matching for
users who really do want to find all 'NM%'
Example : my @translations =
@{ $trl_adaptor->fetch_all_by_external_name('BRCA2') };
my @many_translations =
......@@ -289,6 +291,9 @@ sub fetch_by_Transcript {
their transcript. It may be better to use the
TranscriptAdaptor::fetch_all_by_external_name instead.
SQL wildcards % and _ are supported in the $external_name
but their use is somewhat restricted for performance reasons.
Users who really do want % and _ treated as wildcards in the
first three characters should set argument 3 to disable these
optimisations.
Returntype : reference to a list of Translations
Exceptions : none
Caller : general
......@@ -299,13 +304,12 @@ sub fetch_by_Transcript {
=cut
sub fetch_all_by_external_name {
my ( $self, $external_name, $external_db_name ) = @_;
my ( $self, $external_name, $external_db_name, $override ) = @_;
my $entry_adaptor = $self->db->get_DBEntryAdaptor();
my @ids =
$entry_adaptor->list_translation_ids_by_extids( $external_name,
$external_db_name );
my @ids = $entry_adaptor->list_translation_ids_by_extids(
$external_name, $external_db_name, $override );
my $transcript_adaptor = $self->db()->get_TranscriptAdaptor();
......
......@@ -447,6 +447,21 @@ ok(($genes[0]->stable_id() eq 'ENSG00000174873') ||
debug($gene->stable_id);
ok($gene->stable_id() eq 'ENSG00000101367');
#
# test fetch_all_by_external_name with wildcard restrictions
#
(@genes) = @{ $ga->fetch_all_by_external_name('AF_%')};
# Should = 0 because _ is auto-escaped.
debug('Genes found under external_name AF_%: '.scalar(@genes));
ok(scalar(@genes) == 0);
(@genes) = @{ $ga->fetch_all_by_external_name('AF_%',undef,'override')};
debug('Genes found under external_name AF_% with override on: '.scalar(@genes));
debug($genes[0]->stable_id());
debug($genes[1]->stable_id());
debug($genes[2]->stable_id());
debug($genes[3]->stable_id());
# Note that 9 AF_% xrefs correspond to 4 unique ensembl IDs.
ok(scalar(@genes) == 4);
#
# test fetch_all_by_external_name with wildcard matching
#
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment