From 805432c6909e85f83845641d87ecd7c84c210b9e Mon Sep 17 00:00:00 2001 From: Magali Ruffier <mr6@ebi.ac.uk> Date: Wed, 5 Sep 2018 16:09:25 +0100 Subject: [PATCH] ENSCORESW-2850: optimised for single species run --- .../xref_mapping/XrefParser/ProcessData.pm | 49 +++++++++---------- misc-scripts/xref_mapping/xref_parser.pl | 14 ++---- 2 files changed, 28 insertions(+), 35 deletions(-) diff --git a/misc-scripts/xref_mapping/XrefParser/ProcessData.pm b/misc-scripts/xref_mapping/XrefParser/ProcessData.pm index b38ad24eba..79a7fa50d1 100644 --- a/misc-scripts/xref_mapping/XrefParser/ProcessData.pm +++ b/misc-scripts/xref_mapping/XrefParser/ProcessData.pm @@ -55,7 +55,7 @@ sub run { my $unzip = $ref_arg->{unzip}; my $stats = $ref_arg->{stats}; my $cleanup = $ref_arg->{cleanup}; - my $rspecies = $ref_arg->{speciesr}; + my $species = $ref_arg->{species}; my $taxon_id = $ref_arg->{taxon}; my $division = $ref_arg->{division}; my $sources = $ref_arg->{sourcesr}; @@ -101,21 +101,23 @@ DSS my $dep_sth = $dbi->prepare($sql); # validate species names - if (defined $division) { push @$rspecies, $division; } - my @species_ids = $self->validate_species($rspecies, $verbose); - if (defined $taxon_id) { push @species_ids, $taxon_id; } + my $species_id = $self->validate_species($species, $verbose); + my $division_id = $self->validate_species($division, $verbose); + my @species_sources = ($species_id); + push @species_sources, $division_id if defined $division_id; + push @species_sources, $taxon_id if defined $taxon_id; # validate source names - exit(1) if ( !$self->validate_sources(\@species_ids,$sources, $verbose) ); - exit(1) if ( !$self->validate_sources(\@species_ids,$notsources, $verbose) ); + exit(1) if ( !$self->validate_sources(\@species_sources,$sources, $verbose) ); + exit(1) if ( !$self->validate_sources(\@species_sources,$notsources, $verbose) ); # build SQL my $species_sql = ""; - if (@species_ids) { + if (@species_sources) { $species_sql .= " AND su.species_id IN ("; - for ( my $i = 0 ; $i < @species_ids ; $i++ ) { + for ( my $i = 0 ; $i < @species_sources; $i++ ) { $species_sql .= "," if ( $i != 0 ); - $species_sql .= $species_ids[$i]; + $species_sql .= $species_sources[$i]; } $species_sql .= ") "; } @@ -156,12 +158,12 @@ DSS $sth->execute(); my ( $source_id, $source_url_id, $name, $url, $release_url, - $checksum, $parser, $species_id ); + $checksum, $parser, $species_source_id ); $sth->bind_columns( \$source_id, \$source_url_id, \$name, \$url, \$release_url, \$checksum, - \$parser, \$species_id ); + \$parser, \$species_source_id ); my $dir; my %summary = (); @@ -241,7 +243,7 @@ DSS foreach my $file (@files) { # check dependencies are loaded all ready - if(!($self->all_dependencies_loaded($source_id, $species_id, $name, $dep_sth))){ + if(!($self->all_dependencies_loaded($source_id, $species_source_id, $name, $dep_sth))){ ++$summary{$name}->{$parser}; next; } @@ -682,26 +684,21 @@ sub dbi { ########################################################### sub validate_species { my ($self, $species, $verbose) = @_; - my @species_ids; my $dbi = $self->dbi(); my $sth = $dbi->prepare("SELECT species_id, name FROM species WHERE LOWER(name)=? OR LOWER(aliases) REGEXP ?"); my ($species_id, $species_name); - foreach my $sp (@$species) { - - my $bind_arg = "^".lc($sp).",|^".lc($sp)."\$|,[ ]{0,1}".lc($sp)."[ ]{0,1},|,[ ]{0,1}".lc($sp)."\$"; - $sth->execute(lc($sp), $bind_arg ); - $sth->bind_columns(\$species_id, \$species_name); - if (my @row = $sth->fetchrow_array()) { - print "Species $sp is valid (name = " . $species_name . ", ID = " . $species_id . ")\n" if($verbose); - push @species_ids, $species_id; - } else { - print STDERR "Species $sp is not valid; valid species are:\n"; - $self->show_valid_species(); - } + my $bind_arg = "^".lc($species).",|^".lc($species)."\$|,[ ]{0,1}".lc($species)."[ ]{0,1},|,[ ]{0,1}".lc($species)."\$"; + $sth->execute(lc($species), $bind_arg ); + $sth->bind_columns(\$species_id, \$species_name); + if (my @row = $sth->fetchrow_array()) { + print "Species $species is valid (name = " . $species_name . ", ID = " . $species_id . ")\n" if($verbose); + } else { + print STDERR "Species $species is not valid; valid species are:\n"; + $self->show_valid_species(); } - return @species_ids; + return $species_id; } ############################################################ diff --git a/misc-scripts/xref_mapping/xref_parser.pl b/misc-scripts/xref_mapping/xref_parser.pl index 8f591ac512..a62052fff2 100644 --- a/misc-scripts/xref_mapping/xref_parser.pl +++ b/misc-scripts/xref_mapping/xref_parser.pl @@ -72,7 +72,6 @@ if($ARGV[0]){ exit(1); } -my @species = split(/,/,join(',',$species)) if $species; my @sources = split(/,/,join(',',$sources)) if $sources; my @notsource = split(/,/,join(',',$notsource)) if $notsource; @@ -103,7 +102,7 @@ $process->run({ host => $host, dbname => $dbname, user => $user, pass => $pass, - speciesr => \@species, + species => $species, taxon => $taxon, division => $division, sourcesr => \@sources, @@ -147,7 +146,7 @@ sub usage { print << "EOF"; xref_parser.pl -user {user} -pass {password} -host {host} \\ - -port {port} -dbname {database} -species {species1,species2} \\ + -port {port} -dbname {database} -species {species} \\ -source {source1,source2} -notsource {source1,source2} \\ -create -setrelease -deletedownloaded -checkdownload -stats -verbose \\ -cleanup -drop_db -download_path -unzip @@ -162,14 +161,11 @@ sub usage { -dbname Name of xref database to use/create. - -species Which species to import. Multiple -species arguments - and/or comma, separated lists of species are - allowed. Species may be referred to by genus/species + -species Which species to import. + Species may be referred to by genus/species (e.g. homo_sapiens) or common aliases (e.g. human). Specifying an unknown species will cause a list - of valid species to be printed. Not specifying a - -species argument will result in all species being - used. + of valid species to be printed. -source Which sources to import. Multiple -source arguments and/or comma, separated lists of sources are -- GitLab