From 805432c6909e85f83845641d87ecd7c84c210b9e Mon Sep 17 00:00:00 2001
From: Magali Ruffier <mr6@ebi.ac.uk>
Date: Wed, 5 Sep 2018 16:09:25 +0100
Subject: [PATCH] ENSCORESW-2850: optimised for single species run

---
 .../xref_mapping/XrefParser/ProcessData.pm    | 49 +++++++++----------
 misc-scripts/xref_mapping/xref_parser.pl      | 14 ++----
 2 files changed, 28 insertions(+), 35 deletions(-)

diff --git a/misc-scripts/xref_mapping/XrefParser/ProcessData.pm b/misc-scripts/xref_mapping/XrefParser/ProcessData.pm
index b38ad24eba..79a7fa50d1 100644
--- a/misc-scripts/xref_mapping/XrefParser/ProcessData.pm
+++ b/misc-scripts/xref_mapping/XrefParser/ProcessData.pm
@@ -55,7 +55,7 @@ sub run {
   my $unzip      = $ref_arg->{unzip};
   my $stats      = $ref_arg->{stats};
   my $cleanup    = $ref_arg->{cleanup};
-  my $rspecies   = $ref_arg->{speciesr};
+  my $species    = $ref_arg->{species};
   my $taxon_id   = $ref_arg->{taxon};
   my $division   = $ref_arg->{division};
   my $sources    = $ref_arg->{sourcesr};
@@ -101,21 +101,23 @@ DSS
   my $dep_sth = $dbi->prepare($sql);
 
   # validate species names
-  if (defined $division) { push @$rspecies, $division; }
-  my @species_ids = $self->validate_species($rspecies, $verbose);
-  if (defined $taxon_id) { push @species_ids, $taxon_id; }
+  my $species_id = $self->validate_species($species, $verbose);
+  my $division_id = $self->validate_species($division, $verbose);
+  my @species_sources = ($species_id);
+  push @species_sources, $division_id if defined $division_id;
+  push @species_sources, $taxon_id if defined $taxon_id;
 
   # validate source names
-  exit(1) if ( !$self->validate_sources(\@species_ids,$sources, $verbose) );
-  exit(1) if ( !$self->validate_sources(\@species_ids,$notsources, $verbose) );
+  exit(1) if ( !$self->validate_sources(\@species_sources,$sources, $verbose) );
+  exit(1) if ( !$self->validate_sources(\@species_sources,$notsources, $verbose) );
 
   # build SQL
   my $species_sql = "";
-  if (@species_ids) {
+  if (@species_sources) {
     $species_sql .= " AND su.species_id IN (";
-    for ( my $i = 0 ; $i < @species_ids ; $i++ ) {
+    for ( my $i = 0 ; $i < @species_sources; $i++ ) {
       $species_sql .= "," if ( $i != 0 );
-      $species_sql .= $species_ids[$i];
+      $species_sql .= $species_sources[$i];
     }
     $species_sql .= ") ";
   }
@@ -156,12 +158,12 @@ DSS
   $sth->execute();
 
   my ( $source_id, $source_url_id, $name, $url, $release_url,
-       $checksum, $parser, $species_id );
+       $checksum, $parser, $species_source_id );
 
     $sth->bind_columns( \$source_id,   \$source_url_id,
                         \$name,        \$url,
                         \$release_url, \$checksum,
-                        \$parser,      \$species_id );
+                        \$parser,      \$species_source_id );
 
   my $dir;
   my %summary = ();
@@ -241,7 +243,7 @@ DSS
     foreach my $file (@files) {
 	
       # check dependencies are loaded all ready
-      if(!($self->all_dependencies_loaded($source_id, $species_id, $name, $dep_sth))){
+      if(!($self->all_dependencies_loaded($source_id, $species_source_id, $name, $dep_sth))){
 	++$summary{$name}->{$parser};
 	next;
       }
@@ -682,26 +684,21 @@ sub dbi {
 ###########################################################
 sub validate_species {
   my ($self, $species, $verbose) = @_;
-  my @species_ids;
 
   my $dbi = $self->dbi();
   my $sth = $dbi->prepare("SELECT species_id, name FROM species WHERE LOWER(name)=? OR LOWER(aliases) REGEXP ?");
   my ($species_id, $species_name);
 
-  foreach my $sp (@$species) {
-
-    my $bind_arg = "^".lc($sp).",|^".lc($sp)."\$|,[ ]{0,1}".lc($sp)."[ ]{0,1},|,[ ]{0,1}".lc($sp)."\$";
-    $sth->execute(lc($sp), $bind_arg ); 
-    $sth->bind_columns(\$species_id, \$species_name);
-    if (my @row = $sth->fetchrow_array()) {
-      print "Species $sp is valid (name = " . $species_name . ", ID = " . $species_id . ")\n" if($verbose);
-      push @species_ids, $species_id;
-    } else {
-      print STDERR "Species $sp is not valid; valid species are:\n";
-      $self->show_valid_species();
-    }
+  my $bind_arg = "^".lc($species).",|^".lc($species)."\$|,[ ]{0,1}".lc($species)."[ ]{0,1},|,[ ]{0,1}".lc($species)."\$";
+  $sth->execute(lc($species), $bind_arg ); 
+  $sth->bind_columns(\$species_id, \$species_name);
+  if (my @row = $sth->fetchrow_array()) {
+    print "Species $species is valid (name = " . $species_name . ", ID = " . $species_id . ")\n" if($verbose);
+  } else {
+    print STDERR "Species $species is not valid; valid species are:\n";
+    $self->show_valid_species();
   }
-  return @species_ids;
+  return $species_id;
 }
 
 ############################################################
diff --git a/misc-scripts/xref_mapping/xref_parser.pl b/misc-scripts/xref_mapping/xref_parser.pl
index 8f591ac512..a62052fff2 100644
--- a/misc-scripts/xref_mapping/xref_parser.pl
+++ b/misc-scripts/xref_mapping/xref_parser.pl
@@ -72,7 +72,6 @@ if($ARGV[0]){
   exit(1);
 }
 
-my @species = split(/,/,join(',',$species)) if $species;
 my @sources  = split(/,/,join(',',$sources)) if $sources;
 my @notsource  = split(/,/,join(',',$notsource)) if $notsource;
 
@@ -103,7 +102,7 @@ $process->run({ host             => $host,
 		dbname           => $dbname,
 		user             => $user,
 		pass             => $pass,
-		speciesr         => \@species,
+		species          => $species,
                 taxon            => $taxon,
                 division         => $division,
 		sourcesr         => \@sources,
@@ -147,7 +146,7 @@ sub usage {
   print << "EOF";
 
   xref_parser.pl -user {user} -pass {password} -host {host} \\
-    -port {port} -dbname {database} -species {species1,species2} \\
+    -port {port} -dbname {database} -species {species} \\
     -source {source1,source2} -notsource {source1,source2} \\
     -create -setrelease -deletedownloaded -checkdownload -stats -verbose \\
     -cleanup -drop_db -download_path -unzip
@@ -162,14 +161,11 @@ sub usage {
 
   -dbname           Name of xref database to use/create.
 
-  -species          Which species to import. Multiple -species arguments
-                    and/or comma, separated lists of species are
-                    allowed. Species may be referred to by genus/species
+  -species          Which species to import (one species only).
+                    Species may be referred to by genus/species
                     (e.g. homo_sapiens) or common aliases (e.g. human).
                     Specifying an unknown species will cause a list
-                    of valid species to be printed.  Not specifying a
-                    -species argument will result in all species being
-                    used.
+                    of valid species to be printed.
 
   -source           Which sources to import. Multiple -source arguments
                     and/or comma, separated lists of sources are
-- 
GitLab