From be71ab1774bfda68bc93139d60469c1c167dd89c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kusalananda=20K=C3=A4h=C3=A4ri?=
 <ak4@sanger.ac.uk>
Date: Tue, 20 Jul 2010 14:12:41 +0000
Subject: [PATCH] Forgot patching this with Dan's patch...

---
 misc-scripts/id_mapping/dump_cache.pl | 137 ++++++++++++++++++++------
 1 file changed, 106 insertions(+), 31 deletions(-)

diff --git a/misc-scripts/id_mapping/dump_cache.pl b/misc-scripts/id_mapping/dump_cache.pl
index cb0ad996b0..b05deeba9b 100755
--- a/misc-scripts/id_mapping/dump_cache.pl
+++ b/misc-scripts/id_mapping/dump_cache.pl
@@ -137,18 +137,26 @@ sub build_cache_auto {
   my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
   inject($cache_impl);
 
-  my $cache = $cache_impl->new(
-    -LOGGER       => $logger,
-    -CONF         => $conf,
-  );
-
+  # EG - populate cache for each species in turn
   $logger->debug("\nChecking number of toplevel seq_regions...\n");
-  my $max = 0;
-
+  my $max         = 0;
+  my @species_ids = @{ get_species_ids("target") };
   foreach my $dbtype (qw(source target)) {
-    my $num = scalar(@{ $cache->slice_names($dbtype) });
-    $max = $num if ($num > $max);
-    $logger->debug("$dbtype: $num.\n", 1);
+
+    # populate the cache for each species in turn
+    for my $species (@species_ids) {
+      $conf->param( 'species_id',   $$species[1] );
+      $conf->param( 'species_name', $$species[0] );
+
+      my $cache = $cache_impl->new( -LOGGER => $logger,
+                                    -CONF   => $conf, );
+
+      my $num =
+        scalar( @{ $cache->slice_names( $dbtype, @$species ) } );
+
+      $max = $num if ( $num > $max );
+      $logger->debug( "$dbtype: $num.\n", 1 );
+    }
   }
 
   my $threshold = $conf->param('build_cache_auto_threshold') || 100;
@@ -177,19 +185,22 @@ sub build_cache_by_seq_region {
   system("mkdir -p $logpath") == 0 or
     $logger->error("Can't create lsf log dir $logpath: $!\n");
 
+
+  # EG get the list of species IDs for sources and targets
+  my @source_species_ids = @{ get_species_ids("source") };
+  my @species_ids        = @{ get_species_ids("target") };
+
   # load the cache implementation
   my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
   inject($cache_impl);
 
-  my $cache = $cache_impl->new(
-    -LOGGER       => $logger,
-    -CONF         => $conf,
-  );
+  # EG store the base directory onto which the species ID will be added
+  my $basedir = $conf->param('basedir');
 
   # submit jobs to lsf
   foreach my $dbtype (qw(source target)) {
 
-    $logger->info("\n".ucfirst($dbtype)." db...\n", 0, 'stamped');
+    # EG iterate over individual species for source and target
 
     # determine which slices need to be done
     my $filename = "$dbtype.dump_cache.slices.txt";
@@ -197,21 +208,47 @@ sub build_cache_by_seq_region {
       throw("Unable to open $logpath/$filename for writing: $!");
     
     my $num_jobs = 0;
-
-    foreach my $slice_name (@{ $cache->slice_names($dbtype) }) {
-      my $type = "$dbtype.$slice_name";
-      unless ($cache->cache_file_exists($type)) {
-        print $fh "$slice_name\n";
-        $num_jobs++;
+    for my $species (@species_ids) {
+      # EG set config based on species ID in turn
+      $conf->param( 'basedir', path_append( $basedir, $$species[1] ) );
+      $conf->param( 'species_id',   $$species[1] );
+      $conf->param( 'species_name', $$species[0] );
+      # EG load cache for current species ID
+      my $cache = $cache_impl->new( -LOGGER => $logger,
+                                    -CONF   => $conf, );
+      foreach my $slice_name (
+                        @{ $cache->slice_names( $dbtype, @$species ) } )
+      {
+        my $type = "$dbtype.$slice_name";
+        my $src_species_id;
+        for my $src_id (@source_species_ids) {
+          if ( $$species[1] == $$src_id[1] ) {
+            $src_species_id = $$src_id[1];
+            last;
+          }
+        }
+        $logger->info( "\n" . ucfirst($dbtype) . " db...\n",
+                       0, 'stamped' );
+
+        foreach my $slice_name ( @{ $cache->slice_names($dbtype) } ) {
+          my $type = "$dbtype.$slice_name";
+          unless ( $cache->cache_file_exists($type) ) {
+            print $fh "$slice_name\n";
+            print $fh "$slice_name,$$species[0],$$species[1],"
+              . $src_species_id . "\n";
+            $num_jobs++;
+          }
+        }
       }
-    }
 
+      unless ($num_jobs) {
+        $logger->info("All cache files for $dbtype exist.\n");
+        next;
+      }
+    } ## end for my $species (@species_ids)
     close($fh);
-
-    unless ($num_jobs) {
-      $logger->info("All cache files for $dbtype exist.\n");
-      next;
-    }
+    # EG reset original basedir
+    $conf->param( 'basedir', $basedir );
 
     # build lsf command
     my $lsf_name = 'dump_by_seq_region_'.time;
@@ -226,10 +263,17 @@ sub build_cache_by_seq_region {
         cache_impl    => $cache_impl,
     );
 
-    my $cmd = qq{./dump_by_seq_region.pl $options --index \$LSB_JOBINDEX};
+    # EG invoke perl with correct path rather than relying on shebang
+    my $cmd =
+        qq{perl -I ./modules }
+      . qq{./misc-scripts/id_mapping/dump_by_seq_region.pl }
+      . qq{$options --index \$LSB_JOBINDEX};
+
 
     my $pipe =
-        qq{|bsub -J '$lsf_name\[1-$num_jobs\]\%$concurrent' }
+        '|bsub '
+      . $conf->param('lsf_opt_run')
+      . qq{ -J '$lsf_name\[1-$num_jobs\]\%$concurrent' }
       . qq{-o $logpath/dump_by_seq_region.$dbtype.\%I.out }
       . qq{-e $logpath/dump_by_seq_region.$dbtype.\%I.err }
       . $conf->param('lsf_opt_dump_cache');
@@ -251,8 +295,10 @@ sub build_cache_by_seq_region {
     # submit dependent job to monitor finishing of jobs
     $logger->info("Waiting for jobs to finish...\n", 0, 'stamped');
 
-    my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } .
-      qq{-o $logpath/dump_cache.$dbtype.depend.out /bin/true};
+    my $dependent_job =
+        qq{bsub -K -w "ended($lsf_name)" }
+      . $conf->param('lsf_opt_run_small')
+      . qq{ -o $logpath/dump_cache.$dbtype.depend.out /bin/true};
 
     system($dependent_job) == 0 or
       $logger->error("Error submitting dependent job: $!\n");
@@ -306,3 +352,32 @@ sub build_cache_all {
 }
 
 
+# EG new method for getting species IDs
+sub get_species_ids {
+
+  my ($prefix) = @_;
+  my @speciesIds;
+  my $dsn =
+      "DBI:mysql:database="
+    . $conf->param("${prefix}dbname")
+    . ";host="
+    . $conf->param("${prefix}host")
+    . ";port="
+    . $conf->param("${prefix}port");
+
+  my $ensemblCoreDbh = DBI->connect( $dsn,
+                                     $conf->param("${prefix}user"),
+                                     $conf->param("${prefix}pass") )
+    || die "Cannot connect to server: $DBI::errstr\n";
+
+  my $query = "SELECT DISTINCT meta_value, species_id
+               FROM meta WHERE meta_key = 'species.production_name'";
+
+  my $psmt = $ensemblCoreDbh->prepare($query);
+  $psmt->execute();
+
+  while ( my (@results) = $psmt->fetchrow() ) {
+    push @speciesIds, [ $results[0], $results[1] ];
+  }
+  return \@speciesIds;
+} ## end sub get_species_ids
-- 
GitLab