From be71ab1774bfda68bc93139d60469c1c167dd89c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kusalananda=20K=C3=A4h=C3=A4ri?= <ak4@sanger.ac.uk> Date: Tue, 20 Jul 2010 14:12:41 +0000 Subject: [PATCH] Forgot patching this with Dan's patch... --- misc-scripts/id_mapping/dump_cache.pl | 137 ++++++++++++++++++++------ 1 file changed, 106 insertions(+), 31 deletions(-) diff --git a/misc-scripts/id_mapping/dump_cache.pl b/misc-scripts/id_mapping/dump_cache.pl index cb0ad996b0..b05deeba9b 100755 --- a/misc-scripts/id_mapping/dump_cache.pl +++ b/misc-scripts/id_mapping/dump_cache.pl @@ -137,18 +137,26 @@ sub build_cache_auto { my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache'; inject($cache_impl); - my $cache = $cache_impl->new( - -LOGGER => $logger, - -CONF => $conf, - ); - + # EG - populate cache for each species in turn $logger->debug("\nChecking number of toplevel seq_regions...\n"); - my $max = 0; - + my $max = 0; + my @species_ids = @{ get_species_ids("target") }; foreach my $dbtype (qw(source target)) { - my $num = scalar(@{ $cache->slice_names($dbtype) }); - $max = $num if ($num > $max); - $logger->debug("$dbtype: $num.\n", 1); + + # populate the cache for each species in turn + for my $species (@species_ids) { + $conf->param( 'species_id', $$species[1] ); + $conf->param( 'species_name', $$species[0] ); + + my $cache = $cache_impl->new( -LOGGER => $logger, + -CONF => $conf, ); + + my $num = + scalar( @{ $cache->slice_names( $dbtype, @$species ) } ); + + $max = $num if ( $num > $max ); + $logger->debug( "$dbtype: $num.\n", 1 ); + } } my $threshold = $conf->param('build_cache_auto_threshold') || 100; @@ -177,19 +185,22 @@ sub build_cache_by_seq_region { system("mkdir -p $logpath") == 0 or $logger->error("Can't create lsf log dir $logpath: $!\n"); + + # EG get the list of species IDs for sources and targets + my @source_species_ids = @{ get_species_ids("source") }; + my @species_ids = @{ get_species_ids("target") }; + # load the cache implementation my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache'; inject($cache_impl); - my $cache = $cache_impl->new( - -LOGGER => $logger, - -CONF => $conf, - ); + # EG store the base directory onto which the species ID will be added + my $basedir = $conf->param('basedir'); # submit jobs to lsf foreach my $dbtype (qw(source target)) { - $logger->info("\n".ucfirst($dbtype)." db...\n", 0, 'stamped'); + # EG iterate over individual species for source and target # determine which slices need to be done my $filename = "$dbtype.dump_cache.slices.txt"; @@ -197,21 +208,47 @@ sub build_cache_by_seq_region { throw("Unable to open $logpath/$filename for writing: $!"); my $num_jobs = 0; - - foreach my $slice_name (@{ $cache->slice_names($dbtype) }) { - my $type = "$dbtype.$slice_name"; - unless ($cache->cache_file_exists($type)) { - print $fh "$slice_name\n"; - $num_jobs++; + for my $species (@species_ids) { + # EG set config based on species ID in turn + $conf->param( 'basedir', path_append( $basedir, $$species[1] ) ); + $conf->param( 'species_id', $$species[1] ); + $conf->param( 'species_name', $$species[0] ); + # EG load cache for current species ID + my $cache = $cache_impl->new( -LOGGER => $logger, + -CONF => $conf, ); + foreach my $slice_name ( + @{ $cache->slice_names( $dbtype, @$species ) } ) + { + my $type = "$dbtype.$slice_name"; + my $src_species_id; + for my $src_id (@source_species_ids) { + if ( $$species[1] == $$src_id[1] ) { + $src_species_id = $$src_id[1]; + last; + } + } + $logger->info( "\n" . ucfirst($dbtype) . " db...\n", + 0, 'stamped' ); + + foreach my $slice_name ( @{ $cache->slice_names($dbtype) } ) { + my $type = "$dbtype.$slice_name"; + unless ( $cache->cache_file_exists($type) ) { + print $fh "$slice_name\n"; + print $fh "$slice_name,$$species[0],$$species[1]," + . $src_species_id . "\n"; + $num_jobs++; + } + } } - } + unless ($num_jobs) { + $logger->info("All cache files for $dbtype exist.\n"); + next; + } + } ## end for my $species (@species_ids) close($fh); - - unless ($num_jobs) { - $logger->info("All cache files for $dbtype exist.\n"); - next; - } + # EG reset original basedir + $conf->param( 'basedir', $basedir ); # build lsf command my $lsf_name = 'dump_by_seq_region_'.time; @@ -226,10 +263,17 @@ sub build_cache_by_seq_region { cache_impl => $cache_impl, ); - my $cmd = qq{./dump_by_seq_region.pl $options --index \$LSB_JOBINDEX}; + # EG invoke perl with correct path rather than relying on shebang + my $cmd = + qq{perl -I ./modules } + . qq{./misc-scripts/id_mapping/dump_by_seq_region.pl } + . qq{$options --index \$LSB_JOBINDEX}; + my $pipe = - qq{|bsub -J '$lsf_name\[1-$num_jobs\]\%$concurrent' } + '|bsub ' + . $conf->param('lsf_opt_run') + . qq{ -J '$lsf_name\[1-$num_jobs\]\%$concurrent' } . qq{-o $logpath/dump_by_seq_region.$dbtype.\%I.out } . qq{-e $logpath/dump_by_seq_region.$dbtype.\%I.err } . $conf->param('lsf_opt_dump_cache'); @@ -251,8 +295,10 @@ sub build_cache_by_seq_region { # submit dependent job to monitor finishing of jobs $logger->info("Waiting for jobs to finish...\n", 0, 'stamped'); - my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } . - qq{-o $logpath/dump_cache.$dbtype.depend.out /bin/true}; + my $dependent_job = + qq{bsub -K -w "ended($lsf_name)" } + . $conf->param('lsf_opt_run_small') + . qq{ -o $logpath/dump_cache.$dbtype.depend.out /bin/true}; system($dependent_job) == 0 or $logger->error("Error submitting dependent job: $!\n"); @@ -306,3 +352,32 @@ sub build_cache_all { } +# EG new method for getting species IDs +sub get_species_ids { + + my ($prefix) = @_; + my @speciesIds; + my $dsn = + "DBI:mysql:database=" + . $conf->param("${prefix}dbname") + . ";host=" + . $conf->param("${prefix}host") + . ";port=" + . $conf->param("${prefix}port"); + + my $ensemblCoreDbh = DBI->connect( $dsn, + $conf->param("${prefix}user"), + $conf->param("${prefix}pass") ) + || die "Cannot connect to server: $DBI::errstr\n"; + + my $query = "SELECT DISTINCT meta_value, species_id + FROM meta WHERE meta_key = 'species.production_name'"; + + my $psmt = $ensemblCoreDbh->prepare($query); + $psmt->execute(); + + while ( my (@results) = $psmt->fetchrow() ) { + push @speciesIds, [ $results[0], $results[1] ]; + } + return \@speciesIds; +} ## end sub get_species_ids -- GitLab