Skip to content
Snippets Groups Projects
Commit be71ab17 authored by Andreas Kusalananda Kähäri
Browse files

Forgot patching this with Dan's patch...

parent f4be3985
No related branches found
No related tags found
No related merge requests found
...@@ -137,18 +137,26 @@ sub build_cache_auto { ...@@ -137,18 +137,26 @@ sub build_cache_auto {
my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache'; my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
inject($cache_impl); inject($cache_impl);
my $cache = $cache_impl->new( # EG - populate cache for each species in turn
-LOGGER => $logger,
-CONF => $conf,
);
$logger->debug("\nChecking number of toplevel seq_regions...\n"); $logger->debug("\nChecking number of toplevel seq_regions...\n");
my $max = 0; my $max = 0;
my @species_ids = @{ get_species_ids("target") };
foreach my $dbtype (qw(source target)) { foreach my $dbtype (qw(source target)) {
my $num = scalar(@{ $cache->slice_names($dbtype) });
$max = $num if ($num > $max); # populate the cache for each species in turn
$logger->debug("$dbtype: $num.\n", 1); for my $species (@species_ids) {
$conf->param( 'species_id', $$species[1] );
$conf->param( 'species_name', $$species[0] );
my $cache = $cache_impl->new( -LOGGER => $logger,
-CONF => $conf, );
my $num =
scalar( @{ $cache->slice_names( $dbtype, @$species ) } );
$max = $num if ( $num > $max );
$logger->debug( "$dbtype: $num.\n", 1 );
}
} }
my $threshold = $conf->param('build_cache_auto_threshold') || 100; my $threshold = $conf->param('build_cache_auto_threshold') || 100;
...@@ -177,19 +185,22 @@ sub build_cache_by_seq_region { ...@@ -177,19 +185,22 @@ sub build_cache_by_seq_region {
system("mkdir -p $logpath") == 0 or system("mkdir -p $logpath") == 0 or
$logger->error("Can't create lsf log dir $logpath: $!\n"); $logger->error("Can't create lsf log dir $logpath: $!\n");
# EG get the list of species IDs for sources and targets
my @source_species_ids = @{ get_species_ids("source") };
my @species_ids = @{ get_species_ids("target") };
# load the cache implementation # load the cache implementation
my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache'; my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
inject($cache_impl); inject($cache_impl);
my $cache = $cache_impl->new( # EG store the base directory onto which the species ID will be added
-LOGGER => $logger, my $basedir = $conf->param('basedir');
-CONF => $conf,
);
# submit jobs to lsf # submit jobs to lsf
foreach my $dbtype (qw(source target)) { foreach my $dbtype (qw(source target)) {
$logger->info("\n".ucfirst($dbtype)." db...\n", 0, 'stamped'); # EG iterate over individual species for source and target
# determine which slices need to be done # determine which slices need to be done
my $filename = "$dbtype.dump_cache.slices.txt"; my $filename = "$dbtype.dump_cache.slices.txt";
...@@ -197,21 +208,47 @@ sub build_cache_by_seq_region { ...@@ -197,21 +208,47 @@ sub build_cache_by_seq_region {
throw("Unable to open $logpath/$filename for writing: $!"); throw("Unable to open $logpath/$filename for writing: $!");
my $num_jobs = 0; my $num_jobs = 0;
for my $species (@species_ids) {
foreach my $slice_name (@{ $cache->slice_names($dbtype) }) { # EG set config based on species ID in turn
my $type = "$dbtype.$slice_name"; $conf->param( 'basedir', path_append( $basedir, $$species[1] ) );
unless ($cache->cache_file_exists($type)) { $conf->param( 'species_id', $$species[1] );
print $fh "$slice_name\n"; $conf->param( 'species_name', $$species[0] );
$num_jobs++; # EG load cache for current species ID
my $cache = $cache_impl->new( -LOGGER => $logger,
-CONF => $conf, );
foreach my $slice_name (
@{ $cache->slice_names( $dbtype, @$species ) } )
{
my $type = "$dbtype.$slice_name";
my $src_species_id;
for my $src_id (@source_species_ids) {
if ( $$species[1] == $$src_id[1] ) {
$src_species_id = $$src_id[1];
last;
}
}
$logger->info( "\n" . ucfirst($dbtype) . " db...\n",
0, 'stamped' );
foreach my $slice_name ( @{ $cache->slice_names($dbtype) } ) {
my $type = "$dbtype.$slice_name";
unless ( $cache->cache_file_exists($type) ) {
print $fh "$slice_name\n";
print $fh "$slice_name,$$species[0],$$species[1],"
. $src_species_id . "\n";
$num_jobs++;
}
}
} }
}
unless ($num_jobs) {
$logger->info("All cache files for $dbtype exist.\n");
next;
}
} ## end for my $species (@species_ids)
close($fh); close($fh);
# EG reset original basedir
unless ($num_jobs) { $conf->param( 'basedir', $basedir );
$logger->info("All cache files for $dbtype exist.\n");
next;
}
# build lsf command # build lsf command
my $lsf_name = 'dump_by_seq_region_'.time; my $lsf_name = 'dump_by_seq_region_'.time;
...@@ -226,10 +263,17 @@ sub build_cache_by_seq_region { ...@@ -226,10 +263,17 @@ sub build_cache_by_seq_region {
cache_impl => $cache_impl, cache_impl => $cache_impl,
); );
my $cmd = qq{./dump_by_seq_region.pl $options --index \$LSB_JOBINDEX}; # EG invoke perl with correct path rather than relying on shebang
my $cmd =
qq{perl -I ./modules }
. qq{./misc-scripts/id_mapping/dump_by_seq_region.pl }
. qq{$options --index \$LSB_JOBINDEX};
my $pipe = my $pipe =
qq{|bsub -J '$lsf_name\[1-$num_jobs\]\%$concurrent' } '|bsub '
. $conf->param('lsf_opt_run')
. qq{ -J '$lsf_name\[1-$num_jobs\]\%$concurrent' }
. qq{-o $logpath/dump_by_seq_region.$dbtype.\%I.out } . qq{-o $logpath/dump_by_seq_region.$dbtype.\%I.out }
. qq{-e $logpath/dump_by_seq_region.$dbtype.\%I.err } . qq{-e $logpath/dump_by_seq_region.$dbtype.\%I.err }
. $conf->param('lsf_opt_dump_cache'); . $conf->param('lsf_opt_dump_cache');
...@@ -251,8 +295,10 @@ sub build_cache_by_seq_region { ...@@ -251,8 +295,10 @@ sub build_cache_by_seq_region {
# submit dependent job to monitor finishing of jobs # submit dependent job to monitor finishing of jobs
$logger->info("Waiting for jobs to finish...\n", 0, 'stamped'); $logger->info("Waiting for jobs to finish...\n", 0, 'stamped');
my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } . my $dependent_job =
qq{-o $logpath/dump_cache.$dbtype.depend.out /bin/true}; qq{bsub -K -w "ended($lsf_name)" }
. $conf->param('lsf_opt_run_small')
. qq{ -o $logpath/dump_cache.$dbtype.depend.out /bin/true};
system($dependent_job) == 0 or system($dependent_job) == 0 or
$logger->error("Error submitting dependent job: $!\n"); $logger->error("Error submitting dependent job: $!\n");
...@@ -306,3 +352,32 @@ sub build_cache_all { ...@@ -306,3 +352,32 @@ sub build_cache_all {
} }
# EG new method for getting species IDs
#
# Lists the species recorded in the meta table of the "source" or "target"
# database described by the configuration.
#
# Arg:     $prefix - configuration key prefix ("source" or "target" at the
#                    call sites in this file); the connection details are
#                    read from the "${prefix}dbname", "${prefix}host",
#                    "${prefix}port", "${prefix}user" and "${prefix}pass"
#                    configuration parameters.
# Returns: reference to an array of [ meta_value, species_id ] pairs, one
#          per distinct 'species.production_name' row of the meta table.
# Dies:    if the connection cannot be established or the query fails.
sub get_species_ids {
    my ($prefix) = @_;

    my $dsn =
        "DBI:mysql:database="
      . $conf->param("${prefix}dbname")
      . ";host="
      . $conf->param("${prefix}host")
      . ";port="
      . $conf->param("${prefix}port");

    my $dbh = DBI->connect( $dsn,
                            $conf->param("${prefix}user"),
                            $conf->param("${prefix}pass") )
      or die "Cannot connect to server: $DBI::errstr\n";

    my $query =
        "SELECT DISTINCT meta_value, species_id "
      . "FROM meta WHERE meta_key = 'species.production_name'";

    # selectall_arrayref() prepares, executes and fetches in one call and
    # returns undef on failure; each row comes back as a two-element array
    # reference, which is the shape callers index as $$species[0] (name)
    # and $$species[1] (species ID).
    my $species_ids = $dbh->selectall_arrayref($query);

    # Capture the error text before disconnecting, then always release the
    # connection so that repeated calls do not leak database handles.
    my $error = defined($species_ids) ? undef : $dbh->errstr();
    $dbh->disconnect();

    if ( !defined($species_ids) ) {
        $error = 'unknown error' if !defined($error);
        die "Cannot fetch species IDs from $prefix database: $error\n";
    }

    return $species_ids;
} ## end sub get_species_ids
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment