Skip to content
Snippets Groups Projects
Commit be71ab17 authored by Andreas Kusalananda Kähäri's avatar Andreas Kusalananda Kähäri
Browse files

Forgot patching this with Dan's patch...

parent f4be3985
No related branches found
No related tags found
No related merge requests found
......@@ -137,18 +137,26 @@ sub build_cache_auto {
my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
inject($cache_impl);
my $cache = $cache_impl->new(
-LOGGER => $logger,
-CONF => $conf,
);
# EG - populate cache for each species in turn
$logger->debug("\nChecking number of toplevel seq_regions...\n");
my $max = 0;
my $max = 0;
my @species_ids = @{ get_species_ids("target") };
foreach my $dbtype (qw(source target)) {
my $num = scalar(@{ $cache->slice_names($dbtype) });
$max = $num if ($num > $max);
$logger->debug("$dbtype: $num.\n", 1);
# populate the cache for each species in turn
for my $species (@species_ids) {
$conf->param( 'species_id', $$species[1] );
$conf->param( 'species_name', $$species[0] );
my $cache = $cache_impl->new( -LOGGER => $logger,
-CONF => $conf, );
my $num =
scalar( @{ $cache->slice_names( $dbtype, @$species ) } );
$max = $num if ( $num > $max );
$logger->debug( "$dbtype: $num.\n", 1 );
}
}
my $threshold = $conf->param('build_cache_auto_threshold') || 100;
......@@ -177,19 +185,22 @@ sub build_cache_by_seq_region {
system("mkdir -p $logpath") == 0 or
$logger->error("Can't create lsf log dir $logpath: $!\n");
# EG get the list of species IDs for sources and targets
my @source_species_ids = @{ get_species_ids("source") };
my @species_ids = @{ get_species_ids("target") };
# load the cache implementation
my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
inject($cache_impl);
my $cache = $cache_impl->new(
-LOGGER => $logger,
-CONF => $conf,
);
# EG store the base directory onto which the species ID will be added
my $basedir = $conf->param('basedir');
# submit jobs to lsf
foreach my $dbtype (qw(source target)) {
$logger->info("\n".ucfirst($dbtype)." db...\n", 0, 'stamped');
# EG iterate over individual species for source and target
# determine which slices need to be done
my $filename = "$dbtype.dump_cache.slices.txt";
......@@ -197,21 +208,47 @@ sub build_cache_by_seq_region {
throw("Unable to open $logpath/$filename for writing: $!");
my $num_jobs = 0;
foreach my $slice_name (@{ $cache->slice_names($dbtype) }) {
my $type = "$dbtype.$slice_name";
unless ($cache->cache_file_exists($type)) {
print $fh "$slice_name\n";
$num_jobs++;
for my $species (@species_ids) {
# EG set config based on species ID in turn
$conf->param( 'basedir', path_append( $basedir, $$species[1] ) );
$conf->param( 'species_id', $$species[1] );
$conf->param( 'species_name', $$species[0] );
# EG load cache for current species ID
my $cache = $cache_impl->new( -LOGGER => $logger,
-CONF => $conf, );
foreach my $slice_name (
@{ $cache->slice_names( $dbtype, @$species ) } )
{
my $type = "$dbtype.$slice_name";
my $src_species_id;
for my $src_id (@source_species_ids) {
if ( $$species[1] == $$src_id[1] ) {
$src_species_id = $$src_id[1];
last;
}
}
$logger->info( "\n" . ucfirst($dbtype) . " db...\n",
0, 'stamped' );
foreach my $slice_name ( @{ $cache->slice_names($dbtype) } ) {
my $type = "$dbtype.$slice_name";
unless ( $cache->cache_file_exists($type) ) {
print $fh "$slice_name\n";
print $fh "$slice_name,$$species[0],$$species[1],"
. $src_species_id . "\n";
$num_jobs++;
}
}
}
}
unless ($num_jobs) {
$logger->info("All cache files for $dbtype exist.\n");
next;
}
} ## end for my $species (@species_ids)
close($fh);
unless ($num_jobs) {
$logger->info("All cache files for $dbtype exist.\n");
next;
}
# EG reset original basedir
$conf->param( 'basedir', $basedir );
# build lsf command
my $lsf_name = 'dump_by_seq_region_'.time;
......@@ -226,10 +263,17 @@ sub build_cache_by_seq_region {
cache_impl => $cache_impl,
);
my $cmd = qq{./dump_by_seq_region.pl $options --index \$LSB_JOBINDEX};
# EG invoke perl with correct path rather than relying on shebang
my $cmd =
qq{perl -I ./modules }
. qq{./misc-scripts/id_mapping/dump_by_seq_region.pl }
. qq{$options --index \$LSB_JOBINDEX};
my $pipe =
qq{|bsub -J '$lsf_name\[1-$num_jobs\]\%$concurrent' }
'|bsub '
. $conf->param('lsf_opt_run')
. qq{ -J '$lsf_name\[1-$num_jobs\]\%$concurrent' }
. qq{-o $logpath/dump_by_seq_region.$dbtype.\%I.out }
. qq{-e $logpath/dump_by_seq_region.$dbtype.\%I.err }
. $conf->param('lsf_opt_dump_cache');
......@@ -251,8 +295,10 @@ sub build_cache_by_seq_region {
# submit dependent job to monitor finishing of jobs
$logger->info("Waiting for jobs to finish...\n", 0, 'stamped');
my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } .
qq{-o $logpath/dump_cache.$dbtype.depend.out /bin/true};
my $dependent_job =
qq{bsub -K -w "ended($lsf_name)" }
. $conf->param('lsf_opt_run_small')
. qq{ -o $logpath/dump_cache.$dbtype.depend.out /bin/true};
system($dependent_job) == 0 or
$logger->error("Error submitting dependent job: $!\n");
......@@ -306,3 +352,32 @@ sub build_cache_all {
}
# EG new method for getting species IDs
sub get_species_ids {
my ($prefix) = @_;
my @speciesIds;
my $dsn =
"DBI:mysql:database="
. $conf->param("${prefix}dbname")
. ";host="
. $conf->param("${prefix}host")
. ";port="
. $conf->param("${prefix}port");
my $ensemblCoreDbh = DBI->connect( $dsn,
$conf->param("${prefix}user"),
$conf->param("${prefix}pass") )
|| die "Cannot connect to server: $DBI::errstr\n";
my $query = "SELECT DISTINCT meta_value, species_id
FROM meta WHERE meta_key = 'species.production_name'";
my $psmt = $ensemblCoreDbh->prepare($query);
$psmt->execute();
while ( my (@results) = $psmt->fetchrow() ) {
push @speciesIds, [ $results[0], $results[1] ];
}
return \@speciesIds;
} ## end sub get_species_ids
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment