Commit a7c35a55 authored by Andreas Kusalananda Kähäri

Backing out Dan's patches for release 59.

parent 590b6d63
@@ -97,27 +97,6 @@ my $index = $conf->param('index');
my $logautobase = ($conf->param('logautobase') || 'dump_by_seq_region').
".$dbtype";
# EG - need to determine species before opening the cache and log
# determine which slice to process. To do so, read the file containing
# the slices to be processed, and take the one at position $index. This
# includes the species_id.
my $logpath = $conf->param('logpath');
my $filename = "$dbtype.dump_cache.slices.txt";
open( my $fh, '<', "$logpath/$filename" )
or throw("Unable to open $logpath/$filename for reading: $!");
my @slice_names = <$fh>;
my $slice_string = $slice_names[ $index - 1 ];
chomp($slice_string);
my ( $slice_name, $species_name, $species_id, $source_species_id ) =
split( ',', $slice_string );
close($fh);
$conf->param('basedir',path_append($conf->param('basedir'), $species_id));
$conf->param('species_id',$species_id);
$conf->param('species_name',$species_name);
# get log filehandle and print heading and parameters to logfile
my $logger = new Bio::EnsEMBL::Utils::Logger(
-LOGFILE => $conf->param('logfile'),
@@ -139,6 +118,17 @@ my $cache = $cache_impl->new(
-CONF => $conf,
);
# determine which slice to process. to do so, read the file containing the
# slices to be processed, and take the one at position $index
my $logpath = $conf->param('logpath');
my $filename = "$dbtype.dump_cache.slices.txt";
open(my $fh, '<', "$logpath/$filename") or
throw("Unable to open $logpath/$filename for reading: $!");
my @slice_names = <$fh>;
my $slice_name = $slice_names[$index-1];
chomp($slice_name);
close($fh);
# now build the cache for this slice
$cache->build_cache_by_slice($dbtype, $slice_name);
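For context, the --index value used in this lookup is supplied by the LSF job array submitted in build_cache_by_seq_region (--index $LSB_JOBINDEX), and job-array indices start at 1. A minimal standalone sketch of the same 1-based lookup, with illustrative file and variable names:

#!/usr/bin/env perl
# Sketch only: LSF job-array indices ($LSB_JOBINDEX) are 1-based,
# Perl arrays are 0-based, hence the -1 below.
use strict;
use warnings;

my ( $slices_file, $index ) = @ARGV;

open( my $fh, '<', $slices_file )
  or die "Unable to open $slices_file for reading: $!";
my @slice_names = <$fh>;
close($fh);

my $slice_name = $slice_names[ $index - 1 ];
chomp($slice_name);
print "Job $index processes slice $slice_name\n";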
@@ -137,26 +137,18 @@ sub build_cache_auto {
my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
inject($cache_impl);
# EG - populate cache for each species in turn
$logger->debug("\nChecking number of toplevel seq_regions...\n");
my $max = 0;
my @species_ids = @{ get_species_ids("target") };
foreach my $dbtype (qw(source target)) {
# populate the cache for each species in turn
for my $species (@species_ids) {
$conf->param( 'species_id', $$species[1] );
$conf->param( 'species_name', $$species[0] );
my $cache = $cache_impl->new( -LOGGER => $logger,
-CONF => $conf, );
my $cache = $cache_impl->new(
-LOGGER => $logger,
-CONF => $conf,
);
my $num =
scalar( @{ $cache->slice_names( $dbtype, @$species ) } );
$logger->debug("\nChecking number of toplevel seq_regions...\n");
my $max = 0;
$max = $num if ( $num > $max );
$logger->debug( "$dbtype: $num.\n", 1 );
}
foreach my $dbtype (qw(source target)) {
my $num = scalar(@{ $cache->slice_names($dbtype) });
$max = $num if ($num > $max);
$logger->debug("$dbtype: $num.\n", 1);
}
my $threshold = $conf->param('build_cache_auto_threshold') || 100;
@@ -185,144 +177,99 @@ sub build_cache_by_seq_region {
system("mkdir -p $logpath") == 0 or
$logger->error("Can't create lsf log dir $logpath: $!\n");
# EG get the list of species IDs for sources and targets
my @source_species_ids = @{ get_species_ids("source") };
my @species_ids = @{ get_species_ids("target") };
# load the cache implementation
my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
inject($cache_impl);
# EG store the base directory onto which the species ID will be added
my $basedir = $conf->param('basedir');
my $cache = $cache_impl->new(
-LOGGER => $logger,
-CONF => $conf,
);
# submit jobs to lsf
foreach my $dbtype (qw(source target)) {
# EG iterate over individual species for source and target
$logger->info("\n".ucfirst($dbtype)." db...\n", 0, 'stamped');
# determine which slices need to be done
my $filename = "$dbtype.dump_cache.slices.txt";
open( my $fh, '>', "$logpath/$filename" )
or throw("Unable to open $logpath/$filename for writing: $!");
open(my $fh, '>', "$logpath/$filename") or
throw("Unable to open $logpath/$filename for writing: $!");
my $num_jobs = 0;
for my $species (@species_ids) {
# EG set config based on species ID in turn
$conf->param( 'basedir', path_append( $basedir, $$species[1] ) );
$conf->param( 'species_id', $$species[1] );
$conf->param( 'species_name', $$species[0] );
# EG load cache for current species ID
my $cache = $cache_impl->new( -LOGGER => $logger,
-CONF => $conf, );
foreach my $slice_name (
@{ $cache->slice_names( $dbtype, @$species ) } )
{
my $type = "$dbtype.$slice_name";
my $src_species_id;
for my $src_id (@source_species_ids) {
if ( $$species[1] == $$src_id[1] ) {
$src_species_id = $$src_id[1];
last;
}
}
$logger->info( "\n" . ucfirst($dbtype) . " db...\n",
0, 'stamped' );
my $type = "$dbtype.$slice_name";
unless ( $cache->cache_file_exists($type) ) {
print $fh "$slice_name,$$species[0],$$species[1],"
. $src_species_id . "\n";
$num_jobs++;
}
}
unless ($num_jobs) {
$logger->info("All cache files for $dbtype exist.\n");
next;
foreach my $slice_name (@{ $cache->slice_names($dbtype) }) {
my $type = "$dbtype.$slice_name";
unless ($cache->cache_file_exists($type)) {
print $fh "$slice_name\n";
$num_jobs++;
}
}
} ## end for my $species (@species_ids)
close($fh);
# EG reset original basedir
$conf->param( 'basedir', $basedir );
unless ($num_jobs) {
$logger->info("All cache files for $dbtype exist.\n");
next;
}
# build lsf command
my $lsf_name = 'dump_by_seq_region_' . time;
my $lsf_name = 'dump_by_seq_region_'.time;
my $concurrent = $conf->param('build_cache_concurrent_jobs') || 200;
my $options =
$conf->create_commandline_options(
logauto => 1,
logautobase => "dump_by_seq_region",
interactive => 0,
is_component => 1,
dbtype => $dbtype,
cache_impl => $cache_impl, );
# EG invoke perl with correct path rather than relying on shebang
my $cmd = 'perl dump_by_seq_region.pl '
. qq{$options --index \$LSB_JOBINDEX};
my $pipe =
'|bsub '
. $conf->param('lsf_opt_run')
. qq{ -J '$lsf_name\[1-$num_jobs\]\%$concurrent' }
. qq{-o $logpath/dump_by_seq_region.$dbtype.\%I.out }
. qq{-e $logpath/dump_by_seq_region.$dbtype.\%I.err }
. $conf->param('lsf_opt_dump_cache');
my $options = $conf->create_commandline_options(
logauto => 1,
logautobase => "dump_by_seq_region",
interactive => 0,
is_component => 1,
dbtype => $dbtype,
cache_impl => $cache_impl,
);
# run lsf job array
$logger->info("\nSubmitting $num_jobs jobs to lsf.\n");
my $cmd = qq{./dump_by_seq_region.pl $options --index \$LSB_JOBINDEX};
if ( $num_jobs > 0 ) {
$logger->debug("$cmd\n\n");
$logger->debug("$pipe\n\n");
my $pipe = qq{|bsub -J$lsf_name\[1-$num_jobs\]\%$concurrent } .
qq{-o $logpath/dump_by_seq_region.$dbtype.\%I.out } .
qq{-e $logpath/dump_by_seq_region.$dbtype.\%I.err } .
$conf->param('lsf_opt_dump_cache');
local *BSUB;
open BSUB, $pipe
or $logger->error("Could not open open pipe to bsub: $!\n");
# run lsf job array
$logger->info("\nSubmitting $num_jobs jobs to lsf.\n");
$logger->debug("$cmd\n\n");
print BSUB $cmd;
$logger->error("Error submitting jobs: $!\n")
unless ( $? == 0 );
close BSUB;
local *BSUB;
open BSUB, $pipe or
$logger->error("Could not open open pipe to bsub: $!\n");
# submit dependent job to monitor finishing of jobs
$logger->info( "Waiting for jobs to finish...\n", 0, 'stamped' );
print BSUB $cmd;
$logger->error("Error submitting jobs: $!\n")
unless ($? == 0);
close BSUB;
my $dependent_job =
qq{bsub -K -w "ended($lsf_name)" }
. $conf->param('lsf_opt_run_small')
. qq{ -o $logpath/dump_cache.$dbtype.depend.out /bin/true};
# submit dependent job to monitor finishing of jobs
$logger->info("Waiting for jobs to finish...\n", 0, 'stamped');
system($dependent_job) == 0
or $logger->error("Error submitting dependent job: $!\n");
my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } .
qq{-o $logpath/dump_cache.$dbtype.depend.out /bin/true};
$logger->info( "All jobs finished.\n", 0, 'stamped' );
system($dependent_job) == 0 or
$logger->error("Error submitting dependent job: $!\n");
sleep(5);
} ## end if ( $num_jobs > 0 )
$logger->info("All jobs finished.\n", 0, 'stamped');
# check for lsf errors
sleep(5);
my $err;
foreach my $i ( 1 .. $num_jobs ) {
if ( !-e "$logpath/dump_by_seq_region.$dbtype.$i.success" ) {
$err++;
}
foreach my $i (1..$num_jobs) {
$err++ unless (-e "$logpath/dump_by_seq_region.$dbtype.$i.success");
}
if ( $err > 0 ) {
$logger->error( "At least one of your jobs failed.\n"
. "Please check the logfiles at $logpath for errors.\n" );
if ($err) {
$logger->error("At least one of your jobs failed.\nPlease check the logfiles at $logpath for errors.\n");
return 1;
}
} ## end foreach my $dbtype (qw(source target))
}
return 0;
}
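The submission logic above follows a common LSF pattern: pipe the payload command to bsub as a throttled job array, then submit a blocking dependent job that returns only once every array element has ended. A self-contained sketch of that pattern, with illustrative job names, paths, and payload (an LSF installation providing bsub is assumed):

use strict;
use warnings;

my $num_jobs   = 10;
my $concurrent = 5;                   # at most 5 elements running at once
my $lsf_name   = 'example_' . time;

# Submit the array; %I in the -o/-e paths expands to the element index.
open( my $bsub, '|-',
      qq{bsub -J '$lsf_name\[1-$num_jobs\]%$concurrent' }
    . qq{-o /tmp/example.%I.out -e /tmp/example.%I.err} )
  or die "Could not open pipe to bsub: $!";

# Single-quoted so $LSB_JOBINDEX is expanded by LSF at run time, not here.
print $bsub 'echo "running element $LSB_JOBINDEX"';
close($bsub) or die "bsub submission failed: $!";

# Block until every element has ended (-K keeps bsub in the foreground;
# "ended" fires on both successful and failed elements).
system(qq{bsub -K -w "ended($lsf_name)" -o /tmp/example.depend.out /bin/true}) == 0
  or die "Error submitting dependent job: $!";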
@@ -357,32 +304,3 @@ sub build_cache_all {
}
# EG new method for getting species IDs
sub get_species_ids {
my ($prefix) = @_;
my @speciesIds;
my $dsn =
"DBI:mysql:database="
. $conf->param("${prefix}dbname")
. ";host="
. $conf->param("${prefix}host")
. ";port="
. $conf->param("${prefix}port");
my $ensemblCoreDbh = DBI->connect( $dsn,
$conf->param("${prefix}user"),
$conf->param("${prefix}pass") )
|| die "Cannot connect to server: $DBI::errstr\n";
my $query = "SELECT DISTINCT meta_value, species_id
FROM meta WHERE meta_key = 'species.production_name'";
my $psmt = $ensemblCoreDbh->prepare($query);
$psmt->execute();
while ( my (@results) = $psmt->fetchrow() ) {
push @speciesIds, [ $results[0], $results[1] ];
}
return \@speciesIds;
} ## end sub get_species_ids
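The callers above treat the return value as a reference to a list of [production_name, species_id] pairs; a short usage sketch (loop variable names are illustrative):

# Each element is a two-field array ref: [ production_name, species_id ].
for my $species ( @{ get_species_ids('target') } ) {
  my ( $name, $id ) = @{$species};
  print "species $id: $name\n";
}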
@@ -56,8 +56,7 @@ no warnings 'uninitialized';
use FindBin qw($Bin);
use Bio::EnsEMBL::Utils::ConfParser;
use Bio::EnsEMBL::Utils::Logger;
use Bio::EnsEMBL::Utils::Exception qw(throw);
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append inject);
use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
use Bio::EnsEMBL::IdMapping::Cache;
use Bio::EnsEMBL::IdMapping::ExonScoreBuilder;
use Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder;
@@ -78,38 +77,26 @@ my $conf = new Bio::EnsEMBL::Utils::ConfParser(
);
$conf->parse_options(
'mode=s' => 0,
'basedir|basedir=s' => 1,
'chromosomes|chr=s@' => 0,
'region=s' => 0,
'biotypes=s@' => 0,
'min_exon_length|minexonlength=i' => 0,
'exonerate_path|exoneratepath=s' => 1,
'exonerate_threshold|exoneratethreshold=f' => 0,
'exonerate_jobs|exoneratejobs=i' => 0,
'mode=s' => 0,
'basedir|basedir=s' => 1,
'chromosomes|chr=s@' => 0,
'region=s' => 0,
'biotypes=s@' => 0,
'min_exon_length|minexonlength=i' => 0,
'exonerate_path|exoneratepath=s' => 1,
'exonerate_threshold|exoneratethreshold=f' => 0,
'exonerate_jobs|exoneratejobs=i' => 0,
'exonerate_bytes_per_job|exoneratebytesperjob=f' => 0,
'exonerate_extra_params|exonerateextraparams=s' => 0,
'plugin_internal_id_mappers_gene=s@' => 0,
'plugin_internal_id_mappers_transcript=s@' => 0,
'plugin_internal_id_mappers_exon=s@' => 0,
'mapping_types=s@' => 1,
'plugin_stable_id_generator=s' => 0,
'upload_events|uploadevents=s' => 0,
'upload_stable_ids|uploadstableids=s' => 0,
'upload_archive|uploadarchive=s' => 0,
# EG allow additional configs to be set on command line
'sourcedbname=s' => 1,
'sourcehost=s' => 1,
'sourceuser=s' => 1,
'sourcepass=s' => 0,
'sourceport=i' => 1,
'targetdbname=s' => 1,
'targethost=s' => 1,
'targetuser=s' => 1,
'targetpass=s' => 0,
'targetport=i' => 1,
'species_id' => 0,
'species_name' => 0 );
'exonerate_extra_params|exonerateextraparams=s' => 0,
'plugin_internal_id_mappers_gene=s@' => 0,
'plugin_internal_id_mappers_transcript=s@' => 0,
'plugin_internal_id_mappers_exon=s@' => 0,
'mapping_types=s@' => 1,
'plugin_stable_id_generator=s' => 0,
'upload_events|uploadevents=s' => 0,
'upload_stable_ids|uploadstableids=s' => 0,
'upload_archive|uploadarchive=s' => 0,
);
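The option specifications use Getopt::Long syntax: =s takes a string, =i an integer, =f a float, and a trailing @ collects repeated occurrences into a list; the 0/1 values tell ConfParser whether the option is optional or mandatory. A minimal plain-Getopt::Long sketch of the same spec style, reusing a few of the option names above:

use strict;
use warnings;
use Getopt::Long;

my ( @biotypes, $min_exon_length, $threshold );

GetOptions(
  'biotypes=s@'                              => \@biotypes,        # repeatable string
  'min_exon_length|minexonlength=i'          => \$min_exon_length, # integer
  'exonerate_threshold|exoneratethreshold=f' => \$threshold,       # float
) or die "Error parsing command line\n";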
# set default logpath
unless ($conf->param('logpath')) {
@@ -144,109 +131,35 @@ my $transcript_mappings;
my $gene_mappings;
my $translation_mappings;
# EG mapping code reworked to iterate over different species
# loading cache from file
my $cache = Bio::EnsEMBL::IdMapping::Cache->new(
-LOGGER => $logger,
-CONF => $conf,
-LOAD_INSTANCE => 1,
);
# get a stable ID mapper
my $stable_id_mapper = Bio::EnsEMBL::IdMapping::StableIdMapper->new(
-LOGGER => $logger,
-CONF => $conf,
-CACHE => $cache
);
# find out which entities we want to map
my %mapping_types = ();
foreach my $type ( $conf->param('mapping_types') ) {
foreach my $type ($conf->param('mapping_types')) {
$mapping_types{$type} = 1;
}
# EG get list of species
my @species_ids = @{ get_species_ids( "target", $conf ) };
my $basedir = $conf->param('basedir');
# EG create placeholder cache based on the first species to allow us to
# create a stable ID generator to reuse for all species
my $stable_id_mapper;
my $s = $species_ids[0];
$conf->param( 'basedir', path_append( $basedir, $$s[1] ) );
$conf->param( 'species_id', $$s[1] );
$conf->param( 'species_name', $$s[0] );
# loading cache from file
my $cache = Bio::EnsEMBL::IdMapping::Cache->new(
-LOGGER => $logger,
-CONF => $conf,
-LOAD_INSTANCE => 1,
-SPECIES_ID => $$s[1], # EG
-SPECIES_NAME => $$s[0] # EG
);
my $type_count_sql = { gene => 'select count(*) '
. 'from gene_stable_id '
. 'join gene using (gene_id) '
. 'join seq_region using (seq_region_id) '
. 'join coord_system using (coord_system_id) '
. 'where species_id=?',
transcript => 'select count(*) '
. 'from transcript_stable_id '
. 'join transcript using (transcript_id) '
. 'join seq_region using (seq_region_id) '
. 'join coord_system using (coord_system_id) '
. 'where species_id=?',
exon => 'select count(*) '
. 'from exon_stable_id '
. 'join exon using (exon_id) '
. 'join seq_region using (seq_region_id) '
. 'join coord_system using (coord_system_id) '
. 'where species_id=?',
translation => 'select count(*) '
. 'from translation_stable_id '
. 'join translation using (translation_id) '
. 'join transcript using (transcript_id) '
. 'join seq_region using (seq_region_id) '
. 'join coord_system using (coord_system_id) '
. 'where species_id=?' };
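Each statement counts stable IDs of one object type for a single species, using the seq_region/coord_system joins to apply the species_id filter; get_stable_id_count() further down consumes them roughly as follows ($dbh and $species_id as obtained there):

# Usage sketch for the per-type count SQL above.
my $sth = $dbh->prepare( $type_count_sql->{gene} );
$sth->execute($species_id);
my ($count) = $sth->fetchrow_array();
$sth->finish();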
# EG get the stable ID generator and inject the cache into it
my $stable_id_generator = $conf->param('plugin_stable_id_generator')
|| 'Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric';
inject($stable_id_generator);
# create a new stable ID generator object
my $generator_instance =
$stable_id_generator->new( -logger => $logger,
-conf => $conf,
-cache => $cache );
# EG iterate over species
for my $species (@species_ids) {
# EG create a new cache for the current species
print( "Handling species " . $$species[0] . "/" . $$species[1],
"\n" );
$conf->param( 'basedir', path_append( $basedir, $$species[1] ) );
$conf->param( 'species_id', $$species[1] );
$conf->param( 'species_name', $$species[0] );
$cache =
Bio::EnsEMBL::IdMapping::Cache->new( -LOGGER => $logger,
-CONF => $conf,
-LOAD_INSTANCE => 1 );
# get a stable ID mapper
$stable_id_mapper =
Bio::EnsEMBL::IdMapping::StableIdMapper->new( -LOGGER => $logger,
-CONF => $conf,
-CACHE => $cache );
# EG replace the generator's cache with the species-specific one
$generator_instance->cache($cache);
$stable_id_mapper->stable_id_generator($generator_instance);
##? # find out which entities we want to map
##? my %mapping_types = ();
##? foreach my $type ( $conf->param('mapping_types') ) {
##? $mapping_types{$type} = 1;
##? }
# run in requested mode
my $mode = $conf->param('mode') || 'normal';
if ( $mode eq 'mapping' ) { $mode = 'normal' }
my $run = "run_$mode";
no strict 'refs';
&$run;
} ## end for my $species (@species_ids)
# run in requested mode
my $mode = $conf->param('mode') || 'normal';
if ( $mode eq 'mapping' ) { $mode = 'normal' }
my $run = "run_$mode";
no strict 'refs';
&$run;
# finish logfile
@@ -474,73 +387,49 @@ sub archive {
sub upload_mapping_session_and_events {
if ( $conf->is_true('upload_events') and !$conf->param('dry_run') ) {
$logger->info(
"Uploading mapping_session and stable_id_event tables...\n");
if ($conf->is_true('upload_events') and ! $conf->param('dry_run')) {
$logger->info("Uploading mapping_session and stable_id_event tables...\n");
my $i = 0;
my $j = 0;
$logger->info( "mapping_session...\n", 1 );
$i +=
$stable_id_mapper->upload_file_into_table( 'target',
'mapping_session', 'mapping_session.txt',
$conf->param('species_id') > 1 ? 1 : 0 );
$logger->info( "$i\n", 1 );
$logger->info( "stable_id_event...\n", 1 );
$j += $stable_id_mapper->upload_file_into_table( 'target',
'stable_id_event', 'stable_id_event_existing.txt' );
$j += $stable_id_mapper->upload_file_into_table( 'target',
'stable_id_event', 'stable_id_event_new.txt', 1 );
$j += $stable_id_mapper->upload_file_into_table( 'target',
'stable_id_event', 'stable_id_event_similarity.txt', 1 );
$logger->info( "$j\n", 1 );
$logger->info("mapping_session...\n", 1);
$i += $stable_id_mapper->upload_file_into_table('target', 'mapping_session',
'mapping_session.txt');
$logger->info("$i\n", 1);
$logger->info("stable_id_event...\n", 1);
$j += $stable_id_mapper->upload_file_into_table('target', 'stable_id_event',
'stable_id_event_existing.txt');
$j += $stable_id_mapper->upload_file_into_table('target', 'stable_id_event',
'stable_id_event_new.txt', 1);
$j += $stable_id_mapper->upload_file_into_table('target', 'stable_id_event',
'stable_id_event_similarity.txt', 1);
$logger->info("$j\n", 1);
$logger->info("Done.\n\n");
} else {
$logger->info(
"Stable ID event and mapping session tables not uploaded.\n\n");
$logger->info("Stable ID event and mapping session tables not uploaded.\n\n");
}
} ## end sub upload_mapping_session_and_events
}
sub upload_stable_ids {
if ( $conf->is_true('upload_stable_ids')
&& !$conf->param('dry_run') )
{
my $species_id = $conf->param('species_id');
if ($conf->is_true('upload_stable_ids') and ! $conf->param('dry_run')) {
$logger->info("Uploading stable ID tables...\n");
foreach my $t ( $conf->param('mapping_types') ) {
$logger->info( "${t}_stable_id...\n", 1 );
# EG check if empty for species
my $cnt = get_stable_id_count( $t, 'target', $species_id );
if ( $cnt > 0 ) {
$logger->warning( "Existing stable IDs found "
. "for $t for species ID $species_id "
. "- not uploading",
1 );
$logger->info( "Data not uploaded!\n", 1 );
} else {
my $i = $stable_id_mapper->upload_file_into_table( 'target',
"${t}_stable_id", "${t}_stable_id.txt", 1 );
$logger->info( "$i\n", 1 );
}
foreach my $t ($conf->param('mapping_types')) {
$logger->info("${t}_stable_id...\n", 1);
my $i = $stable_id_mapper->upload_file_into_table('target',
"${t}_stable_id", "${t}_stable_id.txt");
$logger->info("$i\n", 1);
}
$logger->info("Done.\n\n");
} else {
$logger->info("Stable ID tables not uploaded.\n\n");
}
@@ -650,61 +539,4 @@ sub log_cache_stats {
}
}
sub get_stable_id_count {
my ( $type, $dbtype, $species_id ) = @_;
# EG new subroutine for finding if stable ID table has entries for
# this species already
my $dba = $cache->get_DBAdaptor($dbtype);
my $dbh = $dba->dbc->db_handle;
# check table is empty
my $sql = $type_count_sql->{$type};
if ( !defined($sql) ) {
throw("Cannot count stable ids of type $type");
}
my $sth = $dbh->prepare($sql);
$sth->execute($species_id);
my ($c) = $sth->fetchrow_array;
$sth->finish;
return $c;
}
sub get_species_ids {
my ( $prefix, $conf ) = @_;
# EG additional subroutine for determining which species IDs are
# present
my @speciesIds;
my $dsn =
"DBI:mysql:database="
. $conf->param("${prefix}dbname")
. ";host="
. $conf->param("${prefix}host")
. ";port="
. $conf->param("${prefix}port");
my $ensemblCoreDbh = DBI->connect( $dsn,
$conf->param("${prefix}user"),
$conf->param("${prefix}pass") )
|| die "Cannot connect to server: $DBI::errstr\n";
my $query = "SELECT DISTINCT meta_value, species_id
FROM meta WHERE meta_key = 'species.production_name'";
my $psmt = $ensemblCoreDbh->prepare($query);
$psmt->execute();
while ( my (@results) = $psmt->fetchrow() ) {
push @speciesIds, [ $results[0], $results[1] ];
}
return \@speciesIds;
} ## end sub get_species_ids
@@ -52,6 +52,6 @@ if [[ ${conf#/} == ${conf} ]]; then
exit
fi
./run.pl --lsf --conf=${conf} --mode=${mode} --logauto
./run.pl --lsf --conf=${conf} --logauto --mode=${mode}
# $Id$
@@ -67,23 +67,12 @@ my $conf = new Bio::EnsEMBL::Utils::ConfParser(
-DEFAULT_CONF => "$Bin/default.conf"
);
$conf->parse_options( 'basedir|basedir=s' => 1,
'index|i=n' => 1,
'chromosomes|chr=s@' => 0,
'region=s' => 0,
'species_id=i' => 1,
'species_name=s' => 1,
'sourcedbname=s' => 1,
'sourcehost=s' => 1,
'sourceuser=s' => 1,
'sourcepass=s' => 0,
'sourceport=i' => 1,
'targetdbname=s' => 1,
'targethost=s' => 1,
'targetuser=s' => 1,
'targetpass=s' => 0,
'targetport=i' => 1 );
$conf->parse_options(
'basedir|basedir=s' => 1,
'index|i=n' => 1,
'chromosomes|chr=s@' => 0,
'region=s' => 0,
);
# append job index to logfile name
my $index = $conf->param('index');