From b17d8adb4309c291e59b51245d888252fe0266ff Mon Sep 17 00:00:00 2001 From: Patrick Meidl <pm2@sanger.ac.uk> Date: Thu, 26 Oct 2006 10:35:14 +0000 Subject: [PATCH] score building, first full draft --- misc-scripts/id_mapping/default.conf | 32 +++++++++++------- misc-scripts/id_mapping/dump_by_seq_region.pl | 5 +++ misc-scripts/id_mapping/dump_cache.pl | 23 ++++++++----- misc-scripts/id_mapping/id_mapping.pl | 33 +++++++++++-------- misc-scripts/id_mapping/run_all.pl | 13 ++++++-- 5 files changed, 70 insertions(+), 36 deletions(-) diff --git a/misc-scripts/id_mapping/default.conf b/misc-scripts/id_mapping/default.conf index f3a3e09a08..df630e2dcd 100644 --- a/misc-scripts/id_mapping/default.conf +++ b/misc-scripts/id_mapping/default.conf @@ -5,22 +5,30 @@ dry_run = 1 verbose = 1 ; prepend this path to your 'log' parameter -logpath = /ecs2/work2/pm2/logs/test +logpath = /ecs2/work2/pm2/logs/test_20060926 ; old db settings -oldhost = ecs4 -oldport = 3350 -olduser = ensro -olddbname = pm2_homo_sapiens_core_39_36a +sourcehost = ecs2 +sourceport = 3364 +sourceuser = ensro +sourcedbname = homo_sapiens_core_41_36c ; new db settings -newhost = ecs4 -newport = 3350 -newuser = ensadmin -newpass = ensembl -newdbname = pm2_homo_sapiens_core_39_36a +targethost = ecs2 +targetport = 3364 +targetuser = ensro +targetdbname = homo_sapiens_core_41_36c ; cache -dumppath = /ecs2/scratch3/pm2/data/test -;region = chromosome:NCBI36:X:1000000:2000000:1 +dumppath = /ecs2/scratch3/pm2/data/test_20060926 +;region = chromosome:NCBI36:X:1:2000000:1 + +; ScoreBuilder +min_exon_length = 15 +exonerate_path = /usr/local/ensembl/bin/exonerate-0.8.2 +exonerate_threshold = 0.5 +exonerate_jobs = 0 +exonerate_bytes_per_job = 250000 + +transcript_score_threshold = 0 diff --git a/misc-scripts/id_mapping/dump_by_seq_region.pl b/misc-scripts/id_mapping/dump_by_seq_region.pl index b0742a0b0c..6d02f75888 100755 --- a/misc-scripts/id_mapping/dump_by_seq_region.pl +++ b/misc-scripts/id_mapping/dump_by_seq_region.pl @@ -149,6 +149,11 @@ my $i = 0; my $size = 0; ($i, $size) = $cache->build_cache($dbtype, $slice_name); +# set flag to indicate everything went fine +my $success_file = $conf->param('logpath')."/lsf/dump_by_seq_region.$dbtype.$slice_name.success"; +open(TMPFILE, '>', $success_file) and close TMPFILE + or die "Can't open $success_file for writing: $!"; + # log success $logger->log("Done with $dbtype $slice_name (genes: $i, filesize: $size, runtime: ".$logger->runtime." ".$logger->date_and_mem."\n"); diff --git a/misc-scripts/id_mapping/dump_cache.pl b/misc-scripts/id_mapping/dump_cache.pl index ab4895654d..a544997f15 100755 --- a/misc-scripts/id_mapping/dump_cache.pl +++ b/misc-scripts/id_mapping/dump_cache.pl @@ -31,11 +31,11 @@ Optional arguments: =head1 DESCRIPTION -Use --sourceschema and --targetschema to specify a schema version (default: latest). -This will be used to determine the subroutine to build the cache. By default, -&build_cache_latest() is run which uses Bio::EnsEMBL::IdMapping::Cache to read -from the database and write the cache. An alternative subroutine can use a -different module for that, which will usually inherit from the former and +Use --sourceschema and --targetschema to specify a schema version (default: +latest). This will be used to determine the subroutine to build the cache. By +default, &build_cache_latest() is run which uses Bio::EnsEMBL::IdMapping::Cache +to read from the database and write the cache. An alternative subroutine can +use a different module for that, which will usually inherit from the former and overwrite Cache->build_cache(). This is useful for backwards compatibility with older schema versions. Once the cache is built, no API access is needed, therefore the ID mapping application is independent of the underlying database @@ -144,10 +144,10 @@ $conf->check_required_params( my %jobs; # create empty directory for logs -my $logpath = ($conf->param('logpath')||$conf->param('dumppath')).'/lsf'; +my $logpath = ($conf->param('logpath')||$conf->param('dumppath')).'/lsf_dump_cache'; system("rm -rf $logpath") == 0 or $logger->log_error("Unable to delete lsf log dir $logpath: $!\n"); -system("mkdir $logpath") == 0 or +system("mkdir -p $logpath") == 0 or $logger->log_error("Can't create lsf log dir $logpath: $!\n"); # submit jobs to lsf @@ -164,6 +164,7 @@ foreach my $dbtype (qw(source target)) { # monitor progress my $err; my $total = scalar(keys %jobs); +my @types; while (keys %jobs) { foreach my $type (keys %jobs) { @@ -174,6 +175,7 @@ while (keys %jobs) { # the job has finished if you find the error logfile delete($jobs{$type}) if (-e $err_log); + push @types, $type; } $logger->log("Jobs waiting: ".scalar(keys %jobs)."/$total.\r"); @@ -184,10 +186,14 @@ while (keys %jobs) { $logger->log("\n\n"); # check if anything went wrong +foreach my $type (@types) { + $err++ unless (-e "$logpath/dump_by_seq_region.$type.success"); +} + my $retval = 0; if ($err) { $logger->log("At least one of your jobs failed.\n"); - $logger->log("Please check $logpath for errors.\n"); + $logger->log("Please check $logpath and ".$conf->param('logpath')."/dump_by_seq_region.log for errors.\n"); $retval = 1; } @@ -243,6 +249,7 @@ sub bsubmit { dbtype => $dbtype, slice_name => $slice_name, cache_impl => ref($cache), + log_append => 1, }, -EXCLUDE => [qw(region chromosomes)] ); diff --git a/misc-scripts/id_mapping/id_mapping.pl b/misc-scripts/id_mapping/id_mapping.pl index 4604aaa254..6ebc109de4 100755 --- a/misc-scripts/id_mapping/id_mapping.pl +++ b/misc-scripts/id_mapping/id_mapping.pl @@ -67,6 +67,7 @@ use Bio::EnsEMBL::Utils::ConfParser; use Bio::EnsEMBL::Utils::Logger; use Bio::EnsEMBL::IdMapping::Cache; use Bio::EnsEMBL::IdMapping::ExonScoreBuilder; +use Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder; #use Devel::Size qw(size total_size); #use Data::Dumper; @@ -88,13 +89,19 @@ $conf->parse_extra_options(qw( chromosomes|chr=s@ region=s biotypes=s@ + min_exon_length|minexonlength=i + exonerate_path|exoneratepath=s + exonerate_threshold|exoneratethreshold=i + exonerate_jobs|exoneratejobs=i + exonerate_bytes_per_job|exoneratebytesperjob=i )); $conf->allowed_params( $conf->get_common_params, qw( - mode - dumppath + mode dumppath chromosomes region biotypes + min_exon_length + exonerate_path exonerate_threshold exonerate_jobs exonerate_byte_per_job ) ); @@ -122,6 +129,7 @@ $logger->init_log($conf->list_all_params); $conf->check_required_params( qw( dumppath + exonerate_path ) ); @@ -137,21 +145,14 @@ my $cache = Bio::EnsEMBL::IdMapping::Cache->new( -LOGGER => $logger, -CONF => $conf, ); - -foreach my $dbtype (qw(source target)) { - foreach my $slice_name (@{ $cache->slice_names($dbtype) }) { - $logger->log("\n"); - $cache->read_from_file('exons_by_id', "$dbtype.$slice_name"); - } -} - -$cache->merge('exons_by_id'); +$cache->read_instance_from_file; # run in requested mode my $mode = $conf->param('mode') || 'normal'; +my $run = "run_$mode"; no strict 'refs'; -&run_$mode; +&$run; # finish logfile @@ -173,11 +174,15 @@ sub build_scores { -CONF => $conf, -CACHE => $cache ); - #my $tsb = Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder->new( + my $tsb = Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder->new( + -LOGGER => $logger, + -CONF => $conf, + -CACHE => $cache + ); #my $gsb = Bio::EnsEMBL::IdMapping::GeneScoreBuilder->new( $exon_scores = $esb->score_exons; - #$transcript_scores = $tsb->score_transcripts; + $transcript_scores = $tsb->score_transcripts; #$gene_scores = $gsb->score_genes; } diff --git a/misc-scripts/id_mapping/run_all.pl b/misc-scripts/id_mapping/run_all.pl index 5a0218b3e1..96791b2a07 100755 --- a/misc-scripts/id_mapping/run_all.pl +++ b/misc-scripts/id_mapping/run_all.pl @@ -92,6 +92,11 @@ $conf->parse_extra_options(qw( chromosomes|chr=s@ region=s biotypes=s@ + min_exon_length|minexonlength=i + exonerate_path|exoneratepath=s + exonerate_threshold|exoneratethreshold=i + exonerate_jobs|exoneratejobs=i + exonerate_bytes_per_job|exoneratebytesperjob=i )); $conf->allowed_params( $conf->get_common_params, @@ -101,6 +106,8 @@ $conf->allowed_params( mode dumppath cachefile chromosomes region biotypes + min_exon_length + exonerate_path exonerate_threshold exonerate_jobs exonerate_byte_per_job ) ); @@ -141,7 +148,8 @@ $options{'dump_cache'} = $conf->create_commandline_options( interactive => 0, is_component => 1, }, - -EXCLUDE => [qw(mode)] + -EXCLUDE => [qw(mode min_exon_length exonerate_path exonerate_threshold + exonerate_jobs exonerate_byte_per_job)] ); $options{'id_mapping'} = $conf->create_commandline_options( @@ -161,8 +169,9 @@ $options{'id_mapping'} = $conf->create_commandline_options( # run components, depending on mode my $mode = $conf->param('mode') || 'normal'; +my $sub = "run_$mode"; no strict 'refs'; -&run_$mode; +&$sub; # finish logfile -- GitLab