From b17d8adb4309c291e59b51245d888252fe0266ff Mon Sep 17 00:00:00 2001
From: Patrick Meidl <pm2@sanger.ac.uk>
Date: Thu, 26 Oct 2006 10:35:14 +0000
Subject: [PATCH] score building, first full draft

---
 misc-scripts/id_mapping/default.conf          | 32 +++++++++++-------
 misc-scripts/id_mapping/dump_by_seq_region.pl |  5 +++
 misc-scripts/id_mapping/dump_cache.pl         | 23 ++++++++-----
 misc-scripts/id_mapping/id_mapping.pl         | 33 +++++++++++--------
 misc-scripts/id_mapping/run_all.pl            | 13 ++++++--
 5 files changed, 70 insertions(+), 36 deletions(-)

diff --git a/misc-scripts/id_mapping/default.conf b/misc-scripts/id_mapping/default.conf
index f3a3e09a08..df630e2dcd 100644
--- a/misc-scripts/id_mapping/default.conf
+++ b/misc-scripts/id_mapping/default.conf
@@ -5,22 +5,30 @@ dry_run = 1
 verbose = 1
 
 ; prepend this path to your 'log' parameter
-logpath = /ecs2/work2/pm2/logs/test
+logpath = /ecs2/work2/pm2/logs/test_20060926
 
 ; old db settings
-oldhost                 = ecs4
-oldport                 = 3350
-olduser                 = ensro
-olddbname               = pm2_homo_sapiens_core_39_36a
+sourcehost                  = ecs2
+sourceport                  = 3364
+sourceuser                  = ensro
+sourcedbname                = homo_sapiens_core_41_36c
 
 ; new db settings
-newhost                 = ecs4
-newport                 = 3350
-newuser                 = ensadmin
-newpass                 = ensembl
-newdbname               = pm2_homo_sapiens_core_39_36a
+targethost                  = ecs2
+targetport                  = 3364
+targetuser                  = ensro
+targetdbname                = homo_sapiens_core_41_36c
 
 ; cache
-dumppath                = /ecs2/scratch3/pm2/data/test
-;region                  = chromosome:NCBI36:X:1000000:2000000:1
+dumppath                    = /ecs2/scratch3/pm2/data/test_20060926
+;region                     = chromosome:NCBI36:X:1:2000000:1
+
+; ScoreBuilder
+min_exon_length             = 15
+exonerate_path              = /usr/local/ensembl/bin/exonerate-0.8.2
+exonerate_threshold         = 0.5
+exonerate_jobs              = 0
+exonerate_bytes_per_job     = 250000
+
+transcript_score_threshold  = 0
 
diff --git a/misc-scripts/id_mapping/dump_by_seq_region.pl b/misc-scripts/id_mapping/dump_by_seq_region.pl
index b0742a0b0c..6d02f75888 100755
--- a/misc-scripts/id_mapping/dump_by_seq_region.pl
+++ b/misc-scripts/id_mapping/dump_by_seq_region.pl
@@ -149,6 +149,11 @@ my $i = 0;
 my $size = 0;
 ($i, $size) = $cache->build_cache($dbtype, $slice_name);
 
+# set flag to indicate everything went fine (dump_cache.pl looks for this file in its lsf_dump_cache log dir)
+my $success_file = ($conf->param('logpath')||$conf->param('dumppath'))."/lsf_dump_cache/dump_by_seq_region.$dbtype.$slice_name.success";
+open(my $success_fh, '>', $success_file) or die "Can't open $success_file for writing: $!";
+close($success_fh) or die "Can't close $success_file: $!";
+
 # log success
 $logger->log("Done with $dbtype $slice_name (genes: $i, filesize: $size, runtime: ".$logger->runtime." ".$logger->date_and_mem."\n");
 
diff --git a/misc-scripts/id_mapping/dump_cache.pl b/misc-scripts/id_mapping/dump_cache.pl
index ab4895654d..a544997f15 100755
--- a/misc-scripts/id_mapping/dump_cache.pl
+++ b/misc-scripts/id_mapping/dump_cache.pl
@@ -31,11 +31,11 @@ Optional arguments:
 
 =head1 DESCRIPTION
 
-Use --sourceschema and --targetschema to specify a schema version (default: latest).
-This will be used to determine the subroutine to build the cache. By default,
-&build_cache_latest() is run which uses Bio::EnsEMBL::IdMapping::Cache to read
-from the database and write the cache.  An alternative subroutine can use a
-different module for that, which will usually inherit from the former and
+Use --sourceschema and --targetschema to specify a schema version (default:
+latest). This will be used to determine the subroutine to build the cache. By
+default, &build_cache_latest() is run which uses Bio::EnsEMBL::IdMapping::Cache
+to read from the database and write the cache.  An alternative subroutine can
+use a different module for that, which will usually inherit from the former and
 overwrite Cache->build_cache(). This is useful for backwards compatibility with
 older schema versions. Once the cache is built, no API access is needed,
 therefore the ID mapping application is independent of the underlying database
@@ -144,10 +144,10 @@ $conf->check_required_params(
 my %jobs;
 
 # create empty directory for logs
-my $logpath = ($conf->param('logpath')||$conf->param('dumppath')).'/lsf';
+my $logpath = ($conf->param('logpath')||$conf->param('dumppath')).'/lsf_dump_cache';
 system("rm -rf $logpath") == 0 or
   $logger->log_error("Unable to delete lsf log dir $logpath: $!\n");
-system("mkdir $logpath") == 0 or
+system("mkdir -p $logpath") == 0 or
   $logger->log_error("Can't create lsf log dir $logpath: $!\n");
 
 # submit jobs to lsf
@@ -164,6 +164,7 @@ foreach my $dbtype (qw(source target)) {
 # monitor progress
 my $err;
 my $total = scalar(keys %jobs);
+my @types;
 
 while (keys %jobs) {
   foreach my $type (keys %jobs) {
@@ -174,6 +175,7 @@ while (keys %jobs) {
 
     # the job has finished if you find the error logfile
     delete($jobs{$type}) if (-e $err_log);
+    push @types, $type unless (exists $jobs{$type}); # record each job once, when it has finished
   }
 
   $logger->log("Jobs waiting: ".scalar(keys %jobs)."/$total.\r");
@@ -184,10 +186,14 @@ while (keys %jobs) {
 $logger->log("\n\n");
 
 # check if anything went wrong
+foreach my $type (@types) {
+  $err++ unless (-e "$logpath/dump_by_seq_region.$type.success");
+}
+
 my $retval = 0;
 if ($err) {
   $logger->log("At least one of your jobs failed.\n");
-  $logger->log("Please check $logpath for errors.\n");
+  $logger->log("Please check $logpath and ".$conf->param('logpath')."/dump_by_seq_region.log for errors.\n");
   $retval = 1;
 }
 
@@ -243,6 +249,7 @@ sub bsubmit {
             dbtype        => $dbtype,
             slice_name    => $slice_name,
             cache_impl    => ref($cache),
+            log_append    => 1,
         },
         -EXCLUDE => [qw(region chromosomes)]
     );
diff --git a/misc-scripts/id_mapping/id_mapping.pl b/misc-scripts/id_mapping/id_mapping.pl
index 4604aaa254..6ebc109de4 100755
--- a/misc-scripts/id_mapping/id_mapping.pl
+++ b/misc-scripts/id_mapping/id_mapping.pl
@@ -67,6 +67,7 @@ use Bio::EnsEMBL::Utils::ConfParser;
 use Bio::EnsEMBL::Utils::Logger;
 use Bio::EnsEMBL::IdMapping::Cache;
 use Bio::EnsEMBL::IdMapping::ExonScoreBuilder;
+use Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder;
 
 #use Devel::Size qw(size total_size);
 #use Data::Dumper;
@@ -88,13 +89,19 @@ $conf->parse_extra_options(qw(
   chromosomes|chr=s@
   region=s
   biotypes=s@
+  min_exon_length|minexonlength=i
+  exonerate_path|exoneratepath=s
+  exonerate_threshold|exoneratethreshold=f
+  exonerate_jobs|exoneratejobs=i
+  exonerate_bytes_per_job|exoneratebytesperjob=i
 ));
 $conf->allowed_params(
   $conf->get_common_params,
   qw(
-    mode
-    dumppath
+    mode dumppath
     chromosomes region biotypes
+    min_exon_length
+    exonerate_path exonerate_threshold exonerate_jobs exonerate_bytes_per_job
   )
 );
 
@@ -122,6 +129,7 @@ $logger->init_log($conf->list_all_params);
 $conf->check_required_params(
   qw(
     dumppath
+    exonerate_path
   )
 );
 
@@ -137,21 +145,14 @@ my $cache = Bio::EnsEMBL::IdMapping::Cache->new(
   -LOGGER       => $logger,
   -CONF         => $conf,
 );
-
-foreach my $dbtype (qw(source target)) {
-  foreach my $slice_name (@{ $cache->slice_names($dbtype) }) {
-    $logger->log("\n");
-    $cache->read_from_file('exons_by_id', "$dbtype.$slice_name");
-  }
-}
-
-$cache->merge('exons_by_id');
+$cache->read_instance_from_file;
 
 
 # run in requested mode
 my $mode = $conf->param('mode') || 'normal';
+my $run = "run_$mode";
 no strict 'refs';
-&run_$mode;
+&$run;
 
 
 # finish logfile
@@ -173,11 +174,15 @@ sub build_scores {
     -CONF         => $conf,
     -CACHE        => $cache
   );
-  #my $tsb = Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder->new(
+  my $tsb = Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder->new(
+    -LOGGER       => $logger,
+    -CONF         => $conf,
+    -CACHE        => $cache
+  );
   #my $gsb = Bio::EnsEMBL::IdMapping::GeneScoreBuilder->new(
 
   $exon_scores = $esb->score_exons;
-  #$transcript_scores = $tsb->score_transcripts;
+  $transcript_scores = $tsb->score_transcripts;
   #$gene_scores = $gsb->score_genes;
 }
 
diff --git a/misc-scripts/id_mapping/run_all.pl b/misc-scripts/id_mapping/run_all.pl
index 5a0218b3e1..96791b2a07 100755
--- a/misc-scripts/id_mapping/run_all.pl
+++ b/misc-scripts/id_mapping/run_all.pl
@@ -92,6 +92,11 @@ $conf->parse_extra_options(qw(
   chromosomes|chr=s@
   region=s
   biotypes=s@
+  min_exon_length|minexonlength=i
+  exonerate_path|exoneratepath=s
+  exonerate_threshold|exoneratethreshold=f
+  exonerate_jobs|exoneratejobs=i
+  exonerate_bytes_per_job|exoneratebytesperjob=i
 ));
 $conf->allowed_params(
   $conf->get_common_params,
@@ -101,6 +106,8 @@ $conf->allowed_params(
     mode
     dumppath cachefile
     chromosomes region biotypes
+    min_exon_length
+    exonerate_path exonerate_threshold exonerate_jobs exonerate_bytes_per_job
   )
 );
 
@@ -141,7 +148,8 @@ $options{'dump_cache'} = $conf->create_commandline_options(
         interactive => 0,
         is_component => 1,
     },
-    -EXCLUDE => [qw(mode)]
+    -EXCLUDE => [qw(mode min_exon_length exonerate_path exonerate_threshold
+                    exonerate_jobs exonerate_bytes_per_job)]
 );
 
 $options{'id_mapping'} = $conf->create_commandline_options(
@@ -161,8 +169,9 @@ $options{'id_mapping'} = $conf->create_commandline_options(
 
 # run components, depending on mode
 my $mode = $conf->param('mode') || 'normal';
+my $sub = "run_$mode";
 no strict 'refs';
-&run_$mode;
+&$sub;
 
 
 # finish logfile
-- 
GitLab