From f4d79bf4aa57011278ab22d1b863b8206b9d2afd Mon Sep 17 00:00:00 2001
From: Glenn Proctor <gp1@sanger.ac.uk>
Date: Mon, 4 Oct 2004 10:31:13 +0000
Subject: [PATCH] More implementation: now successfully submits exonerate jobs
 and dependency job.

Improved process communication. Implemented parsing.
---
 .../xref_mapping/exonerate_prototype.pl       | 117 ++++++++++++++----
 1 file changed, 94 insertions(+), 23 deletions(-)

diff --git a/misc-scripts/xref_mapping/exonerate_prototype.pl b/misc-scripts/xref_mapping/exonerate_prototype.pl
index 6c2ea4edfe..5c69757c42 100644
--- a/misc-scripts/xref_mapping/exonerate_prototype.pl
+++ b/misc-scripts/xref_mapping/exonerate_prototype.pl
@@ -2,17 +2,17 @@ use strict;
 
 use DBI;
 use File::Basename;
+use IPC::Open3;
 
 # Use exonerate (or other program) to find xref-ensembl obejct mappings
 
 # XXX
-run_matching();
+my $queryfile = "xref_dna.fasta";
+my $targetfile = "ensembl_transcripts.fasta";
 
-sub run_matching {
+#run_exonerate($queryfile, $targetfile);
+parse_and_store($targetfile);
 
-   run_exonerate("queryfile.fasta", "targetfile.fasta");
-
-}
 
 =head2 run_exonerate
 
@@ -30,11 +30,10 @@ sub run_exonerate {
 
   my ($query, $target) = @_;
 
-  # TODO - calculate number of jobs from file sizes
-  my $num_jobs = 20;
+  my $num_jobs = calculate_num_jobs($query);
 
   # TODO - get root_dir from config
-  my $root_dir = ".";
+  my $root_dir = "/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping";
 
   # TODO - get exonerate executable from config
   my $exonerate_path = "/usr/local/ensembl/bin/exonerate-0.8.3";
@@ -45,19 +44,22 @@ sub run_exonerate {
 
   my $unique_name = "mapXrefDNA" . time();
 
-  # TODO - prefix = working dir + basename of query file
-  my $prefix = basename($query);
+  my $prefix = $root_dir . "/" . basename($query);
   $prefix =~ s/\.\w+$//;
 
   my $output = "\$LSB_JOBINDEX.map";
 
   my @dna_bsub = ( 'bsub', '-J', $unique_name . "[1-$num_jobs]", '-o', "$prefix.%I.out", '-e', "$prefix.%I.err");
 
-  print "bsub command: " . join(" ", @dna_bsub) . "\n\n";
+  #print "bsub command: " . join(" ", @dna_bsub) . "\n\n";
 
   # Open a pipe to the stdin of a bsub command with the appropriate options
   #open( BSUB, '|-' ) or exec (@dna_bsub);
 
+  # Use IPC::Open3 to open the process so that we can read and write from/to its stdout/stderr
+  my ($wtr, $rtr, $etr, $pid);
+  $pid = open3($wtr, $rtr, $etr, @dna_bsub);
+
   # Create actual execute script to be executed with LSF, and write to pipe
   my $dna_job = <<EOF;
 . /usr/local/lsf/conf/profile.lsf
@@ -69,7 +71,7 @@ rm -f /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target /tmp/$output
 lsrcp ecs1a:$root_dir/$target /tmp/\$LSB_JOBINDEX.$target
 lsrcp ecs1a:$root_dir/$query  /tmp/\$LSB_JOBINDEX.$query
 
-$exonerate_path /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target --querychunkid \$LSB_JOBINDEX --querychunktotal $num_jobs --model affine:local -M 900 --showalignment FALSE --subopt no --ryo "xrefdna %qi %ti %qab %qae %tab %tae %C %s\n" $dna_exonerate_options | grep '^xrefdna ' > /tmp/$output
+$exonerate_path /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target --querychunkid \$LSB_JOBINDEX --querychunktotal $num_jobs --showvulgar false --showalignment FALSE --ryo "xrefdna %qi %ti %qab %qae %tab %tae %C %s\n" $dna_exonerate_options | grep '^xrefdna ' > /tmp/$output
 
 lsrcp /tmp/$output ecs1a:$root_dir/$output
 
@@ -79,11 +81,23 @@ EOF
 # TODO make sure query/target are the right way round
 # TODO analysis ID
 
-  #print BSUB $dna_job;
+  print $wtr $dna_job;
 
-  print "job:\n" . $dna_job . "\n\n";
+  #print "Job:\n" . $dna_job . "\n\n";
 
-  #close(BSUB);
+  close($wtr);
+
+  # Wait until bsub has actually run - will print to its stdout ($rtr) and then close it
+  my $jobid;
+  while (<$rtr>) {
+    if (/Job <([0-9]+)> is/) {
+      $jobid = $1;
+      print "LSF job ID for main mapping job: $jobid \n"
+    }
+  }
+  if (!defined($jobid)) {
+    print STDERR "Error: could not get LSF job ID for mapping job\n";
+  }
 
   # TODO same for protein
 
@@ -92,26 +106,42 @@ EOF
   # so the exec does not return until everything is finished.
   my @depend_bsub = ('bsub', '-K', "-wended($unique_name)", '-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err", '/bin/true');
 
-  print "depend bsub:\n" . join (" ", @depend_bsub) . "\n";
+  #print "depend bsub:\n" . join (" ", @depend_bsub) . "\n";
+
+  my ($depend_wtr, $depend_rtr, $depend_etr, $depend_pid);
+  $depend_pid = open3($depend_wtr, $depend_rtr, $depend_etr, @depend_bsub);
+  my $depend_jobid;
+  while (<$depend_rtr>) {
+    if (/Job <([0-9]+)> is/) {
+      $jobid = $1;
+      print "LSF job ID for depend job: $jobid \n" ;
+    }
+  }
+  if (!defined($depend_jobid)) {
+    print STDERR "Error: could not get depend job ID\n";
+  }
 
-  # TODO - system or exec this
 
 } # run_exonerate
 
 
 sub parse_and_store {
 
+  my $target_file_name = shift;
+  my $type = get_ensembl_object_type($target_file_name);
+
   # files to write table data to
   open (OBJECT_XREF, ">object_xref.txt");
   open (IDENTITY_XREF, ">identity_xref.txt");
 
   my $total_lines = 0;
-  foreach my $file (glob <*.map>) {
+  my $total_files = 0;
+
+  foreach my $file (glob("*.map")) {
 
-    # XXX figure out type of ensembl objects we're dealing with
-    $type = 'Transcript';
     print "Parsing results from $file \n";
-    open(FILE, file);
+    open(FILE, $file);
+    $total_files++;
 
     while (<FILE>) {
 
@@ -120,7 +150,7 @@ sub parse_and_store {
       # TODO make sure query & target are the right way around
 
       print OBJECT_XREF "$target_id $type $query_id\n"; # XXX object_xref_id
-      print IDENTITY_XREF "$query_id $target_id $query_start $query_end $target_start $target_end $cigar_line $score"; # XXX object_xref_id
+      print IDENTITY_XREF "$query_id $target_id $query_start $query_end $target_start $target_end $cigar_line $score\n"; # XXX object_xref_id
 
     }
 
@@ -131,6 +161,8 @@ sub parse_and_store {
   close(IDENTITY_XREF);
   close(OBJECT_XREF);
 
+  print "Read $total_lines lines from $total_files exonerate output files\n";
+
 }
 
 =head2 get_protein_exonerate_options
@@ -163,6 +195,45 @@ sub get_protein_exonerate_options {
 
 sub get_dna_exonerate_options {
 
-   return('-q', 'normal');
+   return("--bestn", "1");
+
+}
+
+
+sub calculate_num_jobs {
+
+  my $query = shift;
+
+  my $bytes_per_job = 250000;
+
+  my $size = (stat $query)[7];
+
+  return int($size/$bytes_per_job);
+
+}
+
+sub get_ensembl_object_type {
+
+  my $filename = shift;
+  my $type;
+
+  if ($filename =~ /gene/i) {
+
+    $type = "Gene";
+
+  } elsif ($filename =~ /transcript/i) {
+
+    $type = "Transcript";
+
+  } elsif ($filename =~ /translation/i) {
+
+    $type = "Translation";
+
+  } else {
+
+    print STDERR "Cannot deduce Ensembl object type from filename $filename";
+  }
+
+  return $type;
 
 }
-- 
GitLab