More implementation: now successfully submits exonerate jobs and dependency job.

Improved process communication. Implemented parsing.

More implementation: now successfully submits exonerate jobs and dependency job.
Improved process communication. Implemented parsing.
f4d79bf4 · Glenn Proctor · 9aacf50b · f4d79bf4
Commit f4d79bf4 authored 20 years ago by Glenn Proctor
--- a/misc-scripts/xref_mapping/exonerate_prototype.pl
+++ b/misc-scripts/xref_mapping/exonerate_prototype.pl
@@ -2,17 +2,17 @@ use strict;

 use DBI;
 use File::Basename;
+use IPC::Open3;

 # Use exonerate (or other program) to find xref-ensembl obejct mappings

 # XXX
-run_matching();
+my $queryfile = "xref_dna.fasta";
+my $targetfile = "ensembl_transcripts.fasta";

-sub run_matching {
+#run_exonerate($queryfile, $targetfile);
+parse_and_store($targetfile);

-   run_exonerate("queryfile.fasta", "targetfile.fasta");
-
-}

 =head2 run_exonerate

@@ -30,11 +30,10 @@ sub run_exonerate {

  my ($query, $target) = @_;

-  # TODO - calculate number of jobs from file sizes
-  my $num_jobs = 20;
+  my $num_jobs = calculate_num_jobs($query);

  # TODO - get root_dir from config
-  my $root_dir = ".";
+  my $root_dir = "/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping";

  # TODO - get exonerate executable from config
  my $exonerate_path = "/usr/local/ensembl/bin/exonerate-0.8.3";
@@ -45,19 +44,22 @@ sub run_exonerate {

  my $unique_name = "mapXrefDNA" . time();

-  # TODO - prefix = working dir + basename of query file
-  my $prefix = basename($query);
+  my $prefix = $root_dir . "/" . basename($query);
  $prefix =~ s/\.\w+$//;

  my $output = "\$LSB_JOBINDEX.map";

  my @dna_bsub = ( 'bsub', '-J', $unique_name . "[1-$num_jobs]", '-o', "$prefix.%I.out", '-e', "$prefix.%I.err");

-  print "bsub command: " . join(" ", @dna_bsub) . "\n\n";
+  #print "bsub command: " . join(" ", @dna_bsub) . "\n\n";

  # Open a pipe to the stdin of a bsub command with the appropriate options
  #open( BSUB, '|-' ) or exec (@dna_bsub);

+  # Use IPC::Open3 to open the process so that we can read and write from/to its stdout/stderr
+  my ($wtr, $rtr, $etr, $pid);
+  $pid = open3($wtr, $rtr, $etr, @dna_bsub);
+
  # Create actual execute script to be executed with LSF, and write to pipe
  my $dna_job = <<EOF;
 . /usr/local/lsf/conf/profile.lsf
@@ -69,7 +71,7 @@ rm -f /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target /tmp/$output
 lsrcp ecs1a:$root_dir/$target /tmp/\$LSB_JOBINDEX.$target
 lsrcp ecs1a:$root_dir/$query  /tmp/\$LSB_JOBINDEX.$query

-$exonerate_path /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target --querychunkid \$LSB_JOBINDEX --querychunktotal $num_jobs --model affine:local -M 900 --showalignment FALSE --subopt no --ryo "xrefdna %qi %ti %qab %qae %tab %tae %C %s\n" $dna_exonerate_options | grep '^xrefdna ' > /tmp/$output
+$exonerate_path /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target --querychunkid \$LSB_JOBINDEX --querychunktotal $num_jobs --showvulgar false --showalignment FALSE --ryo "xrefdna %qi %ti %qab %qae %tab %tae %C %s\n" $dna_exonerate_options | grep '^xrefdna ' > /tmp/$output

 lsrcp /tmp/$output ecs1a:$root_dir/$output

@@ -79,11 +81,23 @@ EOF
 # TODO make sure query/target are the right way round
 # TODO analysis ID

-  #print BSUB $dna_job;
+  print $wtr $dna_job;

-  print "job:\n" . $dna_job . "\n\n";
+  #print "Job:\n" . $dna_job . "\n\n";

-  #close(BSUB);
+  close($wtr);
+
+  # Wait until bsub has actually run - will print to its stdout ($rtr) and then close it
+  my $jobid;
+  while (<$rtr>) {
+    if (/Job <([0-9]+)> is/) {
+      $jobid = $1;
+      print "LSF job ID for main mapping job: $jobid \n"
+    }
+  }
+  if (!defined($jobid)) {
+    print STDERR "Error: could not get LSF job ID for mapping job\n";
+  }

  # TODO same for protein

@@ -92,26 +106,42 @@ EOF
  # so the exec does not return until everything is finished.
  my @depend_bsub = ('bsub', '-K', "-wended($unique_name)", '-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err", '/bin/true');

-  print "depend bsub:\n" . join (" ", @depend_bsub) . "\n";
+  #print "depend bsub:\n" . join (" ", @depend_bsub) . "\n";
+
+  my ($depend_wtr, $depend_rtr, $depend_etr, $depend_pid);
+  $depend_pid = open3($depend_wtr, $depend_rtr, $depend_etr, @depend_bsub);
+  my $depend_jobid;
+  while (<$depend_rtr>) {
+    if (/Job <([0-9]+)> is/) {
+      $jobid = $1;
+      print "LSF job ID for depend job: $jobid \n" ;
+    }
+  }
+  if (!defined($depend_jobid)) {
+    print STDERR "Error: could not get depend job ID\n";
+  }

-  # TODO - system or exec this

 } # run_exonerate


 sub parse_and_store {

+  my $target_file_name = shift;
+  my $type = get_ensembl_object_type($target_file_name);
+
  # files to write table data to
  open (OBJECT_XREF, ">object_xref.txt");
  open (IDENTITY_XREF, ">identity_xref.txt");

  my $total_lines = 0;
-  foreach my $file (glob <*.map>) {
+  my $total_files = 0;
+
+  foreach my $file (glob("*.map")) {

-    # XXX figure out type of ensembl objects we're dealing with
-    $type = 'Transcript';
    print "Parsing results from $file \n";
-    open(FILE, file);
+    open(FILE, $file);
+    $total_files++;

    while (<FILE>) {

@@ -120,7 +150,7 @@ sub parse_and_store {
      # TODO make sure query & target are the right way around

      print OBJECT_XREF "$target_id $type $query_id\n"; # XXX object_xref_id
-      print IDENTITY_XREF "$query_id $target_id $query_start $query_end $target_start $target_end $cigar_line $score"; # XXX object_xref_id
+      print IDENTITY_XREF "$query_id $target_id $query_start $query_end $target_start $target_end $cigar_line $score\n"; # XXX object_xref_id

    }

@@ -131,6 +161,8 @@ sub parse_and_store {
  close(IDENTITY_XREF);
  close(OBJECT_XREF);

+  print "Read $total_lines lines from $total_files exonerate output files\n";
+
 }

 =head2 get_protein_exonerate_options
@@ -163,6 +195,45 @@ sub get_protein_exonerate_options {

 sub get_dna_exonerate_options {

-   return('-q', 'normal');
+   return("--bestn", "1");
+
+}
+
+
+sub calculate_num_jobs {
+
+  my $query = shift;
+
+  my $bytes_per_job = 250000;
+
+  my $size = (stat $query)[7];
+
+  return int($size/$bytes_per_job);
+
+}
+
+sub get_ensembl_object_type {
+
+  my $filename = shift;
+  my $type;
+
+  if ($filename =~ /gene/i) {
+
+    $type = "Gene";
+
+  } elsif ($filename =~ /transcript/i) {
+
+    $type = "Transcript";
+
+  } elsif ($filename =~ /translation/i) {
+
+    $type = "Translation";
+
+  } else {
+
+    print STDERR "Cannot deduce Ensembl object type from filename $filename";
+  }
+
+  return $type;

 }