From f4d79bf4aa57011278ab22d1b863b8206b9d2afd Mon Sep 17 00:00:00 2001 From: Glenn Proctor <gp1@sanger.ac.uk> Date: Mon, 4 Oct 2004 10:31:13 +0000 Subject: [PATCH] More implementation: now successfully submits exonerate jobs and dependency job. Improved process communication. Implemented parsing. --- .../xref_mapping/exonerate_prototype.pl | 117 ++++++++++++++---- 1 file changed, 94 insertions(+), 23 deletions(-) diff --git a/misc-scripts/xref_mapping/exonerate_prototype.pl b/misc-scripts/xref_mapping/exonerate_prototype.pl index 6c2ea4edfe..5c69757c42 100644 --- a/misc-scripts/xref_mapping/exonerate_prototype.pl +++ b/misc-scripts/xref_mapping/exonerate_prototype.pl @@ -2,17 +2,17 @@ use strict; use DBI; use File::Basename; +use IPC::Open3; # Use exonerate (or other program) to find xref-ensembl obejct mappings # XXX -run_matching(); +my $queryfile = "xref_dna.fasta"; +my $targetfile = "ensembl_transcripts.fasta"; -sub run_matching { +#run_exonerate($queryfile, $targetfile); +parse_and_store($targetfile); - run_exonerate("queryfile.fasta", "targetfile.fasta"); - -} =head2 run_exonerate @@ -30,11 +30,10 @@ sub run_exonerate { my ($query, $target) = @_; - # TODO - calculate number of jobs from file sizes - my $num_jobs = 20; + my $num_jobs = calculate_num_jobs($query); # TODO - get root_dir from config - my $root_dir = "."; + my $root_dir = "/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping"; # TODO - get exonerate executable from config my $exonerate_path = "/usr/local/ensembl/bin/exonerate-0.8.3"; @@ -45,19 +44,22 @@ sub run_exonerate { my $unique_name = "mapXrefDNA" . time(); - # TODO - prefix = working dir + basename of query file - my $prefix = basename($query); + my $prefix = $root_dir . "/" . basename($query); $prefix =~ s/\.\w+$//; my $output = "\$LSB_JOBINDEX.map"; my @dna_bsub = ( 'bsub', '-J', $unique_name . "[1-$num_jobs]", '-o', "$prefix.%I.out", '-e', "$prefix.%I.err"); - print "bsub command: " . join(" ", @dna_bsub) . "\n\n"; + #print "bsub command: " . join(" ", @dna_bsub) . "\n\n"; # Open a pipe to the stdin of a bsub command with the appropriate options #open( BSUB, '|-' ) or exec (@dna_bsub); + # Use IPC::Open3 to open the process so that we can read and write from/to its stdout/stderr + my ($wtr, $rtr, $etr, $pid); + $pid = open3($wtr, $rtr, $etr, @dna_bsub); + # Create actual execute script to be executed with LSF, and write to pipe my $dna_job = <<EOF; . /usr/local/lsf/conf/profile.lsf @@ -69,7 +71,7 @@ rm -f /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target /tmp/$output lsrcp ecs1a:$root_dir/$target /tmp/\$LSB_JOBINDEX.$target lsrcp ecs1a:$root_dir/$query /tmp/\$LSB_JOBINDEX.$query -$exonerate_path /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target --querychunkid \$LSB_JOBINDEX --querychunktotal $num_jobs --model affine:local -M 900 --showalignment FALSE --subopt no --ryo "xrefdna %qi %ti %qab %qae %tab %tae %C %s\n" $dna_exonerate_options | grep '^xrefdna ' > /tmp/$output +$exonerate_path /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target --querychunkid \$LSB_JOBINDEX --querychunktotal $num_jobs --showvulgar false --showalignment FALSE --ryo "xrefdna %qi %ti %qab %qae %tab %tae %C %s\n" $dna_exonerate_options | grep '^xrefdna ' > /tmp/$output lsrcp /tmp/$output ecs1a:$root_dir/$output @@ -79,11 +81,23 @@ EOF # TODO make sure query/target are the right way round # TODO analysis ID - #print BSUB $dna_job; + print $wtr $dna_job; - print "job:\n" . $dna_job . "\n\n"; + #print "Job:\n" . $dna_job . "\n\n"; - #close(BSUB); + close($wtr); + + # Wait until bsub has actually run - will print to its stdout ($rtr) and then close it + my $jobid; + while (<$rtr>) { + if (/Job <([0-9]+)> is/) { + $jobid = $1; + print "LSF job ID for main mapping job: $jobid \n" + } + } + if (!defined($jobid)) { + print STDERR "Error: could not get LSF job ID for mapping job\n"; + } # TODO same for protein @@ -92,26 +106,42 @@ EOF # so the exec does not return until everything is finished. my @depend_bsub = ('bsub', '-K', "-wended($unique_name)", '-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err", '/bin/true'); - print "depend bsub:\n" . join (" ", @depend_bsub) . "\n"; + #print "depend bsub:\n" . join (" ", @depend_bsub) . "\n"; + + my ($depend_wtr, $depend_rtr, $depend_etr, $depend_pid); + $depend_pid = open3($depend_wtr, $depend_rtr, $depend_etr, @depend_bsub); + my $depend_jobid; + while (<$depend_rtr>) { + if (/Job <([0-9]+)> is/) { + $jobid = $1; + print "LSF job ID for depend job: $jobid \n" ; + } + } + if (!defined($depend_jobid)) { + print STDERR "Error: could not get depend job ID\n"; + } - # TODO - system or exec this } # run_exonerate sub parse_and_store { + my $target_file_name = shift; + my $type = get_ensembl_object_type($target_file_name); + # files to write table data to open (OBJECT_XREF, ">object_xref.txt"); open (IDENTITY_XREF, ">identity_xref.txt"); my $total_lines = 0; - foreach my $file (glob <*.map>) { + my $total_files = 0; + + foreach my $file (glob("*.map")) { - # XXX figure out type of ensembl objects we're dealing with - $type = 'Transcript'; print "Parsing results from $file \n"; - open(FILE, file); + open(FILE, $file); + $total_files++; while (<FILE>) { @@ -120,7 +150,7 @@ sub parse_and_store { # TODO make sure query & target are the right way around print OBJECT_XREF "$target_id $type $query_id\n"; # XXX object_xref_id - print IDENTITY_XREF "$query_id $target_id $query_start $query_end $target_start $target_end $cigar_line $score"; # XXX object_xref_id + print IDENTITY_XREF "$query_id $target_id $query_start $query_end $target_start $target_end $cigar_line $score\n"; # XXX object_xref_id } @@ -131,6 +161,8 @@ sub parse_and_store { close(IDENTITY_XREF); close(OBJECT_XREF); + print "Read $total_lines lines from $total_files exonerate output files\n"; + } =head2 get_protein_exonerate_options @@ -163,6 +195,45 @@ sub get_protein_exonerate_options { sub get_dna_exonerate_options { - return('-q', 'normal'); + return("--bestn", "1"); + +} + + +sub calculate_num_jobs { + + my $query = shift; + + my $bytes_per_job = 250000; + + my $size = (stat $query)[7]; + + return int($size/$bytes_per_job); + +} + +sub get_ensembl_object_type { + + my $filename = shift; + my $type; + + if ($filename =~ /gene/i) { + + $type = "Gene"; + + } elsif ($filename =~ /transcript/i) { + + $type = "Transcript"; + + } elsif ($filename =~ /translation/i) { + + $type = "Translation"; + + } else { + + print STDERR "Cannot deduce Ensembl object type from filename $filename"; + } + + return $type; } -- GitLab