Skip to content
Snippets Groups Projects
Commit f4d79bf4 authored by Glenn Proctor's avatar Glenn Proctor
Browse files

More implementation: now successfully submits exonerate jobs and dependency job.

Improved process communication. Implemented parsing.
parent 9aacf50b
No related branches found
No related tags found
No related merge requests found
......@@ -2,17 +2,17 @@ use strict;
use DBI;
use File::Basename;
use IPC::Open3;
# Use exonerate (or other program) to find xref-ensembl obejct mappings
# XXX
run_matching();
my $queryfile = "xref_dna.fasta";
my $targetfile = "ensembl_transcripts.fasta";
sub run_matching {
#run_exonerate($queryfile, $targetfile);
parse_and_store($targetfile);
run_exonerate("queryfile.fasta", "targetfile.fasta");
}
=head2 run_exonerate
......@@ -30,11 +30,10 @@ sub run_exonerate {
my ($query, $target) = @_;
# TODO - calculate number of jobs from file sizes
my $num_jobs = 20;
my $num_jobs = calculate_num_jobs($query);
# TODO - get root_dir from config
my $root_dir = ".";
my $root_dir = "/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping";
# TODO - get exonerate executable from config
my $exonerate_path = "/usr/local/ensembl/bin/exonerate-0.8.3";
......@@ -45,19 +44,22 @@ sub run_exonerate {
my $unique_name = "mapXrefDNA" . time();
# TODO - prefix = working dir + basename of query file
my $prefix = basename($query);
my $prefix = $root_dir . "/" . basename($query);
$prefix =~ s/\.\w+$//;
my $output = "\$LSB_JOBINDEX.map";
my @dna_bsub = ( 'bsub', '-J', $unique_name . "[1-$num_jobs]", '-o', "$prefix.%I.out", '-e', "$prefix.%I.err");
print "bsub command: " . join(" ", @dna_bsub) . "\n\n";
#print "bsub command: " . join(" ", @dna_bsub) . "\n\n";
# Open a pipe to the stdin of a bsub command with the appropriate options
#open( BSUB, '|-' ) or exec (@dna_bsub);
# Use IPC::Open3 to open the process so that we can read and write from/to its stdout/stderr
my ($wtr, $rtr, $etr, $pid);
$pid = open3($wtr, $rtr, $etr, @dna_bsub);
# Create actual execute script to be executed with LSF, and write to pipe
my $dna_job = <<EOF;
. /usr/local/lsf/conf/profile.lsf
......@@ -69,7 +71,7 @@ rm -f /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target /tmp/$output
lsrcp ecs1a:$root_dir/$target /tmp/\$LSB_JOBINDEX.$target
lsrcp ecs1a:$root_dir/$query /tmp/\$LSB_JOBINDEX.$query
$exonerate_path /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target --querychunkid \$LSB_JOBINDEX --querychunktotal $num_jobs --model affine:local -M 900 --showalignment FALSE --subopt no --ryo "xrefdna %qi %ti %qab %qae %tab %tae %C %s\n" $dna_exonerate_options | grep '^xrefdna ' > /tmp/$output
$exonerate_path /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target --querychunkid \$LSB_JOBINDEX --querychunktotal $num_jobs --showvulgar false --showalignment FALSE --ryo "xrefdna %qi %ti %qab %qae %tab %tae %C %s\n" $dna_exonerate_options | grep '^xrefdna ' > /tmp/$output
lsrcp /tmp/$output ecs1a:$root_dir/$output
......@@ -79,11 +81,23 @@ EOF
# TODO make sure query/target are the right way round
# TODO analysis ID
#print BSUB $dna_job;
print $wtr $dna_job;
print "job:\n" . $dna_job . "\n\n";
#print "Job:\n" . $dna_job . "\n\n";
#close(BSUB);
close($wtr);
# Wait until bsub has actually run - will print to its stdout ($rtr) and then close it
my $jobid;
while (<$rtr>) {
if (/Job <([0-9]+)> is/) {
$jobid = $1;
print "LSF job ID for main mapping job: $jobid \n"
}
}
if (!defined($jobid)) {
print STDERR "Error: could not get LSF job ID for mapping job\n";
}
# TODO same for protein
......@@ -92,26 +106,42 @@ EOF
# so the exec does not return until everything is finished.
my @depend_bsub = ('bsub', '-K', "-wended($unique_name)", '-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err", '/bin/true');
print "depend bsub:\n" . join (" ", @depend_bsub) . "\n";
#print "depend bsub:\n" . join (" ", @depend_bsub) . "\n";
my ($depend_wtr, $depend_rtr, $depend_etr, $depend_pid);
$depend_pid = open3($depend_wtr, $depend_rtr, $depend_etr, @depend_bsub);
my $depend_jobid;
while (<$depend_rtr>) {
if (/Job <([0-9]+)> is/) {
$jobid = $1;
print "LSF job ID for depend job: $jobid \n" ;
}
}
if (!defined($depend_jobid)) {
print STDERR "Error: could not get depend job ID\n";
}
# TODO - system or exec this
} # run_exonerate
sub parse_and_store {
my $target_file_name = shift;
my $type = get_ensembl_object_type($target_file_name);
# files to write table data to
open (OBJECT_XREF, ">object_xref.txt");
open (IDENTITY_XREF, ">identity_xref.txt");
my $total_lines = 0;
foreach my $file (glob <*.map>) {
my $total_files = 0;
foreach my $file (glob("*.map")) {
# XXX figure out type of ensembl objects we're dealing with
$type = 'Transcript';
print "Parsing results from $file \n";
open(FILE, file);
open(FILE, $file);
$total_files++;
while (<FILE>) {
......@@ -120,7 +150,7 @@ sub parse_and_store {
# TODO make sure query & target are the right way around
print OBJECT_XREF "$target_id $type $query_id\n"; # XXX object_xref_id
print IDENTITY_XREF "$query_id $target_id $query_start $query_end $target_start $target_end $cigar_line $score"; # XXX object_xref_id
print IDENTITY_XREF "$query_id $target_id $query_start $query_end $target_start $target_end $cigar_line $score\n"; # XXX object_xref_id
}
......@@ -131,6 +161,8 @@ sub parse_and_store {
close(IDENTITY_XREF);
close(OBJECT_XREF);
print "Read $total_lines lines from $total_files exonerate output files\n";
}
=head2 get_protein_exonerate_options
......@@ -163,6 +195,45 @@ sub get_protein_exonerate_options {
sub get_dna_exonerate_options {
return('-q', 'normal');
return("--bestn", "1");
}
sub calculate_num_jobs {
my $query = shift;
my $bytes_per_job = 250000;
my $size = (stat $query)[7];
return int($size/$bytes_per_job);
}
sub get_ensembl_object_type {
my $filename = shift;
my $type;
if ($filename =~ /gene/i) {
$type = "Gene";
} elsif ($filename =~ /transcript/i) {
$type = "Transcript";
} elsif ($filename =~ /translation/i) {
$type = "Translation";
} else {
print STDERR "Cannot deduce Ensembl object type from filename $filename";
}
return $type;
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment