diff --git a/misc-scripts/xref_mapping/exonerate_prototype.pl b/misc-scripts/xref_mapping/exonerate_prototype.pl index 1d850137c574320d798573d007431afc3f7532f2..a41e6e102c70cd91d7a8ca016932713ed238a63b 100644 --- a/misc-scripts/xref_mapping/exonerate_prototype.pl +++ b/misc-scripts/xref_mapping/exonerate_prototype.pl @@ -6,104 +6,70 @@ use IPC::Open3; # Use exonerate (or other program) to find xref-ensembl obejct mappings + # XXX my $queryfile = "xref_dna.fasta"; my $targetfile = "ensembl_transcripts.fasta"; -#run_exonerate($queryfile, $targetfile); -parse_and_store($targetfile); - - -=head2 run_exonerate - - Arg[1] : Query filename to pass to exonerate; this should be the XREF file. - Arg[2] : Target filename to pass to exonerate; this should be the ENSEMBL file. - Example : none - Description: submit mapping jobs to LSF via bsub. - Returntype : List of strings - Exceptions : none - Caller : general - -=cut +run_mapping($queryfile, $targetfile, "."); +store($targetfile); -sub run_exonerate { +sub run_mapping { - my ($query, $target) = @_; + my ($queryfile, $targetfile, $root_dir) = @_; - my $num_jobs = calculate_num_jobs($query); + # get list of methods + my @methods = ("ExonerateBasic", "ExonerateBest1"); # TODO get from Ian, maybe files as well - # TODO - get root_dir from config - my $root_dir = "/nfs/acari/gp1/work/ensembl/misc-scripts/xref_mapping"; + # foreach method, submit the appropriate job & keep track of the job name + my @job_names; - # TODO - get exonerate executable from config - my $exonerate_path = "/usr/local/ensembl/bin/exonerate-0.8.3"; + foreach my $method (@methods) { - # DNA xref - transcript mapping + my $obj_name = "XrefMapper::Methods::$method"; + # check that the appropriate object exists + eval "require $obj_name"; + if($@) { - my $dna_exonerate_options = join(" ", get_dna_exonerate_options()); + warn("Could not find object $obj_name corresponding to mapping method $method, skipping\n$@"); - my $unique_name = "mapXrefDNA" . time(); + } else { - my $prefix = $root_dir . "/" . basename($query); - $prefix =~ s/\.\w+$//; + my $obj = $obj_name->new(); + my $job_name = $obj->run($queryfile, $targetfile); + push @job_names, $job_name; + print "Submitted LSF job $job_name to list\n"; + sleep 1; # make sure unique names really are unique - my $output = "\$LSB_JOBINDEX.map"; - - my @dna_bsub = ( 'bsub', '-J', $unique_name . "[1-$num_jobs]", '-o', "$prefix.%I.out", '-e', "$prefix.%I.err"); - - #print "bsub command: " . join(" ", @dna_bsub) . "\n\n"; - - # Open a pipe to the stdin of a bsub command with the appropriate options - #open( BSUB, '|-' ) or exec (@dna_bsub); - - # Use IPC::Open3 to open the process so that we can read and write from/to its stdout/stderr - my ($wtr, $rtr, $etr, $pid); - $pid = open3($wtr, $rtr, $etr, @dna_bsub); - - # Create actual execute script to be executed with LSF, and write to pipe - my $dna_job = <<EOF; -. /usr/local/lsf/conf/profile.lsf - -cd /tmp + } -rm -f /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target /tmp/$output + } # foreach method -lsrcp ecs1a:$root_dir/$target /tmp/\$LSB_JOBINDEX.$target -lsrcp ecs1a:$root_dir/$query /tmp/\$LSB_JOBINDEX.$query + # submit depend job to wait for all mapping jobs + submit_depend_job($root_dir, @job_names); -$exonerate_path /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target --querychunkid \$LSB_JOBINDEX --querychunktotal $num_jobs --showvulgar false --showalignment FALSE --ryo "xrefdna:%qi:%ti:%qab:%qae:%tab:%tae:%C:%s\n" $dna_exonerate_options | grep '^xrefdna' > /tmp/$output -lsrcp /tmp/$output ecs1a:$root_dir/$output +} # run_exonerate -rm -f /tmp/\$LSB_JOBINDEX.$query /tmp/\$LSB_JOBINDEX.$target /tmp/$output -EOF - # TODO make sure query/target are the right way round +sub submit_depend_job { - print $wtr $dna_job; + my ($root_dir, @job_names) = @_; - #print "Job:\n" . $dna_job . "\n\n"; + # Submit a job that does nothing but wait on the main jobs to + # finish. This job is submitted interactively so the exec does not + # return until everything is finished. - close($wtr); + # build up the bsub command; first part + my @depend_bsub = ('bsub', '-K'); - # Wait until bsub has actually run - will print to its stdout ($rtr) and then close it - my $jobid; - while (<$rtr>) { - if (/Job <([0-9]+)> is/) { - $jobid = $1; - print "LSF job ID for main mapping job: $jobid \n" - } - } - if (!defined($jobid)) { - print STDERR "Error: could not get LSF job ID for mapping job\n"; + # one -wended clause for each main job + foreach my $job (@job_names) { + push @depend_bsub, "-wended($job)"; } - # TODO same for protein - - # The above calls to bsub return immediately. We now submit a job that does - # nothing but wait on them to finish. This job is submitted interactively - # so the exec does not return until everything is finished. - my @depend_bsub = ('bsub', '-K', "-wended($unique_name)", '-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err", '/bin/true'); + # rest of command + push @depend_bsub, ('-q', 'small', '-o', "$root_dir/depend.out", '-e', "$root_dir/depend.err", '/bin/true'); #print "depend bsub:\n" . join (" ", @depend_bsub) . "\n"; @@ -112,8 +78,8 @@ EOF my $depend_jobid; while (<$depend_rtr>) { if (/Job <([0-9]+)> is/) { - $jobid = $1; - print "LSF job ID for depend job: $jobid \n" ; + $depend_jobid = $1; + print "LSF job ID for depend job: $depend_jobid \n" ; } } if (!defined($depend_jobid)) { @@ -121,12 +87,24 @@ EOF } -} # run_exonerate +} + +=head2 store + + Arg[1] : The target file used in the exonerate run. Used to work out the Ensembl object type. + Arg[2] : + Example : none + Description: Parse exonerate output files and build files for loading into target db tables. + Returntype : List of strings + Exceptions : none + Caller : general + +=cut -sub parse_and_store { +sub store { - my $target_file_name = shift; + my ($target_file_name) = @_; my $type = get_ensembl_object_type($target_file_name); @@ -139,17 +117,43 @@ sub parse_and_store { "ensembl", {'RaiseError' => 1}) || die "Can't connect to database"; - my $ox_sth = $dbi->prepare("INSERT INTO object_xref(ensembl_id, ensembl_object_type, xref_id) VALUES(?,?,?)"); + # get current highest internal ID from xref and object_xref + my $max_xref_id = 0; + my $sth = $dbi->prepare("SELECT MAX(xref_id) FROM xref"); + $sth->execute(); + my $max_xref_id = ($sth->fetchrow_array())[0]; + if (!defined $max_xref_id) { + print "Can't get highest existing xref_id, using 0\n)"; + } else { + print "Maximum existing xref_id = $max_xref_id\n"; + } + my $max_object_xref_id = 0; + $sth = $dbi->prepare("SELECT MAX(object_xref_id) FROM object_xref"); + $sth->execute(); + my $max_object_xref_id = ($sth->fetchrow_array())[0]; + if (!defined $max_object_xref_id) { + print "Can't get highest existing object_xref_id, using 1\n)"; + } else { + print "Maximum existing object_xref_id = $max_object_xref_id\n"; + } + + + #my $ox_sth = $dbi->prepare("INSERT INTO object_xref(ensembl_id, ensembl_object_type, xref_id) VALUES(?,?,?)"); - my $ix_sth = $dbi->prepare("INSERT INTO identity_xref VALUES(?,?,?,?,?,?,?,?,?,?,?)"); + #my $ix_sth = $dbi->prepare("INSERT INTO identity_xref VALUES(?,?,?,?,?,?,?,?,?,?,?)"); # files to write table data to - open (OBJECT_XREF, ">object_xref."); + open (OBJECT_XREF, ">object_xref.txt"); open (IDENTITY_XREF, ">identity_xref.txt"); my $total_lines = 0; my $total_files = 0; + my $xref_id = $max_xref_id + 1; + my $object_xref_id = $max_object_xref_id + 1; + + # TODO - store xrefs in a file as well + foreach my $file (glob("*.map")) { print "Parsing results from $file \n"; @@ -165,17 +169,18 @@ sub parse_and_store { # TODO make sure query & target are the right way around - print OBJECT_XREF "$target_id\t$type\t$query_id\n"; - print IDENTITY_XREF "$query_id\t$target_id\t$query_start\t$query_end\t$target_start\t$target_end\t$cigar_line\t$score\t\\N\t$analysis_id\n"; + print OBJECT_XREF "$object_xref_id\t$target_id\t$type\t$query_id\n"; + print IDENTITY_XREF "$object_xref_id\t$query_id\t$target_id\t$query_start\t$query_end\t$target_start\t$target_end\t$cigar_line\t$score\t\\N\t$analysis_id\n"; # TODO - evalue? + $object_xref_id++; # Store in database # create entry in object_xref and get its object_xref_id - $ox_sth->execute($target_id, $type, $query_id) || warn "Error writing to object_xref table"; - my $object_xref_id = $ox_sth->{'mysql_insertid'}; + #$ox_sth->execute($target_id, $type, $query_id) || warn "Error writing to object_xref table"; + #my $object_xref_id = $ox_sth->{'mysql_insertid'}; # create entry in identity_xref - $ix_sth->execute($object_xref_id, $query_id, $target_id, $query_start, $query_end, $target_start, $target_end, $cigar_line, $score, undef, $analysis_id) || warn "Error writing to identity_xref table"; + #$ix_sth->execute($object_xref_id, $query_id, $target_id, $query_start, $query_end, $target_start, $target_end, $cigar_line, $score, undef, $analysis_id) || warn "Error writing to identity_xref table"; } @@ -190,53 +195,6 @@ sub parse_and_store { } -=head2 get_protein_exonerate_options - - Args : none - Example : none - Description: Return additional options to pass to exonerate when running protein mapping - Returntype : List of strings - Exceptions : none - Caller : general - -=cut - -sub get_protein_exonerate_options { - - return(); - -} - -=head2 get_dna_exonerate_options - - Args : none - Example : none - Description: Return additional options to pass to exonerate when running DNA mapping - Returntype : List of strings - Exceptions : none - Caller : general - -=cut - -sub get_dna_exonerate_options { - - return("--bestn", "1"); - -} - - -sub calculate_num_jobs { - - my $query = shift; - - my $bytes_per_job = 250000; - - my $size = (stat $query)[7]; - - return int($size/$bytes_per_job); - -} - sub get_ensembl_object_type {