Commit a5a2b843 authored by Leo Gordon

allow dataflow from standaloneJob

parent 5778d580
@@ -25,6 +25,8 @@ use strict;
 use base ('Bio::EnsEMBL::DBSQL::AnalysisAdaptor');
+use Bio::EnsEMBL::Hive::URLFactory;
+
 =head2 fetch_by_logic_name_or_url
@@ -33,11 +35,11 @@ use base ('Bio::EnsEMBL::DBSQL::AnalysisAdaptor');
 =cut
 
 sub fetch_by_logic_name_or_url {
-    my $self                = shift @_;
+    my $self                = shift @_;    # can either be $self or class name
     my $logic_name_or_url   = shift @_;
 
     if($logic_name_or_url =~ m{^\w*://}) {
-        return Bio::EnsEMBL::Hive::URLFactory->fetch($logic_name_or_url, $self->db);
+        return Bio::EnsEMBL::Hive::URLFactory->fetch($logic_name_or_url, ref($self) && $self->db);
     } else {
        return $self->fetch_by_logic_name($logic_name_or_url);
     }
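For context, a hedged sketch of the two calling styles the ref($self) guard enables; $analysis_adaptor and the URL below are illustrative, not part of the commit:

    # instance method: a plain logic_name is looked up in the adaptor's own database
    my $local_analysis  = $analysis_adaptor->fetch_by_logic_name_or_url( 'blast_factory' );

    # class method: a fully qualified URL is resolved via URLFactory, no local adaptor needed
    my $remote_analysis = Bio::EnsEMBL::Hive::DBSQL::AnalysisAdaptor->fetch_by_logic_name_or_url(
        'mysql://user:password@hostname:3306/other_ehive_db/analysis?logic_name=blast_factory'
    );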
@@ -51,6 +51,7 @@ package Bio::EnsEMBL::Hive::DataflowRule;
 use strict;
 use Bio::EnsEMBL::Utils::Argument;  # import 'rearrange()'
 use Bio::EnsEMBL::Utils::Exception;
+use Bio::EnsEMBL::Hive::DBSQL::AnalysisAdaptor;
 
 =head2 new
@@ -239,9 +240,9 @@ sub to_analysis {
         #if the 'from' and 'to' share the same adaptor, then use a simple logic_name
         #for the URL rather than a full network distributed URL
-        my $ref_rule_adaptor = $self->from_analysis->adaptor;
+        my $ref_rule_adaptor = $self->from_analysis && $self->from_analysis->adaptor;
 
-        if($analysis_or_nt->can('logic_name') and $self->from_analysis and ($ref_rule_adaptor == $analysis_or_nt->adaptor)) {
+        if($analysis_or_nt->can('logic_name') and $ref_rule_adaptor and ($ref_rule_adaptor == $analysis_or_nt->adaptor)) {
             $self->{'_to_analysis_url'} = $analysis_or_nt->logic_name;
         } else {
             $self->{'_to_analysis_url'} = $analysis_or_nt->url($ref_rule_adaptor->db);
@@ -250,8 +251,12 @@ sub to_analysis {
     # lazy load the analysis object if I can
     if(!defined($self->{'_to_analysis'}) and defined($self->to_analysis_url)) {
-        $self->{'_to_analysis'} = $self->adaptor->db->get_AnalysisAdaptor->fetch_by_logic_name_or_url($self->to_analysis_url)
-            or die "Cannot fetch analysis from logic_name or url '".$self->to_analysis_url."' for dataflow rule with id='".$self->dbID."'\n";
+        my $url = $self->to_analysis_url;
+
+        $self->{'_to_analysis'} = $self->adaptor
+            ? $self->adaptor->db->get_AnalysisAdaptor->fetch_by_logic_name_or_url($url)
+            : Bio::EnsEMBL::Hive::DBSQL::AnalysisAdaptor->fetch_by_logic_name_or_url($url)
+        or die "Cannot fetch analysis from logic_name or url '$url' for dataflow rule with id='".$self->dbID."'\n";
     }
 
     return $self->{'_to_analysis'};
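The class-method fallback matters for rules created outside a database, as standaloneJob.pl does below. A minimal sketch, with a made-up URL:

    # a rule built by hand has no adaptor attached...
    my $rule = Bio::EnsEMBL::Hive::DataflowRule->new(
        -to_analysis_url => 'mysql://user:password@hostname:3306/target_db/foo',
    );

    # ...so to_analysis() now takes the class-method branch of the ternary above
    # instead of failing on an undefined $self->adaptor:
    my $target_analysis = $rule->to_analysis;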
@@ -6,9 +6,11 @@ use Getopt::Long qw(:config pass_through);
 use Bio::EnsEMBL::Registry;
 use Bio::EnsEMBL::Hive::Process;
 use Bio::EnsEMBL::Hive::AnalysisJob;
-use Bio::EnsEMBL::Hive::Utils ('script_usage', 'load_file_or_module', 'parse_cmdline_options', 'stringify');
+use Bio::EnsEMBL::Hive::Utils ('script_usage', 'load_file_or_module', 'parse_cmdline_options', 'stringify', 'destringify');
+use Data::Dumper;
 
-my ($reg_conf, $help, $debug, $no_write);
+my ($reg_conf, $help, $debug, $no_write, $flow_into);
 
 my $module_or_file = shift @ARGV or script_usage();
@@ -17,6 +19,7 @@ GetOptions(
     'debug=i'                   => \$debug,
     'reg_conf|regfile=s'        => \$reg_conf,
     'no_write|nowrite'          => \$no_write,
+    'flow_into|flow=s'          => \$flow_into,
 );
 
 if ($help or !$module_or_file) {
@@ -33,7 +36,18 @@ my $process = $runnable_module->new();
 my $job = Bio::EnsEMBL::Hive::AnalysisJob->new();
 
 my ($param_hash, $param_list) = parse_cmdline_options();
 $job->param_init( 1, $process->param_defaults(), $param_hash );
 
-$job->dataflow_rules( 1, [] );    # dataflow switched off by default
+$flow_into = $flow_into ? destringify($flow_into) : [];                # empty dataflow for branch 1 by default
+$flow_into = { 1 => $flow_into } unless(ref($flow_into) eq 'HASH');    # force non-hash into a hash
+
+foreach my $branch_code (keys %$flow_into) {
+    my $heirs = $flow_into->{$branch_code};
+
+    $heirs = [ $heirs ] unless(ref($heirs));    # force scalar into an arrayref first
+
+    my @dataflow_rules = map { Bio::EnsEMBL::Hive::DataflowRule->new( -to_analysis_url => $_ ) } @$heirs;
+
+    $job->dataflow_rules( $branch_code, \@dataflow_rules );
+}
 
 my $input_id = stringify($param_hash);
 $job->input_id( $input_id );
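A hedged illustration of the normalization above, assuming destringify() turns the option string back into a Perl structure (the URLs are made up):

    use Bio::EnsEMBL::Hive::Utils ('destringify');

    my $flow = destringify(q{ { 2 => 'mysql://user@hostname/target_db/bar' } });

    # a scalar heir is then wrapped into an arrayref:
    #     { 2 => [ 'mysql://user@hostname/target_db/bar' ] }
    # and a non-HASH value would be forced onto branch 1:
    #     "[ 'url_a', 'url_b' ]"   ->   { 1 => [ 'url_a', 'url_b' ] }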
@@ -71,8 +85,9 @@ __DATA__
 standaloneJob.pl is an eHive component script that
     1. takes in a RunnableDB module,
-    2. creates a standalone job outside an eHive database by initializing parameters from command line arguments
+    2. creates a standalone job outside an eHive database by initializing parameters from command line arguments (ARRAY and HASH arguments can be passed and parsed too),
     3. and runs that job outside the database.
+    4. can optionally dataflow into tables fully defined by URLs
 
 Naturally, only certain RunnableDB modules can be run using this script, and some database-related functionality will be lost.
 
 =head1 USAGE EXAMPLES
@@ -93,12 +108,17 @@ __DATA__
         # Run a job with given parameters, but skip the write_output() step:
     standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FailureTest -no_write -time_RUN=2 -time_WRITE_OUTPUT=3 -state=WRITE_OUTPUT -value=2
 
+        # Run a job and re-direct its dataflow into tables:
+    standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::JobFactory -inputfile foo.txt -delimiter '\t' -column_names "[ 'name', 'age' ]"
+        -flow_into "{ 2 => ['mysql://ensadmin:ensembl@127.0.0.1:2914/lg4_triggers/foo', 'mysql://ensadmin:ensembl@127.0.0.1:2914/lg4_triggers/bar'] }"
+
 =head1 SCRIPT-SPECIFIC OPTIONS
 
     -help               : print this help
     -debug <level>      : turn on debug messages at <level>
     -no_write           : skip the execution of the write_output() step
     -reg_conf <path>    : load registry entries from the given file (these entries may be needed by the RunnableDB itself)
+    -flow_into "<hash>" : defines the dataflow re-direction rules in a format similar to PipeConfig's - see the last example
 
 NB: all other options will be passed to the runnable (leading dashes removed) and will constitute the parameters for the job.
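Since a non-HASH -flow_into value is forced onto branch 1 by the script changes above, a plain list of URLs should work as well; a hypothetical example with a made-up URL, not taken from the commit:

    standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::JobFactory -inputfile foo.txt -column_names "[ 'name', 'age' ]" -flow_into "[ 'mysql://user:password@hostname:3306/target_db/foo' ]"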