From e2ac50d84a9e09e33b3f6cd7112a801f97556cd6 Mon Sep 17 00:00:00 2001 From: Leo Gordon <lg4@ebi.ac.uk> Date: Sat, 23 Feb 2013 00:52:57 +0000 Subject: [PATCH] JobFactory uses $overriding_hash to create jobs/rows from input_id_template; 'input_id' parameter deprecated; standaloneJob supports templates. --- .../Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm | 93 +++++++++---------- scripts/standaloneJob.pl | 16 +++- 2 files changed, 60 insertions(+), 49 deletions(-) diff --git a/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm b/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm index 6d0efb6e4..864fcb8bc 100644 --- a/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm +++ b/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm @@ -9,8 +9,7 @@ Bio::EnsEMBL::Hive::RunnableDB::JobFactory standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::JobFactory \ --inputcmd 'cd ${ENSEMBL_CVS_ROOT_DIR}/ensembl-hive/modules/Bio/EnsEMBL/Hive/RunnableDB; ls -1 *.pm' \ - --input_id "{'meta_key'=>'module_name','meta_value'=>'#_0#'}" \ - --flow_into "{ 2 => ['mysql://ensadmin:${ENSADMIN_PSW}@127.0.0.1:2912/lg4_compara_families_64/meta']}" + --flow_into "{ 2 => { 'mysql://ensadmin:${ENSADMIN_PSW}@127.0.0.1:2914/lg4_compara_families_70/meta' => {'meta_key'=>'module_name','meta_value'=>'#_0#'} } }"" =head1 DESCRIPTION @@ -37,6 +36,25 @@ use strict; use base ('Bio::EnsEMBL::Hive::Process'); +sub param_defaults { + return { + 'column_names' => 0, + 'delimiter' => undef, + 'randomize' => 0, + 'step' => 0, + 'key_column' => 0, + 'input_id' => 0, # this parameter is no longer supported and should stay at 0 + + 'inputlist' => undef, + 'inputfile' => undef, + 'inputquery' => undef, + 'inputcmd' => undef, + + 'fan_branch_code' => 2, + }; +} + + =head2 fetch_input Description : Implements fetch_input() interface method of Bio::EnsEMBL::Hive::Process that is used to read in parameters and load data. @@ -58,10 +76,7 @@ use base ('Bio::EnsEMBL::Hive::Process'); param('column_names'): Controls the column names that come out of the parser: 0 = "no names", 1 = "parse names from data", arrayref = "take names from this array" - param('delimiter'): If you set it your lines in file/cmd mode will be split into columns that you can use individually when constructing the template input_id hash. - - param('input_id'): The template that will become the input_id of newly created jobs (Note: this is something entirely different from $self->input_id of the current JobFactory job). - After introduction of param('column_names') its significance has dropped, but it may still become handy. + param('delimiter'): If you set it your lines in file/cmd mode will be split into columns that you can use individually when constructing the input_id_template hash. param('randomize'): Shuffles the rows before creating jobs - can sometimes lead to better overall performance of the pipeline. Doesn't make any sence for minibatches (step>1). @@ -86,14 +101,14 @@ use base ('Bio::EnsEMBL::Hive::Process'); sub run { my $self = shift @_; - my $column_names = $self->param('column_names') || 0; # can be 0 (no names), 1 (names from data) or an arrayref (names from this array) + my $column_names = $self->param('column_names'); # can be 0 (no names), 1 (names from data) or an arrayref (names from this array) my $delimiter = $self->param('delimiter'); - my $randomize = $self->param('randomize') || 0; + my $randomize = $self->param('randomize'); # minibatching-related: - my $step = $self->param('step') || 0; - my $key_column = $self->param('key_column') || 0; + my $step = $self->param('step'); + my $key_column = $self->param('key_column'); my $inputlist = $self->param('inputlist'); my $inputfile = $self->param('inputfile'); @@ -116,12 +131,8 @@ sub run { } # after this point $column_names should either contain a list or be false - my $template_hash = $self->param('input_id'); - unless($template_hash or $column_names) { - die "At least one of 'input_id' or 'column_names' has to be defined"; - } - unless($step ? $template_hash : 1) { - die "If 'step' is defined, 'input_id' also must be defined"; + if( $self->param('input_id') ) { + die "'input_id' is no longer supported, please reconfigure as the input_id_template of the dataflow_rule"; } if($randomize) { @@ -129,8 +140,8 @@ sub run { } my $output_ids = $step - ? $self->_substitute_minibatched_rows($rows, $column_names, $template_hash, $step, $key_column) - : $self->_substitute_rows($rows, $column_names, $template_hash); + ? $self->_substitute_minibatched_rows($rows, $column_names, $step, $key_column) + : $self->_substitute_rows($rows, $column_names); $self->param('output_ids', $output_ids); } @@ -149,7 +160,7 @@ sub write_output { # nothing to write out, but some dataflow to perform: my $self = shift @_; my $output_ids = $self->param('output_ids'); - my $fan_branch_code = $self->param('fan_branch_code') || 2; + my $fan_branch_code = $self->param('fan_branch_code'); # "fan out" into fan_branch_code: $self->dataflow_output_id($output_ids, $fan_branch_code); @@ -236,25 +247,16 @@ sub _get_rows_from_open { =cut sub _substitute_rows { - my ($self, $rows, $column_names, $template_hash) = @_; + my ($self, $rows, $column_names) = @_; my @hashes = (); foreach my $row (@$rows) { - if($template_hash) { - $self->param('_', $row); # the whole row as a list + my $job_param_hash = $column_names + ? { map { ($column_names->[$_] => $row->[$_]) } (0..scalar(@$row)-1) } + : { '_' => $row, map { ("_$_" => $row->[$_]) } (0..scalar(@$row)-1) }; - foreach my $i (0..scalar(@$row)-1) { - $self->param("_$i", $row->[$i]); - - if($column_names) { - $self->param($column_names->[$i], $row->[$i]); - } - } - push @hashes, $self->param_substitute($template_hash); - } else { - push @hashes, { map { ($column_names->[$_] => $row->[$_]) } (0..scalar(@$row)-1) }; - } + push @hashes, $job_param_hash; } return \@hashes; } @@ -267,7 +269,7 @@ sub _substitute_rows { =cut sub _substitute_minibatched_rows { - my ($self, $rows, $column_names, $template_hash, $step, $key_column) = @_; + my ($self, $rows, $column_names, $step, $key_column) = @_; my @ranges = (); @@ -293,21 +295,16 @@ sub _substitute_minibatched_rows { } } - # pseudo-parameters that will be substituted in the template hash: - $self->param('_range_start', $range_start); - $self->param('_range_end', $range_end); - $self->param('_range_count', $range_count); + my $job_range = { + '_range_start' => $range_start, + '_range_end' => $range_end, + '_range_count' => $range_count, - foreach my $i (0..scalar(@$start_row)-1) { - $self->param("_start_$i", $start_row->[$i]); - $self->param("_end_$i", $next_row->[$i]); - - if($column_names) { - $self->param('_start_'.$column_names->[$i], $start_row->[$i]); - $self->param('_end_'.$column_names->[$i], $next_row->[$i]); - } - } - push @ranges, $self->param_substitute($template_hash); + $column_names + ? map { ('_start_'.$column_names->[$_] => $start_row->[$_], '_end_'.$column_names->[$_] => $next_row->[$_]) } (0..scalar(@$start_row)-1) + : map { ("_start_$_" => $start_row->[$_], "_end_$_" => $next_row->[$_]) } (0..scalar(@$start_row)-1) + }; + push @ranges, $job_range; } return \@ranges; } diff --git a/scripts/standaloneJob.pl b/scripts/standaloneJob.pl index 1f6da8110..d5dfe5c95 100755 --- a/scripts/standaloneJob.pl +++ b/scripts/standaloneJob.pl @@ -51,8 +51,22 @@ foreach my $branch_code (keys %$flow_into) { my $heirs = $flow_into->{$branch_code}; $heirs = [ $heirs ] unless(ref($heirs)); # force scalar into an arrayref first + $heirs = { map { ($_ => undef) } @$heirs } if(ref($heirs) eq 'ARRAY'); # now force it into a hash if it wasn't - my @dataflow_rules = map { Bio::EnsEMBL::Hive::DataflowRule->new( -to_analysis_url => $_ ) } @$heirs; + my @dataflow_rules = (); + + while(my ($heir_url, $input_id_template_list) = each %$heirs) { + + $input_id_template_list = [ $input_id_template_list ] unless(ref($input_id_template_list) eq 'ARRAY'); # allow for more than one template per analysis + + foreach my $input_id_template (@$input_id_template_list) { + + push @dataflow_rules, Bio::EnsEMBL::Hive::DataflowRule->new( + -to_analysis_url => $heir_url, + -input_id_template => $input_id_template, + ); + } + } $job->dataflow_rules( $branch_code, \@dataflow_rules ); } -- GitLab