From 7522db1dc20b4bd712794faddf2593d31da09f8d Mon Sep 17 00:00:00 2001 From: Leo Gordon <lg4@ebi.ac.uk> Date: Thu, 3 Jun 2010 14:54:08 +0000 Subject: [PATCH] JobFactory can now generate input_ids from multi-column input; splitting on param("delimiter") has not been tested though --- .../Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm | 79 +++++++++++-------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm b/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm index e205e4822..42117ffde 100644 --- a/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm +++ b/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm @@ -62,6 +62,11 @@ sub fetch_input { param('randomize'): Shuffles the ids before creating jobs - can sometimes lead to better overall performance of the pipeline. Doesn't make any sence for minibatches (step>1). + param('delimiter'): If you set it your lines in file/cmd mode will be split into columns that you can use individually when constructing the template input_id hash. 
+ + param('key_column'): If every line of your input is a list (this happens, for example, when your SQL returns multiple columns or you have set the 'delimiter' in file/cmd mode), + this parameter specifies the index (starting from 0) of the column that undergoes 'ranging' + # The following 4 parameters are mutually exclusive and define the source of ids for the jobs: param('inputlist'); The list is explicitly given in the parameters, can be abbreviated: 'inputlist' => ['a'..'z'] @@ -81,22 +86,25 @@ sub run { my $step = $self->param('step') || 1; my $randomize = $self->param('randomize') || 0; + my $key_column = $self->param('key_column') || 0; + my $delimiter = $self->param('delimiter'); + my $inputlist = $self->param('inputlist'); my $inputfile = $self->param('inputfile'); my $inputquery = $self->param('inputquery'); my $inputcmd = $self->param('inputcmd'); my $list = $self->param_substitute( $inputlist ) - || ($inputfile && $self->_make_list_from_file( $self->param_substitute( $inputfile ) )) || ($inputquery && $self->_make_list_from_query( $self->param_substitute( $inputquery ) )) - || ($inputcmd && $self->_make_list_from_cmd( $self->param_substitute( $inputcmd ) )) + || ($inputfile && $self->_make_list_from_open( $self->param_substitute( $inputfile ), $delimiter )) + || ($inputcmd && $self->_make_list_from_open( $self->param_substitute( $inputcmd ).'
|', $delimiter )) || die "range of values should be defined by setting 'inputlist', 'inputfile' or 'inputquery'"; if($randomize) { _fisher_yates_shuffle_in_place($list); } - my $output_ids = $self->_split_list_into_ranges($template_hash, $list, $step); + my $output_ids = $self->_split_list_into_ranges($template_hash, $list, $step, $key_column); $self->param('output_ids', $output_ids); } @@ -135,23 +143,6 @@ sub write_output { # nothing to write out, but some dataflow to perform: ################################### main functionality starts here ################### -=head2 _make_list_from_file - - Description: this is a private method that loads ids from a given file - -=cut - -sub _make_list_from_file { - my ($self, $inputfile) = @_; - - open(FILE, $inputfile) or die $!; - my @lines = <FILE>; - chomp @lines; - close(FILE); - - return \@lines; -} - =head2 _make_list_from_query Description: this is a private method that loads ids from a given sql query @@ -170,30 +161,36 @@ sub _make_list_from_query { $dbc = $self->db->dbc; } - my @ids = (); + my @list = (); my $sth = $dbc->prepare($inputquery); $sth->execute(); - while (my ($id)=$sth->fetchrow_array()) { - push @ids, $id; + while (my @cols = $sth->fetchrow_array()) { + push @list, scalar(@cols)==1 ? $cols[0] : \@cols; } $sth->finish(); - return \@ids; + return \@list; } -=head2 _make_list_from_cmd +=head2 _make_list_from_open - Description: this is a private method that loads ids from a given command line + Description: this is a private method that loads ids from a given file or command pipe =cut -sub _make_list_from_cmd { - my ($self, $inputcmd) = @_; +sub _make_list_from_open { + my ($self, $input_file_or_pipe, $delimiter) = @_; - my @lines = `$inputcmd`; - chomp @lines; + my @list = (); + open(FILE, $input_file_or_pipe) or die "Could not open '$input_file_or_pipe' because: $!"; + while(my $line = <FILE>) { + chomp $line; - return \@lines; + push @list, defined($delimiter) ? 
[ split(/$delimiter/, $line) ] : $line; + } + close FILE; + + return \@list; } =head2 _split_list_into_ranges @@ -203,22 +200,27 @@ sub _make_list_from_cmd { =cut sub _split_list_into_ranges { - my ($self, $template_hash, $list, $step) = @_; + my ($self, $template_hash, $list, $step, $key_column) = @_; my @ranges = (); while(@$list) { - my $range_start = shift @$list; + my $start_line = shift @$list; + my $range_start = (ref($start_line) eq 'ARRAY') ? $start_line->[$key_column] : $start_line; + my $range_end = $range_start; my $range_count = 1; + my $next_line = $start_line; # safety, in case next while doesn't execute even once while($range_count<$step && @$list) { - my $next_value = shift @$list; + $next_line = shift @$list; + my $next_value = (ref($next_line) eq 'ARRAY') ? $next_line->[$key_column] : $next_line; + my $predicted_next = $range_end; if(++$predicted_next eq $next_value) { $range_end = $next_value; $range_count++; } else { - unshift @$list, $next_value; + unshift @$list, $next_line; last; } } @@ -228,6 +230,13 @@ sub _split_list_into_ranges { $self->param('_range_end', $range_end); $self->param('_range_count', $range_count); + if(ref($start_line) eq 'ARRAY') { + foreach my $i (0..scalar(@$start_line)-1) { + $self->param("_start_$i", $start_line->[$i]); + $self->param("_end_$i", $next_line->[$i]); + } + } + push @ranges, $self->param_substitute($template_hash); } return \@ranges; -- GitLab