Commit 1b933558 authored by Leo Gordon's avatar Leo Gordon
Browse files

dir_revhash is now an importable Util subroutine that is used by both Worker and JobFactory

parent e9226c6b
......@@ -40,6 +40,8 @@ package Bio::EnsEMBL::Hive::RunnableDB::JobFactory;
use strict;
use DBI;
use Bio::EnsEMBL::Hive::Utils ('dir_revhash'); # import dir_revhash
use base ('Bio::EnsEMBL::Hive::ProcessWithParams');
=head2 fetch_input
......@@ -67,6 +69,9 @@ sub fetch_input {
param('key_column'): If every line of your input is a list (it happens, for example, when your SQL returns multiple columns or you have set the 'delimiter' in file/cmd mode)
this is the way to say which column is undergoing 'ranging'
param('hashed_column_number'): if defined, it is used as the (0-based) index of a column in each input row; the value found in that column is converted by dir_revhash() and the result is appended to the row as an extra field.
# The following 4 parameters are mutually exclusive and define the source of ids for the jobs:
param('inputlist'); The list is explicitly given in the parameters, can be abbreviated: 'inputlist' => ['a'..'z']
......@@ -89,6 +94,8 @@ sub run {
my $key_column = $self->param('key_column') || 0;
my $delimiter = $self->param('delimiter');
my $hashed_column_number = $self->param('hashed_column_number'); # skip this step if undefined
my $inputlist = $self->param('inputlist');
my $inputfile = $self->param('inputfile');
my $inputquery = $self->param('inputquery');
......@@ -104,6 +111,17 @@ sub run {
_fisher_yates_shuffle_in_place($list);
}
if(defined($hashed_column_number) and scalar(@$list)) {
if(!ref($list->[0])) {
$list = [ map { [$_] } @$list ]; # create the second dimension if it was missing
}
foreach my $row (@$list) {
push @$row, dir_revhash($row->[$hashed_column_number]);
}
}
my $output_ids = $self->_split_list_into_ranges($template_hash, $list, $step, $key_column);
$self->param('output_ids', $output_ids);
}
......
......@@ -44,7 +44,7 @@ use warnings;
use Data::Dumper;
use Exporter 'import';
our @EXPORT_OK = qw( stringify destringify );
our @EXPORT_OK = qw( stringify destringify dir_revhash );
=head2 stringify
......@@ -96,5 +96,24 @@ sub destringify {
return $value;
}
=head2 dir_revhash
Description: This function takes in a string (which is usually a numeric id) and turns its reverse into a multilevel directory hash.
Please note that no directory is created at this step - it is purely a string conversion function.
Callers : Bio::EnsEMBL::Hive::Worker # hashing of the worker output directories
Bio::EnsEMBL::Hive::RunnableDB::JobFactory # hashing of an arbitrary id
=cut
sub dir_revhash {
    # The id is always taken from the LAST argument, so the routine works
    # both as a plain function, dir_revhash($id), and when an invocant
    # precedes the id in the argument list.
    my $id = pop;

    # Break the id into single characters and discard the leading one:
    # the first character of the id never takes part in the hash.
    my @chars = split(//, $id);
    shift @chars;

    # The remaining characters, last-to-first, become the nested path
    # components. Ids of zero or one character yield an empty string.
    return join('/', reverse @chars);
}
1;
......@@ -78,6 +78,8 @@ use Bio::EnsEMBL::Hive::DBSQL::DataflowRuleAdaptor;
use Bio::EnsEMBL::Hive::Extensions;
use Bio::EnsEMBL::Hive::Process;
use Bio::EnsEMBL::Hive::Utils ('dir_revhash'); # import dir_revhash
## Minimum amount of time in msec that a worker should run before reporting
## back to the hive. This is used when setting the batch_size automatically.
## 120000 msec = 2 minutes
......@@ -324,11 +326,8 @@ sub worker_output_dir {
} elsif( my $hive_output_dir = $self->hive_output_dir ) {
my $worker_id = $self->worker_id();
my @dirs = ( $hive_output_dir, reverse(split(//, $worker_id)) );
pop @dirs; # do not use the first digit for hashing
push @dirs, "worker_id_${worker_id}";
$worker_output_dir = join('/', @dirs);
$worker_output_dir = join('/', $hive_output_dir, dir_revhash($worker_id), 'worker_id_'.$worker_id );
}
if($worker_output_dir) { # will not attempt to create if set to false
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment