Commit c2eb1bf5 authored by Leo Gordon

merge the two ways of running the LongMult pipeline into one

parent 486e481c
......@@ -109,7 +109,11 @@ sub pipeline_create_commands {
* 'part_multiply' initially without jobs (they will flow from 'start')
* 'add_together' initially without jobs (they will flow from 'start').
All 'add_together' jobs will wait for completion of *all* 'part_multiply' jobs before their own execution (to ensure all data is available).
All 'add_together' jobs will wait for completion of 'part_multiply' jobs before their own execution (to ensure all data is available).
There are two control modes in this pipeline:
A. The default mode is to use the '2' dataflow rule from the 'start' analysis and a -wait_for rule in the 'add_together' analysis for analysis-wide synchronization.
B. The semaphored mode is to use the '2:1' semaphored dataflow rule from 'start' instead and comment out the analysis-wide -wait_for rule, relying on semaphores (both combinations are sketched out right after this fragment).
=cut
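For reference, the two combinations described above boil down to the following fragments (a consolidated sketch assembled from the hunks below, showing only the relevant keys of the 'start' and 'add_together' analysis definitions):

    # Mode A (default): plain branch-2 fan plus analysis-wide synchronization
    -flow_into => {                          # in the 'start' analysis
        2 => [ 'part_multiply' ],            # fan of partial-multiplication jobs
        1 => [ 'add_together' ],             # funnel job that adds the results
    },
    -wait_for => [ 'part_multiply' ],        # in the 'add_together' analysis

    # Mode B (semaphored): the fan blocks its own funnel, so no -wait_for is needed
    -flow_into => {                          # in the 'start' analysis
        '2:1' => [ 'part_multiply' ],        # semaphored fan linked to the branch-1 funnel
        1     => [ 'add_together' ],
    },
    # (the -wait_for rule in 'add_together' is commented out)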
......@@ -124,7 +128,8 @@ sub pipeline_analyses {
{ 'a_multiplier' => $self->o('second_mult'), 'b_multiplier' => $self->o('first_mult') },
],
-flow_into => {
'2:1' => [ 'part_multiply' ], # will create a fan of jobs
2 => [ 'part_multiply' ], # will create a fan of jobs
# '2:1' => [ 'part_multiply' ], # will create a semaphored fan of jobs (comment out the -wait_for rule from 'add_together')
1 => [ 'add_together' ], # will create a funnel job to wait for the fan to complete and add the results
},
},
......@@ -147,7 +152,7 @@ sub pipeline_analyses {
-input_ids => [
# (jobs for this analysis will be flown_into via branch-1 from 'start' jobs above)
],
# -wait_for => [ 'part_multiply' ], # we can only start adding when all partial products have been computed
-wait_for => [ 'part_multiply' ], # we can only start adding when all partial products have been computed
-flow_into => {
1 => [ ':////final_result' ],
},
......
=pod
=head1 NAME
Bio::EnsEMBL::Hive::PipeConfig::SemaLongMult_conf;
=head1 SYNOPSIS
# Example 1: specifying only the mandatory option:
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::SemaLongMult_conf -password <mypass>
# Example 2: specifying the mandatory options as well as overriding some defaults:
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::SemaLongMult_conf -password <mypass> -first_mult 2344556 -second_mult 777666555
=head1 DESCRIPTION
This is the PipeConfig file for the *semaphored* long multiplication pipeline example.
The main point of this pipeline is to provide an example of how to set up job-level semaphored control instead of using analysis-level control rules.
Please refer to Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf to understand how long multiplication pipeline works in its original form.
=head1 CONTACT
Please contact ehive-users@ebi.ac.uk mailing list with questions/suggestions.
=cut
package Bio::EnsEMBL::Hive::PipeConfig::SemaLongMult_conf;
use strict;
use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
=head2 default_options
Description : Implements default_options() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options.
In addition to the standard things it defines two options, 'first_mult' and 'second_mult', that are supposed to contain the long numbers to be multiplied.
=cut
sub default_options {
my ($self) = @_;
return {
%{ $self->SUPER::default_options() }, # inherit other stuff from the base class
'pipeline_name' => 'sema_long_mult', # name used by the beekeeper to prefix job names on the farm
'first_mult' => '9650516169', # the actual numbers to be multiplied can also be specified from the command line
'second_mult' => '327358788',
};
}
=head2 pipeline_create_commands
Description : Implements pipeline_create_commands() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands that will create and set up the Hive database.
In addition to the standard creation of the database and populating it with Hive tables and procedures it also creates two pipeline-specific tables used by Runnables to communicate.
=cut
sub pipeline_create_commands {
my ($self) = @_;
return [
@{$self->SUPER::pipeline_create_commands}, # inheriting database and hive tables' creation
# additional tables needed for long multiplication pipeline's operation:
'mysql '.$self->dbconn_2_mysql('pipeline_db', 1)." -e 'CREATE TABLE intermediate_result (a_multiplier char(40) NOT NULL, digit tinyint NOT NULL, result char(41) NOT NULL, PRIMARY KEY (a_multiplier, digit))'",
'mysql '.$self->dbconn_2_mysql('pipeline_db', 1)." -e 'CREATE TABLE final_result (a_multiplier char(40) NOT NULL, b_multiplier char(40) NOT NULL, result char(80) NOT NULL, PRIMARY KEY (a_multiplier, b_multiplier))'",
];
}
=head2 pipeline_analyses
Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
Here it defines three analyses:
* 'sema_start' with two jobs (multiply 'first_mult' by 'second_mult' and vice versa - to check the commutativity of multiplication).
Each job will dataflow (create more jobs) via branch #2 into 'part_multiply' and via branch #1 into 'add_together'.
Unlike LongMult_conf, there is no analysis-level control here, but the SemaStart analysis itself is more intelligent
in that it can dataflow a group of partial multiplication jobs on branch #2 linked with one job on branch #1 by a semaphore.
* 'part_multiply' initially without jobs (they will flow from 'start')
* 'add_together' initially without jobs (they will flow from 'start').
Unlike LongMult_conf, here we do not use analysis control but rely on job-level semaphores to keep the jobs in sync.
=cut
sub pipeline_analyses {
my ($self) = @_;
return [
{ -logic_name => 'sema_start',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::LongMult::SemaStart',
-parameters => {},
-input_ids => [
{ 'a_multiplier' => $self->o('first_mult'), 'b_multiplier' => $self->o('second_mult') },
{ 'a_multiplier' => $self->o('second_mult'), 'b_multiplier' => $self->o('first_mult') },
],
-flow_into => {
2 => [ 'part_multiply' ], # will create a fan of jobs
1 => [ 'add_together' ], # will create a funnel job to wait for the fan to complete and add the results
},
},
{ -logic_name => 'part_multiply',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::LongMult::PartMultiply',
-parameters => {},
-input_ids => [
# (jobs for this analysis will be flown_into via branch-2 from 'sema_start' jobs above)
],
-flow_into => {
1 => [ 'mysql:////intermediate_result' ],
},
},
{ -logic_name => 'add_together',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::LongMult::AddTogether',
-parameters => {},
-input_ids => [
# (jobs for this analysis will be flown_into via branch-1 from 'sema_start' jobs above)
],
# jobs in this analysis are semaphored, so no need for '-wait_for'
-flow_into => {
1 => [ 'mysql:////final_result' ],
},
},
];
}
1;
......@@ -3,16 +3,14 @@
# Bio::EnsEMBL::Hive::RunnableDB::LongMult is an example eHive pipeline.
#
#
# Please have a look at the pipeline configuration files
# Please have a look at the pipeline configuration file
#
# ensembl-hive/modules/Bio/EnsEMBL/Hive/PipeConfig/LongMult_conf.pm
# and
# ensembl-hive/modules/Bio/EnsEMBL/Hive/PipeConfig/SemaLongMult_conf.pm
#
# which are used to load the Long Multiplication pipeline in "analysis control" and "semaphore job control" modes respectively.
# which is used to load the Long Multiplication pipeline either in "analysis control" or "semaphore job control" mode.
#
#
# Create these pipelines using init_pipeline.pl and run them using beekeeper.pl in step-by-step mode (use -run instead of -loop option).
# Create this pipeline using init_pipeline.pl and run it using beekeeper.pl in step-by-step mode (use -run instead of -loop option).
#
###########################################################################################################################################
=pod
=head1 NAME
Bio::EnsEMBL::Hive::RunnableDB::LongMult::SemaStart
=head1 SYNOPSIS
Please refer to Bio::EnsEMBL::Hive::PipeConfig::SemaLongMult_conf pipeline configuration file
to understand how this particular example pipeline is configured and run.
=head1 DESCRIPTION
'LongMult::SemaStart' is an alternative first step of the LongMult example pipeline that multiplies two long numbers.
In the same manner as 'LongMult::Start', it takes apart the second multiplier and creates several 'LongMult::PartMultiply' jobs
that correspond to the different digits of the second multiplier.
However, instead of using analysis-level control mechanisms (control-flow and data-flow rules),
it uses counting semaphores as a finer-grained, job-level control mechanism,
which allows several different multiplications to run independently of each other (a toy sketch of the counting idea is given below).
=cut
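As a rough illustration of the counting idea mentioned above (a toy sketch only, not eHive's actual implementation): the funnel job starts blocked behind a counter equal to the number of fan jobs, every completed fan job decrements that counter, and the funnel becomes runnable only once the counter reaches zero.

    # toy model of a counting semaphore (illustration only, not eHive code):
    my $semaphore_count = 5;                       # e.g. five 'part_multiply' fan jobs block one funnel
    foreach my $completed_fan_job ( 1 .. 5 ) {
        $semaphore_count--;                        # each completed fan job decrements the funnel's counter
    }
    print "funnel job is now ready to run\n" if $semaphore_count == 0;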
package Bio::EnsEMBL::Hive::RunnableDB::LongMult::SemaStart;
use strict;
use base ('Bio::EnsEMBL::Hive::Process');
=head2 fetch_input
Description : Implements fetch_input() interface method of Bio::EnsEMBL::Hive::Process that is used to read in parameters and load data.
Here the task of fetch_input() is to read in the two multipliers, split the second one into digits and create a set of input_ids that will be used later.
param('a_multiplier'): The first long number (a string of digits - doesn't have to fit a register).
param('b_multiplier'): The second long number (also a string of digits).
=cut
sub fetch_input {
my $self = shift @_;
my $a_multiplier = $self->param('a_multiplier') || die "'a_multiplier' is an obligatory parameter";
my $b_multiplier = $self->param('b_multiplier') || die "'b_multiplier' is an obligatory parameter";
my %digit_hash = ();
foreach my $digit (split(//,$b_multiplier)) {
next if (($digit eq '0') or ($digit eq '1'));
$digit_hash{$digit}++;
}
# output_ids of partial multiplications to be computed:
my @output_ids = map { { 'a_multiplier' => $a_multiplier, 'digit' => $_ } } keys %digit_hash;
# store them for future use:
$self->param('output_ids', \@output_ids);
}
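As a concrete illustration of the digit handling above: with the default 'second_mult' of '327358788' (see SemaLongMult_conf) the hash keeps five unique digits, so five partial-multiplication output_ids are stored. A minimal standalone sketch (not part of this module):

    my %digit_hash = ();
    foreach my $digit (split(//, '327358788')) {
        next if (($digit eq '0') or ($digit eq '1'));   # digits 0 and 1 yield trivial partial products, so they are skipped
        $digit_hash{$digit}++;
    }
    print join(',', sort keys %digit_hash), "\n";       # prints: 2,3,5,7,8  (five unique digits)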
=head2 run
Description : Implements run() interface method of Bio::EnsEMBL::Hive::Process that is used to perform the main bulk of the job (minus input and output).
Here we don't have any real work to do, just input and output, so run() remains empty.
=cut
sub run {
}
=head2 write_output
Description : Implements write_output() interface method of Bio::EnsEMBL::Hive::Process that is used to deal with job's output after the execution.
Here we first dataflow the original task down branch-1 (create the semaphored "funnel job") - this yields $funnel_job_id,
then "fan out" the partial multiplication tasks into branch-2, and pass the $funnel_job_id to all of them.
=cut
sub write_output { # nothing to write out, but some dataflow to perform:
my $self = shift @_;
my $output_ids = $self->param('output_ids');
# first we flow the branch#1 into the (semaphored) funnel job:
my ($funnel_job_id) = @{ $self->dataflow_output_id($self->input_id, 1, { -semaphore_count => scalar(@$output_ids) }) };
# then we fan out into branch#2, and pass the $funnel_job_id to all of them
my $fan_job_ids = $self->dataflow_output_id($output_ids, 2, { -semaphored_job_id => $funnel_job_id } );
$self->warning(scalar(@$output_ids).' multiplication jobs have been created'); # warning messages get recorded into 'job_message' table
}
1;
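For the write_output() above, under the default options of SemaLongMult_conf ('a_multiplier' => '9650516169', 'b_multiplier' => '327358788') fetch_input() keeps five unique digits, so the two dataflow calls conceptually amount to the following (a sketch with the count substituted in):

    # one semaphored funnel job is created on branch-1, initially blocked by a count of 5;
    # it has to be created first because its job_id is what the fan jobs point back at:
    my ($funnel_job_id) = @{ $self->dataflow_output_id( $self->input_id, 1, { -semaphore_count => 5 } ) };

    # five fan jobs are created on branch-2, each holding a reference to the funnel they unblock:
    my $fan_job_ids = $self->dataflow_output_id( $output_ids, 2, { -semaphored_job_id => $funnel_job_id } );

    # the recorded warning then reads: '5 multiplication jobs have been created'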