Commit 5f218bee authored by Leo Gordon's avatar Leo Gordon
Browse files

pipeline_name is now automatically computed from ClassName; simplified...

pipeline_name is now automatically computed from ClassName; simplified workshop's example files and slides
parent 32a23668
......@@ -36,23 +36,6 @@ use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
=head2 default_options
Description : Implements default_options() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options.
Redefines the current pipeline_name. There is also an invisible dependency on o('password') which has to be defined.
=cut
sub default_options {
    my ($self) = @_;

    # Start from the inherited defaults, then override the pipeline name.
    my %options = %{ $self->SUPER::default_options() };     # inherit other stuff from the base class
    $options{'pipeline_name'} = 'compress_files';           # name used by the beekeeper to prefix job names on the farm

    return \%options;
}
=head2 pipeline_analyses
Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
......
......@@ -97,7 +97,7 @@ sub default_options {
'user' => $ENV{'EHIVE_USER'} || 'ensadmin',
'password' => $ENV{'EHIVE_PASS'} // $ENV{'ENSADMIN_PSW'} // $self->o('password'), # people will have to make an effort NOT to insert it into config files like .bashrc etc
'dbowner' => $ENV{'EHIVE_USER'} || $ENV{'USER'} || $self->o('dbowner'), # although it is very unlikely $ENV{USER} is not set
'pipeline_name' => 'hive_generic',
'pipeline_name' => $self->pipeline_name(),
'hive_use_triggers' => 0, # there have been a few cases of big pipelines misbehaving with triggers on, let's keep the default off.
'hive_use_param_stack' => 0, # do not reconstruct the calling stack of parameters by default (yet)
......@@ -356,6 +356,22 @@ sub pipeline_url {
}
sub pipeline_name {
    my ($self, $pipeline_name) = @_;

    if (not $pipeline_name) {               # no explicit name given: derive one from the ClassName
        $pipeline_name = ref($self);        # the fully-qualified class name of this PipeConfig object
        $pipeline_name =~ s/^.*:://;        # drop the leading package path
        $pipeline_name =~ s/_conf$//;       # drop the conventional _conf suffix, if present
    }

    # Turn CamelCase into Camel_Case, then lowercase the whole thing:
    $pipeline_name =~ s/([[:lower:]])([[:upper:]])/${1}_${2}/g;

    return lc $pipeline_name;
}
=head2 process_options
Description : The method that does all the parameter parsing magic.
......@@ -394,6 +410,7 @@ sub run {
my $analysis_topup = $self->{'_extra_options'}{'analysis_topup'};
my $job_topup = $self->{'_extra_options'}{'job_topup'};
my $pipeline_url = $self->pipeline_url();
my $pipeline_name = $self->o('pipeline_name');
unless($analysis_topup || $job_topup) {
foreach my $cmd (@{$self->pipeline_create_commands}) {
......@@ -688,7 +705,7 @@ sub run {
print "\trunWorker.pl -url $pipeline_url ".$self->beekeeper_extra_cmdline_options()." \t\t# run exactly one Worker locally (useful for debugging/learning)\n";
print "\n";
print " # At any moment during or after execution you can request a pipeline diagram in an image file (desired format is set via extension) :\n";
print "\tgenerate_graph.pl -url $pipeline_url -out diagram.png\n";
print "\tgenerate_graph.pl -url $pipeline_url -out $pipeline_name.png\n";
print "\n";
print " # Peek into your pipeline database with a database client (useful to have open while the pipeline is running) :\n";
print "\tdb_cmd.pl -url $pipeline_url\n\n";
......
......@@ -7,32 +7,31 @@
=head1 SYNOPSIS
# Example 1: specifying only the mandatory option (numbers to be multiplied are taken from defaults)
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::LongMultSt_conf -password <mypass>
# initialize the database and build the graph in it (it will also print the value of EHIVE_URL) :
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf -password <mypass>
# Example 2: specifying the mandatory options as well as overriding the default numbers to be multiplied:
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::LongMultSt_conf -password <mypass> -first_mult 2344556 -second_mult 777666555
# optionally also seed it with your specific values:
seed_pipeline.pl -url $EHIVE_URL -logic_name take_b_apart -input_id '{ "a_multiplier" => "12345678", "b_multiplier" => "3359559666" }'
# Example 3: do not re-create the database, just load another multiplication task into an existing one:
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::LongMultSt_conf -job_topup -password <mypass> -first_mult 1111222233334444 -second_mult 38578377835
# run the pipeline:
beekeeper.pl -url $EHIVE_URL -loop
=head1 DESCRIPTION
This is an experimental version of LongMult_conf with hive_use_param_stack switched on.
This is a special version of LongMult_conf with hive_use_param_stack mode switched on.
This is the PipeConfig file for the long multiplication pipeline example.
The main point of this pipeline is to provide an example of how to write Hive Runnables and link them together into a pipeline.
Please refer to Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf module to understand the interface implemented here.
The setting. Let's assume we are given two loooooong numbers to multiply. Reeeeally long.
So long that they do not fit into registers of the CPU and should be multiplied digit-by-digit.
Soooo long that they do not fit into registers of the CPU and should be multiplied digit-by-digit.
For the purposes of this example we also assume this task is very computationally intensive and has to be done in parallel.
The long multiplication pipeline consists of three "analyses" (types of tasks): 'take_b_apart', 'part_multiply' and 'add_together'
that we will be using to exemplify various features of the Hive.
The long multiplication pipeline consists of three "analyses" (types of tasks):
'take_b_apart', 'part_multiply' and 'add_together' that we use to exemplify various features of the Hive.
* A 'take_b_apart' job takes in two string parameters, 'a_multiplier' and 'b_multiplier',
takes the second one apart into digits, finds what _different_ digits are there,
......@@ -60,29 +59,6 @@ use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
=head2 default_options
Description : Implements default_options() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options.
In addition to the standard things it defines two options, 'first_mult' and 'second_mult' that are supposed to contain the long numbers to be multiplied.
=cut
sub default_options {
    my ($self) = @_;

    # Start from the inherited defaults, then layer the pipeline-specific ones on top.
    my %options = %{ $self->SUPER::default_options() };     # inherit other stuff from the base class

    $options{'pipeline_name'} = 'long_mult';                # name used by the beekeeper to prefix job names on the farm
    $options{'first_mult'}    = '9650156169';               # the actual numbers to be multiplied can also be specified from the command line
    $options{'second_mult'}   = '327358788';
    $options{'take_time'}     = 1;                          # how much time (in seconds) should each job take -- to slow things down

    return \%options;
}
=head2 pipeline_create_commands
Description : Implements pipeline_create_commands() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands that will create and set up the Hive database.
......@@ -115,7 +91,7 @@ sub pipeline_wide_parameters {
return {
%{$self->SUPER::pipeline_wide_parameters}, # here we inherit anything from the base class
'take_time' => $self->o('take_time'),
'take_time' => 1,
};
}
......@@ -134,18 +110,16 @@ sub hive_meta_table {
Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
Here it defines three analyses:
* 'take_b_apart' with two jobs (multiply 'first_mult' by 'second_mult' and vice versa - to check the commutativity of multiplication).
* 'take_b_apart' that is auto-seeded with a pair of jobs (to check the commutativity of multiplication).
Each job will dataflow (create more jobs) via branch #2 into 'part_multiply' and via branch #1 into 'add_together'.
* 'part_multiply' initially without jobs (they will flow from 'take_b_apart')
* 'add_together' initially without jobs (they will flow from 'take_b_apart').
All 'add_together' jobs will wait for completion of 'part_multiply' jobs before their own execution (to ensure all data is available).
* 'part_multiply' with jobs fed from take_b_apart#2.
It multiplies input parameters 'a_multiplier' and 'digit' and dataflows 'partial_product' parameter into branch #1.
There are two control modes in this pipeline:
A. The default mode is to use the '2' and '1' dataflow rules from 'take_b_apart' analysis and a -wait_for rule in 'add_together' analysis for analysis-wide synchronization.
B. The semaphored mode is to use '2->A' and 'A->1' semaphored dataflow rules from 'take_b_apart' instead, and comment out the analysis-wide -wait_for rule, relying on semaphores.
* 'add_together' with jobs fed from take_b_apart#1.
It adds together results of partial multiplication computed by 'part_multiply'.
These results are accumulated in 'partial_product' hash.
Until the hash is complete the corresponding 'add_together' job is blocked by a semaphore.
=cut
......@@ -157,8 +131,8 @@ sub pipeline_analyses {
-meadow_type=> 'LOCAL', # do not bother the farm with such a simple task (and get it done faster)
-analysis_capacity => 2, # use per-analysis limiter
-input_ids => [
{ 'a_multiplier' => $self->o('first_mult'), 'b_multiplier' => $self->o('second_mult') },
{ 'a_multiplier' => $self->o('second_mult'), 'b_multiplier' => $self->o('first_mult') },
{ 'a_multiplier' => '9650156169', 'b_multiplier' => '327358788' },
{ 'a_multiplier' => '327358788', 'b_multiplier' => '9650156169' },
],
-flow_into => {
'2->A' => [ 'part_multiply' ], # will create a semaphored fan of jobs; will use param_stack mechanism to pass parameters around
......
......@@ -7,14 +7,14 @@
=head1 SYNOPSIS
# Example 1: specifying only the mandatory option (numbers to be multiplied are taken from defaults)
# initialize the database and build the graph in it (it will also print the value of EHIVE_URL) :
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf -password <mypass>
# Example 2: specifying the mandatory options as well as overriding the default numbers to be multiplied:
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf -password <mypass> -first_mult 2344556 -second_mult 777666555
# optionally also seed it with your specific values:
seed_pipeline.pl -url $EHIVE_URL -logic_name take_b_apart -input_id '{ "a_multiplier" => "12345678", "b_multiplier" => "3359559666" }'
# Example 3: do not re-create the database, just load another multiplication task into an existing one:
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf -job_topup -password <mypass> -first_mult 1111222233334444 -second_mult 38578377835
# run the pipeline:
beekeeper.pl -url $EHIVE_URL -loop
=head1 DESCRIPTION
......@@ -24,13 +24,12 @@ init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf -job_topup -passw
Please refer to Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf module to understand the interface implemented here.
The setting. Let's assume we are given two loooooong numbers to multiply. Reeeeally long.
So long that they do not fit into registers of the CPU and should be multiplied digit-by-digit.
The setting. Let's assume we are given two loooooong numbers to multiply. Reeeeally long.
Soooo long that they do not fit into registers of the CPU and should be multiplied digit-by-digit.
For the purposes of this example we also assume this task is very computationally intensive and has to be done in parallel.
The long multiplication pipeline consists of three "analyses" (types of tasks): 'take_b_apart', 'part_multiply' and 'add_together'
that we will be using to exemplify various features of the Hive.
The long multiplication pipeline consists of three "analyses" (types of tasks):
'take_b_apart', 'part_multiply' and 'add_together' that we use to exemplify various features of the Hive.
* A 'take_b_apart' job takes in two string parameters, 'a_multiplier' and 'b_multiplier',
takes the second one apart into digits, finds what _different_ digits are there,
......@@ -58,29 +57,6 @@ use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
=head2 default_options
Description : Implements default_options() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options.
In addition to the standard things it defines two options, 'first_mult' and 'second_mult' that are supposed to contain the long numbers to be multiplied.
=cut
sub default_options {
    my ($self) = @_;

    # Pipeline-specific defaults; the multipliers may still be overridden from the command line.
    my %own_defaults = (
        'pipeline_name' => 'long_mult',     # name used by the beekeeper to prefix job names on the farm
        'first_mult'    => '9650156169',    # the actual numbers to be multiplied can also be specified from the command line
        'second_mult'   => '327358788',
        'take_time'     => 1,               # how much time (in seconds) should each job take -- to slow things down
    );

    return {
        %{ $self->SUPER::default_options() },   # inherit other stuff from the base class
        %own_defaults,
    };
}
=head2 pipeline_create_commands
Description : Implements pipeline_create_commands() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands that will create and set up the Hive database.
......@@ -113,7 +89,7 @@ sub pipeline_wide_parameters {
return {
%{$self->SUPER::pipeline_wide_parameters}, # here we inherit anything from the base class
'take_time' => $self->o('take_time'),
'take_time' => 1,
};
}
......@@ -122,18 +98,16 @@ sub pipeline_wide_parameters {
Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
Here it defines three analyses:
* 'take_b_apart' with two jobs (multiply 'first_mult' by 'second_mult' and vice versa - to check the commutativity of multiplication).
* 'take_b_apart' that is auto-seeded with a pair of jobs (to check the commutativity of multiplication).
Each job will dataflow (create more jobs) via branch #2 into 'part_multiply' and via branch #1 into 'add_together'.
* 'part_multiply' initially without jobs (they will flow from 'take_b_apart')
* 'add_together' initially without jobs (they will flow from 'take_b_apart').
All 'add_together' jobs will wait for completion of 'part_multiply' jobs before their own execution (to ensure all data is available).
* 'part_multiply' with jobs fed from take_b_apart#2.
It multiplies input parameters 'a_multiplier' and 'digit' and dataflows 'partial_product' parameter into branch #1.
There are two control modes in this pipeline:
A. The default mode is to use the '2' and '1' dataflow rules from 'take_b_apart' analysis and a -wait_for rule in 'add_together' analysis for analysis-wide synchronization.
B. The semaphored mode is to use '2->A' and 'A->1' semaphored dataflow rules from 'take_b_apart' instead, and comment out the analysis-wide -wait_for rule, relying on semaphores.
* 'add_together' with jobs fed from take_b_apart#1.
It adds together results of partial multiplication computed by 'part_multiply'.
These results are accumulated in 'partial_product' hash.
Until the hash is complete the corresponding 'add_together' job is blocked by a semaphore.
=cut
......@@ -145,12 +119,14 @@ sub pipeline_analyses {
-meadow_type=> 'LOCAL', # do not bother the farm with such a simple task (and get it done faster)
-analysis_capacity => 2, # use per-analysis limiter
-input_ids => [
{ 'a_multiplier' => $self->o('first_mult'), 'b_multiplier' => $self->o('second_mult') },
{ 'a_multiplier' => $self->o('second_mult'), 'b_multiplier' => $self->o('first_mult') },
{ 'a_multiplier' => '9650156169', 'b_multiplier' => '327358788' },
{ 'a_multiplier' => '327358788', 'b_multiplier' => '9650156169' },
],
-flow_into => {
'2->A' => { 'part_multiply' => { 'a_multiplier' => '#a_multiplier#', 'digit' => '#digit#' } }, # will create a semaphored fan of jobs; will use a template to top-up the hashes
'A->1' => [ 'add_together' ], # will create a semaphored funnel job to wait for the fan to complete and add the results
# will create a semaphored fan of jobs; will use a template to top-up the hashes:
'2->A' => { 'part_multiply' => { 'a_multiplier' => '#a_multiplier#', 'digit' => '#digit#', 'take_time' => '#take_time#' } },
# will create a semaphored funnel job to wait for the fan to complete and add the results:
'A->1' => [ 'add_together' ],
},
},
......
......@@ -33,16 +33,6 @@ use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
sub default_options {
    my ($self) = @_;

    # Take the base-class defaults and override just the pipeline name.
    my %options = %{ $self->SUPER::default_options() };     # inherit other stuff from the base class
    $options{'pipeline_name'} = 'memlimit_test';            # name used by the beekeeper to prefix job names on the farm

    return \%options;
}
sub resource_classes {
my ($self) = @_;
return {
......
......@@ -37,21 +37,6 @@ use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
=head2 default_options
Description : Implements default_options() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options.
=cut
sub default_options {
    my ($self) = @_;

    # Take the base-class defaults and override just the pipeline name.
    my %options = %{ $self->SUPER::default_options() };     # inherit other stuff from the base class
    $options{'pipeline_name'} = 'zip_tables';               # name used by the beekeeper to prefix job names on the farm

    return \%options;
}
=head2 pipeline_wide_parameters
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment