Commit 8cf4b04b authored by Leo Gordon

A cleaner example of a two-analysis pipeline with better demonstration of #substitution# and only implicit $self->o() references
parent dabde0fa
@@ -3,17 +3,19 @@
=head1 NAME
Bio::EnsEMBL::Hive::PipeConfig::FileZipperUnzipper_conf
Bio::EnsEMBL::Hive::PipeConfig::CompressFiles_conf
=head1 SYNOPSIS
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::FileZipperUnzipper_conf -password <your_password> -directory $HOME/ncbi_taxonomy -unzip 1
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::CompressFiles_conf -password <your_password>
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::FileZipperUnzipper_conf -password <your_password> -directory directory_with_huge_dumps -only_files '*.sql'
seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => 'dumps', 'only_files' => '*.sql' }"
seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => '$HOME/ncbi_taxonomy', 'gzip_flags' => '-d' }"
=head1 DESCRIPTION
This is an example pipeline put together from basic building blocks:
This is an example pipeline put together from two basic building blocks:
Analysis_1: JobFactory.pm is used to turn the list of files in a given directory into jobs
@@ -27,7 +29,8 @@
=cut
package Bio::EnsEMBL::Hive::PipeConfig::FileZipperUnzipper_conf;
package Bio::EnsEMBL::Hive::PipeConfig::CompressFiles_conf;
use strict;
use warnings;
@@ -37,82 +40,72 @@ use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive datab
=head2 default_options
Description : Implements default_options() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options.
In addition to the standard things it defines three options:
o('unzip') controls whether the files will be zipped or unzipped (zipped by default)
o('only_files') defines which files in the directory will be (un)zipped
o('zipping_capacity') defines how many files can be zipped in parallel
There are rules dependent on two options that do not have defaults (this makes them mandatory):
o('password') your read-write password for creation and maintenance of the hive database
o('directory') name of the directory where the files are to be (un)zipped
Redefines the current pipeline_name. There is also an invisible dependency on o('password'), which has to be defined.
=cut
sub default_options {
my ($self) = @_;
return {
%{ $self->SUPER::default_options() }, # inherit other stuff from the base class
'pipeline_name' => 'zip_unzip_files', # name used by the beekeeper to prefix job names on the farm
%{ $self->SUPER::default_options() }, # inherit other stuff from the base class
'unzip' => 0, # set to '1' to switch to decompression
'only_files' => '*', # use '*.sql*' to only (un)zip these files
'zipping_capacity' => 10, # how many files can be (un)zipped in parallel
'pipeline_name' => 'compress_files', # name used by the beekeeper to prefix job names on the farm
};
}
=head2 pipeline_create_commands
Description : Implements pipeline_create_commands() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands that will create and set up the Hive database.
It is just the standard stuff, so we could just as well have omitted this method altogether.
=head2 pipeline_wide_parameters
Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs.
The value doesn't have to be a scalar; it can be any Perl structure, which will be stringified and de-stringified automagically (see the sketch after this method).
=cut
sub pipeline_create_commands {
sub pipeline_wide_parameters {
my ($self) = @_;
return [
@{$self->SUPER::pipeline_create_commands}, # inheriting database and hive tables' creation
];
return {
%{$self->SUPER::pipeline_wide_parameters}, # here we inherit anything from the base class, then add our own stuff
'gzip_flags' => '', # can be set to '-d' for decompression
'directory' => '.', # directory where both source and target files are located
'only_files' => '*', # any wildcard understood by shell
};
}
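
As a quick illustration of the "any Perl structure" point above, here is a hypothetical variant of this method; the 'compression' key and its layout are invented for this sketch and are not part of this commit:

    sub pipeline_wide_parameters {
        my ($self) = @_;
        return {
            %{$self->SUPER::pipeline_wide_parameters},
            'gzip_flags'  => '',                                  # a plain scalar, as in this commit
            'compression' => { 'tool' => 'gzip', 'level' => 9 },  # a hashref works too: it is stringified
                                                                  # into the database and de-stringified
                                                                  # again for every job that reads it
        };
    }

A runnable can then read the structure back intact, e.g. my $level = $self->param('compression')->{'level'};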
=head2 pipeline_analyses
Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
Here it defines two analyses:
* 'get_files' generates a list of files whose names match the pattern o('only_files')
Each job of this analysis will dataflow (create jobs) via branch #2 into the 'zipper_unzipper' analysis.
* 'find_files' generates a list of files whose names match the pattern #only_files#
Each job of this analysis will dataflow (create jobs) via branch #2 into the 'compress_a_file' analysis.
* 'zipper_unzipper' actually performs the (un)zipping of the files in parallel
* 'compress_a_file' actually performs the (un)zipping of the files in parallel
=cut
sub pipeline_analyses {
my ($self) = @_;
return [
{ -logic_name => 'get_files',
{ -logic_name => 'find_files',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
-parameters => {
'inputcmd' => 'find '.$self->o('directory').' -type f -name "'.$self->o('only_files').'"',
'inputcmd' => 'find #directory# -type f -name #only_files#',
'column_names' => [ 'filename' ],
},
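            # Note on the two parameter styles visible in this diff: the old
            # $self->o('directory') form was interpolated once, when init_pipeline.pl
            # built the pipeline, so its value was frozen into the analysis definition;
            # the new #directory#/#only_files# form is substituted per job at runtime,
            # which is what lets seed_pipeline.pl supply different values later.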
-input_ids => [
{ }, # no need to define the template in simple cases like this
],
-flow_into => {
2 => [ 'zipper_unzipper' ], # will create a fan of jobs
# 2 => [ 'compress_a_file' ], # will create a fan of jobs
2 => { 'compress_a_file' => { 'filename' => '#filename#', 'gzip_flags' => '#gzip_flags#' }, }, # propagate 'gzip_flags' as well
},
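            # (A bare  2 => [ 'compress_a_file' ]  would pass along only the columns
            # emitted by JobFactory, i.e. 'filename'; the hashref template builds each
            # fan job's input_id explicitly, which is how a 'gzip_flags' value supplied
            # at seeding time, as in the SYNOPSIS, reaches the 'compress_a_file' jobs.)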
},
{ -logic_name => 'zipper_unzipper',
{ -logic_name => 'compress_a_file',
-module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
-parameters => {
'cmd' => 'gzip '.($self->o('unzip')?'-d ':'').'#filename#',
'cmd' => 'gzip #gzip_flags# #filename#',
},
-analysis_capacity => $self->o('zipping_capacity'), # allow several workers to perform identical tasks in parallel
-input_ids => [
# (jobs for this analysis will be flown_into via branch-2 from 'find_files' jobs above)
],
-analysis_capacity => 4, # limit the number of workers that will be performing jobs in parallel
},
];
}
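
For orientation, below is a stripped-down sketch of what a SystemCmd-style runnable does with the fully substituted 'cmd' parameter. The package name is hypothetical and the body is an approximation for illustration, not a copy of Bio::EnsEMBL::Hive::RunnableDB::SystemCmd:

    package My::MiniSystemCmd;     # hypothetical name, for illustration only
    use strict;
    use warnings;
    use base ('Bio::EnsEMBL::Hive::Process');

    sub run {
        my $self = shift;

        # by the time run() is called, '#gzip_flags#' and '#filename#' have
        # already been substituted into 'cmd' from the job's parameters:
        my $cmd = $self->param('cmd');

        system($cmd) == 0
            or die "system($cmd) failed: $?";
    }

    1;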