Commit 5cb1771d authored by Leo Gordon's avatar Leo Gordon
Browse files

a Bio::Seq example factory Runnable and a matching PipeConfig file

parent f0edaecf
=pod
=head1 NAME
Bio::EnsEMBL::Hive::PipeConfig::FastaSplitter_conf
=head1 SYNOPSIS
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::FastaSplitter_conf -inputfile reference.fasta -chunks_dir reference_chunks
=cut
package Bio::EnsEMBL::Hive::PipeConfig::FastaSplitter_conf;
use strict;
use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
=head2 default_options

    Description : Returns the hashref of defaults for this pipeline,
                  starting from the base-class defaults and overlaying
                  the splitter-specific ones. Any of these can be
                  redefined on the init_pipeline.pl command line.

=cut

sub default_options {
    my ($self) = @_;

    # start from whatever the base class provides:
    my %defaults = %{ $self->SUPER::default_options() };

    # name used by the beekeeper to prefix job names on the farm:
    $defaults{'pipeline_name'}    = 'split_fasta';

    # runnable-specific parameters' defaults:
    $defaults{'max_chunk_length'} = 500000;
    $defaults{'output_prefix'}    = 'chunk_number_';
    $defaults{'output_suffix'}    = '.fasta';
    $defaults{'chunks_dir'}       = 'fasta_split_chunks';

    return \%defaults;
}
=head2 pipeline_create_commands

    Description : Returns the list of shell commands run once at pipeline
                  creation time: the inherited database/hive table setup,
                  plus creation of the directory that will hold the chunks.

=cut

sub pipeline_create_commands {
    my ($self) = @_;

    # inherit database and hive tables' creation from the base class:
    my @create_commands = @{ $self->SUPER::pipeline_create_commands };

    # make sure the chunk output directory exists:
    push @create_commands, 'mkdir -p ' . $self->o('chunks_dir');

    return \@create_commands;
}
=head2 pipeline_analyses

    Description : Defines the two analyses of this pipeline:
                  'split_fasta' (the factory that chunks the input file
                  and fans out one job per chunk on branch #2) and
                  'align' (a Dummy placeholder for per-chunk work).

=cut

sub pipeline_analyses {
    my ($self) = @_;

    # the single seed job for the factory, assembled from pipeline-wide options:
    my $seed_job = {
        'inputfile'        => $self->o('inputfile'),
        'max_chunk_length' => $self->o('max_chunk_length'),
        'output_prefix'    => $self->o('chunks_dir') . '/' . $self->o('output_prefix'),
        'output_suffix'    => $self->o('output_suffix'),
    };

    return [
        {   -logic_name => 'split_fasta',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::FastaFactory',
            -input_ids  => [ $seed_job ],
            -flow_into  => {
                2 => [ 'align' ],   # branch #2 creates a fan of jobs, one per chunk
            },
        },

        {   -logic_name => 'align',   # placeholder for whatever is done to each chunk
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy',
        },
    ];
}
1;
=pod
=head1 NAME
Bio::EnsEMBL::Hive::RunnableDB::FastaFactory
=head1 SYNOPSIS
standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FastaFactory --inputfile reference.fasta --max_chunk_length 600000
standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FastaFactory \
--inputfile reference.fasta \
--max_chunk_length 700000 \
--output_prefix ref_chunk \
--flow_into "{ 2 => ['mysql://ensadmin:${ENSADMIN_PSW}@127.0.0.1/lg4_split_fasta/analysis?logic_name=blast']}"
=head1 DESCRIPTION
This is a Bioinformatics-specific "Factory" Runnable that splits a given Fasta file into smaller chunks
and dataflows one job per chunk.
The following parameters are supported:
param('inputfile'); # The original Fasta file: 'inputfile' => 'my_sequences.fasta'
param('max_chunk_length'); # Maximum total length of sequences in a chunk: 'max_chunk_length' => '200000'
param('output_prefix'); # A common prefix for output files: 'output_prefix' => 'my_special_chunk_'
param('output_suffix'); # A common suffix for output files: 'output_suffix' => '.nt'
=head1 CONTACT
Please contact the ehive-users@ebi.ac.uk mailing list with questions/suggestions.
=cut
package Bio::EnsEMBL::Hive::RunnableDB::FastaFactory;
use strict;
use warnings;

use base ('Bio::EnsEMBL::Hive::Process');

use Bio::SeqIO;
=head2 param_defaults
Description : Implements param_defaults() interface method of Bio::EnsEMBL::Hive::Process that defines module defaults for parameters.
=cut
sub param_defaults {
    # Module-level defaults; each can be overridden by analysis
    # parameters, job parameters or the standaloneJob.pl command line.
    my %defaults = (
        'max_chunk_length' => 100000,        # max total residues per chunk file
        'output_prefix'    => 'my_chunk_',   # common prefix of the chunk file names
        'output_suffix'    => '.fasta',      # common suffix of the chunk file names
    );

    return \%defaults;
}
=head2 fetch_input
Description : Implements fetch_input() interface method of Bio::EnsEMBL::Hive::Process that is used to read in parameters and load data.
Here we only check for the existence of the 'inputfile' parameter and try to parse it (all other parameters have defaults).
=cut
sub fetch_input {
    my ($self) = @_;

    # 'inputfile' has no default, so insist on it being set:
    my $inputfile = $self->param('inputfile')
        || die "'inputfile' is an obligatory parameter";

    # open the Fasta file for reading (format auto-detected by Bio::SeqIO);
    # NOTE(review): Bio::SeqIO->new() normally throws on failure, so the
    # die below is a belt-and-braces safeguard rather than the main check:
    my $input_seqio = Bio::SeqIO->new( -file => '<' . $inputfile )
        || die "Could not open or parse '$inputfile', please investigate";

    # stash the open stream for write_output() to consume:
    $self->param('input_seqio', $input_seqio);
}
=head2 run
Description : Implements run() interface method of Bio::EnsEMBL::Hive::Process that is used to perform the main bulk of the job (minus input and output).
Because we want to stream the data more efficiently, all functionality is in write_output();
=cut
sub run {   # intentionally empty: to stream the data efficiently, all the chunking work happens in write_output()
}
=head2 write_output
Description : Implements write_output() interface method of Bio::EnsEMBL::Hive::Process that is used to deal with job's output after the execution.
The main bulk of this Runnable's functionality is here.
Iterates through all sequences in input_seqio, splits them into separate files ("chunks") using a cut-off length and dataflows one job per chunk.
=cut
sub write_output {
    my ($self) = @_;

    my $input_seqio      = $self->param('input_seqio');
    my $max_chunk_length = $self->param('max_chunk_length');
    my $output_prefix    = $self->param('output_prefix');
    my $output_suffix    = $self->param('output_suffix');

    my $chunk_number = 1;   # counts the chunks
    my $chunk_length = 0;   # total length of the current chunk
    my $chunk_size   = 0;   # number of sequences in the current chunk
    my $chunk_name   = $output_prefix.$chunk_number.$output_suffix;
    my $chunk_seqio  = Bio::SeqIO->new(-file => '>'.$chunk_name, -format => 'fasta');

    while (my $seq_object = $input_seqio->next_seq) {

        my $seq_length = $seq_object->length();

        # If adding this sequence would overflow a non-empty chunk,
        # dataflow the current chunk and start a new one.
        # (An empty chunk is never flowed, so a single sequence longer
        # than max_chunk_length still gets a chunk of its own.)
        if( $chunk_size && ($chunk_length + $seq_length > $max_chunk_length) ) {

            $self->dataflow_output_id( {
                'chunk_name'   => $chunk_name,
                'chunk_number' => $chunk_number,
                'chunk_length' => $chunk_length,
                'chunk_size'   => $chunk_size,
            }, 2);

            # start writing to the next chunk:
            $chunk_length = 0;
            $chunk_size   = 0;
            $chunk_number++;
            $chunk_name   = $output_prefix.$chunk_number.$output_suffix;
            $chunk_seqio  = Bio::SeqIO->new(-file => '>'.$chunk_name, -format => 'fasta');
        }

        # BUGFIX: always write the sequence. The original only wrote
        # sequences that fit into the current chunk; every sequence that
        # triggered a chunk boundary was silently dropped from the output.
        $chunk_seqio->write_seq( $seq_object );
        $chunk_length += $seq_length;
        $chunk_size   += 1;
    }

    if($chunk_size) {   # flush the last, partially-filled chunk:
        $self->dataflow_output_id( {
            'chunk_name'   => $chunk_name,
            'chunk_number' => $chunk_number,
            'chunk_length' => $chunk_length,
            'chunk_size'   => $chunk_size,
        }, 2);
    }
}
1;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment