Commit 5cb1771d authored by Leo Gordon's avatar Leo Gordon
Browse files

a Bio::Seq example factory Runnable and a matching PipeConfig file

parent f0edaecf
=pod
=head1 NAME
Bio::EnsEMBL::Hive::PipeConfig::FastaSplitter_conf
=head1 SYNOPSIS
init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::FastaSplitter_conf -inputfile reference.fasta -chunks_dir reference_chunks
=cut
package Bio::EnsEMBL::Hive::PipeConfig::FastaSplitter_conf;
use strict;
use warnings;
use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf'); # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
=head2 default_options

    Description : Returns the hashref of defaults for this pipeline,
                  starting from the base-class defaults and overlaying
                  the splitter-specific ones. Any of these can be
                  redefined on the init_pipeline.pl command line.

=cut

sub default_options {
    my ($self) = @_;

    # start from whatever the base class provides:
    my %defaults = %{ $self->SUPER::default_options() };

    # name used by the beekeeper to prefix job names on the farm:
    $defaults{'pipeline_name'}    = 'split_fasta';

    # runnable-specific parameters' defaults:
    $defaults{'max_chunk_length'} = 500000;
    $defaults{'output_prefix'}    = 'chunk_number_';
    $defaults{'output_suffix'}    = '.fasta';
    $defaults{'chunks_dir'}       = 'fasta_split_chunks';

    return \%defaults;
}
=head2 pipeline_create_commands

    Description : Returns the list of shell commands run once at pipeline
                  creation time: the inherited database/hive table setup,
                  plus creation of the directory that will hold the chunks.

=cut

sub pipeline_create_commands {
    my ($self) = @_;

    # inherit database and hive tables' creation from the base class:
    my @create_commands = @{ $self->SUPER::pipeline_create_commands };

    # make sure the chunk output directory exists:
    push @create_commands, 'mkdir -p ' . $self->o('chunks_dir');

    return \@create_commands;
}
=head2 pipeline_analyses

    Description : Defines the two analyses of this pipeline:
                  'split_fasta' (the factory that chunks the input file
                  and fans out one job per chunk on branch #2) and
                  'align' (a Dummy placeholder for per-chunk work).

=cut

sub pipeline_analyses {
    my ($self) = @_;

    # the single seed job for the factory, assembled from pipeline-wide options:
    my $seed_job = {
        'inputfile'        => $self->o('inputfile'),
        'max_chunk_length' => $self->o('max_chunk_length'),
        'output_prefix'    => $self->o('chunks_dir') . '/' . $self->o('output_prefix'),
        'output_suffix'    => $self->o('output_suffix'),
    };

    return [
        {   -logic_name => 'split_fasta',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::FastaFactory',
            -input_ids  => [ $seed_job ],
            -flow_into  => {
                2 => [ 'align' ],   # branch #2 creates a fan of jobs, one per chunk
            },
        },

        {   -logic_name => 'align',   # placeholder for whatever is done to each chunk
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy',
        },
    ];
}
1;
=pod
=head1 NAME
Bio::EnsEMBL::Hive::RunnableDB::FastaFactory
=head1 SYNOPSIS
standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FastaFactory --inputfile reference.fasta --max_chunk_length 600000
standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FastaFactory \
--inputfile reference.fasta \
--max_chunk_length 700000 \
--output_prefix ref_chunk \
--flow_into "{ 2 => ['mysql://ensadmin:${ENSADMIN_PSW}@127.0.0.1/lg4_split_fasta/analysis?logic_name=blast']}"
=head1 DESCRIPTION
This is a Bioinformatics-specific "Factory" Runnable that splits a given Fasta file into smaller chunks
and dataflows one job per chunk.
The following parameters are supported:
param('inputfile'); # The original Fasta file: 'inputfile' => 'my_sequences.fasta'
param('max_chunk_length'); # Maximum total length of sequences in a chunk: 'max_chunk_length' => '200000'
param('output_prefix'); # A common prefix for output files: 'output_prefix' => 'my_special_chunk_'
param('output_suffix'); # A common suffix for output files: 'output_suffix' => '.nt'
=head1 CONTACT
Please contact the ehive-users@ebi.ac.uk mailing list with questions/suggestions.
=cut
package Bio::EnsEMBL::Hive::RunnableDB::FastaFactory;
use strict;
use warnings;

use base ('Bio::EnsEMBL::Hive::Process');

use Bio::SeqIO;
=head2 param_defaults
Description : Implements param_defaults() interface method of Bio::EnsEMBL::Hive::Process that defines module defaults for parameters.
=cut
sub param_defaults {
    # Module-level defaults; each can be overridden by analysis
    # parameters, job parameters or the standaloneJob.pl command line.
    my %defaults = (
        'max_chunk_length' => 100000,        # max total residues per chunk file
        'output_prefix'    => 'my_chunk_',   # common prefix of the chunk file names
        'output_suffix'    => '.fasta',      # common suffix of the chunk file names
    );

    return \%defaults;
}
=head2 fetch_input
Description : Implements fetch_input() interface method of Bio::EnsEMBL::Hive::Process that is used to read in parameters and load data.
Here we only check for the existence of the 'inputfile' parameter and try to parse it (all other parameters have defaults).
=cut
sub fetch_input {
    my ($self) = @_;

    # 'inputfile' has no default, so insist on it being set:
    my $inputfile = $self->param('inputfile')
        || die "'inputfile' is an obligatory parameter";

    # open the Fasta file for reading (format auto-detected by Bio::SeqIO);
    # NOTE(review): Bio::SeqIO->new() normally throws on failure, so the
    # die below is a belt-and-braces safeguard rather than the main check:
    my $input_seqio = Bio::SeqIO->new( -file => '<' . $inputfile )
        || die "Could not open or parse '$inputfile', please investigate";

    # stash the open stream for write_output() to consume:
    $self->param('input_seqio', $input_seqio);
}
=head2 run
Description : Implements run() interface method of Bio::EnsEMBL::Hive::Process that is used to perform the main bulk of the job (minus input and output).
Because we want to stream the data more efficiently, all functionality is in write_output();
=cut
sub run {   # intentionally empty: to stream the data efficiently, all the chunking work happens in write_output()
}
=head2 write_output
Description : Implements write_output() interface method of Bio::EnsEMBL::Hive::Process that is used to deal with job's output after the execution.
The main bulk of this Runnable's functionality is here.
Iterates through all sequences in input_seqio, splits them into separate files ("chunks") using a cut-off length and dataflows one job per chunk.
=cut
sub write_output {
    my ($self) = @_;

    my $input_seqio      = $self->param('input_seqio');
    my $max_chunk_length = $self->param('max_chunk_length');
    my $output_prefix    = $self->param('output_prefix');
    my $output_suffix    = $self->param('output_suffix');

    my $chunk_number = 1;   # counts the chunks
    my $chunk_length = 0;   # total length of the current chunk
    my $chunk_size   = 0;   # number of sequences in the current chunk
    my $chunk_name   = $output_prefix.$chunk_number.$output_suffix;
    my $chunk_seqio  = Bio::SeqIO->new(-file => '>'.$chunk_name, -format => 'fasta');

    while (my $seq_object = $input_seqio->next_seq) {

        my $seq_length = $seq_object->length();

        # If adding this sequence would overflow a non-empty chunk,
        # dataflow the current chunk and start a new one.
        # (An empty chunk is never flowed, so a single sequence longer
        # than max_chunk_length still gets a chunk of its own.)
        if( $chunk_size && ($chunk_length + $seq_length > $max_chunk_length) ) {

            $self->dataflow_output_id( {
                'chunk_name'   => $chunk_name,
                'chunk_number' => $chunk_number,
                'chunk_length' => $chunk_length,
                'chunk_size'   => $chunk_size,
            }, 2);

            # start writing to the next chunk:
            $chunk_length = 0;
            $chunk_size   = 0;
            $chunk_number++;
            $chunk_name   = $output_prefix.$chunk_number.$output_suffix;
            $chunk_seqio  = Bio::SeqIO->new(-file => '>'.$chunk_name, -format => 'fasta');
        }

        # BUGFIX: always write the sequence. The original only wrote
        # sequences that fit into the current chunk; every sequence that
        # triggered a chunk boundary was silently dropped from the output.
        $chunk_seqio->write_seq( $seq_object );
        $chunk_length += $seq_length;
        $chunk_size   += 1;
    }

    if($chunk_size) {   # flush the last, partially-filled chunk:
        $self->dataflow_output_id( {
            'chunk_name'   => $chunk_name,
            'chunk_number' => $chunk_number,
            'chunk_length' => $chunk_length,
            'chunk_size'   => $chunk_size,
        }, 2);
    }
}
1;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment