use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');  # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
sub default_options {
    my ($self) = @_;
    return {
        %{ $self->SUPER::default_options() },       # inherit other stuff from the base class

        'pipeline_name'    => 'split_fasta',        # name used by the beekeeper to prefix job names on the farm

            # runnable-specific parameters' defaults:
        'max_chunk_length' => 500000,
        'output_prefix'    => 'chunk_number_',
        'output_suffix'    => '.fasta',
        'chunks_dir'       => 'fasta_split_chunks',
    };
}
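# Sketch (not part of the original excerpt): this is how a pipeline_analyses() section typically
# consumes the defaults above via $self->o(). The logic_name, the module name and the idea of
# routing chunks into chunks_dir by prefixing output_prefix are placeholders/assumptions here,
# not taken from the original text:
sub pipeline_analyses {
    my ($self) = @_;
    return [
        {   -logic_name => 'split_fasta',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::FastaFactory',      # placeholder module name
            -parameters => {
                'max_chunk_length' => $self->o('max_chunk_length'),
                'output_prefix'    => $self->o('chunks_dir').'/'.$self->o('output_prefix'),
                'output_suffix'    => $self->o('output_suffix'),
            },
        },
    ];
}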
sub pipeline_create_commands {
    my ($self) = @_;
    return [
        @{ $self->SUPER::pipeline_create_commands },    # inheriting database and hive tables' creation
=head2 param_defaults

    Description : Implements param_defaults() interface method of Bio::EnsEMBL::Hive::Process that defines module defaults for parameters.

=cut
sub param_defaults {
    return {
        'max_chunk_length' => 100000,
        'output_prefix'    => 'my_chunk_',
        'output_suffix'    => '.fasta',
    };
}
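# Note (not part of the original excerpt): these module-level defaults are the lowest-priority
# source of parameter values; the same keys can be overridden by the analysis '-parameters'
# hash in the PipeConfig (which is why the pipeline above ends up using 500000 and
# 'chunk_number_') or by an individual job's input_id.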
=head2 fetch_input

    Description : Implements fetch_input() interface method of Bio::EnsEMBL::Hive::Process that is used to read in parameters and load data.
                  Here we only check the existence of the 'inputfile' parameter and try to parse it (all other parameters have defaults).

=cut
sub fetch_input {
    my $self = shift @_;

    my $inputfile   = $self->param('inputfile') || die "'inputfile' is an obligatory parameter";
    my $input_seqio = Bio::SeqIO->new( -file => '<'.$inputfile ) || die "Could not open or parse '$inputfile', please investigate";

    $self->param('input_seqio', $input_seqio);
}
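# Note (not part of the original excerpt): fetch_input() above relies on BioPerl's Bio::SeqIO,
# so the full module needs a corresponding "use Bio::SeqIO;" near its top.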
=head2 run

    Description : Implements run() interface method of Bio::EnsEMBL::Hive::Process that is used to perform the main bulk of the job (minus input and output).
                  Because we want to stream the data more efficiently, all functionality is in write_output().

=cut

sub run {
}
=head2 write_output

    Description : Implements write_output() interface method of Bio::EnsEMBL::Hive::Process that is used to deal with the job's output after the execution.
                  The main bulk of this Runnable's functionality is here.
                  Iterates through all sequences in input_seqio, splits them into separate files ("chunks") using a cut-off length and dataflows one job per chunk.

=cut
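# The body of write_output() is not shown in this excerpt. Below is a minimal sketch of the
# behaviour described above, assuming BioPerl's Bio::SeqIO for writing chunks and eHive's
# dataflow_output_id() on branch #2; the 'chunk_name' parameter key is a placeholder name,
# not taken from the original text:
sub write_output {
    my $self = shift @_;

    my $input_seqio      = $self->param('input_seqio');
    my $max_chunk_length = $self->param('max_chunk_length');
    my $output_prefix    = $self->param('output_prefix');
    my $output_suffix    = $self->param('output_suffix');

    my $chunk_number = 1;
    my $chunk_length = 0;
    my $chunk_name   = $output_prefix.$chunk_number.$output_suffix;
    my $chunk_seqio  = Bio::SeqIO->new( -file => '>'.$chunk_name, -format => 'fasta' );

    while (my $seq = $input_seqio->next_seq()) {
        $chunk_seqio->write_seq($seq);
        $chunk_length += $seq->length();

        if ($chunk_length > $max_chunk_length) {    # current chunk is full: flow one job for it and start a new chunk
            $self->dataflow_output_id( { 'chunk_name' => $chunk_name }, 2 );

            $chunk_number++;
            $chunk_length = 0;
            $chunk_name   = $output_prefix.$chunk_number.$output_suffix;
            $chunk_seqio  = Bio::SeqIO->new( -file => '>'.$chunk_name, -format => 'fasta' );
        }
    }

    $self->dataflow_output_id( { 'chunk_name' => $chunk_name }, 2 ) if $chunk_length;  # flow the last, partially filled chunk
}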