=pod 

=head1 NAME

    Bio::EnsEMBL::Hive::RunnableDB::FastaFactory

=head1 SYNOPSIS

    standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FastaFactory --inputfile reference.fasta --max_chunk_length 600000

    standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FastaFactory \
                    --inputfile reference.fasta \
                    --max_chunk_length 700000 \
                    --output_prefix ref_chunk \
                    --flow_into "{ 2 => ['mysql://ensadmin:${ENSADMIN_PSW}@127.0.0.1/lg4_split_fasta/analysis?logic_name=blast']}"
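
    The input file may also be gzip-compressed, and the chunk files can be spread over "hashed" directories;
    an illustrative invocation combining both options (parameter values are examples only):

    standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FastaFactory \
                    --inputfile reference.fasta.gz \
                    --max_chunk_length 600000 \
                    --hash_directories 1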

=head1 DESCRIPTION

    This is a Bioinformatics-specific "Factory" Runnable that splits a given Fasta file into smaller chunks
    and dataflows one job per chunk. Note that:
        - the chunk files are created in the current directory;
        - the input file may be gzip-compressed ('.gz' or '.Z') and is decompressed on the fly;
        - the Runnable does not split individual sequences, it only groups them, so no output file exceeds
          param('max_chunk_length') unless a single sequence is by itself longer than that limit.

    The following parameters are supported:

        param('inputfile');         # The original Fasta file: 'inputfile' => 'my_sequences.fasta'

        param('max_chunk_length');  # Maximum total length of sequences in a chunk: 'max_chunk_length' => '200000'

        param('output_prefix');     # A common prefix for output files: 'output_prefix' => 'my_special_chunk_'

        param('output_suffix');     # A common suffix for output files: 'output_suffix' => '.nt'

        param('hash_directories');  # Boolean (defaults to 0): should the output files be put into different ("hashed") directories
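
    When used inside a pipeline rather than standalone, an analysis along the following lines could be added to a
    PipeConfig's pipeline_analyses() list (a sketch only: the parameter values are illustrative and the 'blast'
    target analysis is a placeholder):

        {   -logic_name => 'split_fasta',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::FastaFactory',
            -parameters => {
                'inputfile'         => 'reference.fasta',
                'max_chunk_length'  => 600000,
            },
            -flow_into  => {
                2 => [ 'blast' ],   # one job per chunk, carrying chunk_name, chunk_number, chunk_length and chunk_size
            },
        },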

=head1 LICENSE

    Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
    Copyright [2016-2021] EMBL-European Bioinformatics Institute

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License
    is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and limitations under the License.

=head1 CONTACT

    Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates

=cut


package Bio::EnsEMBL::Hive::RunnableDB::FastaFactory;

use strict;
use warnings;

use Bio::SeqIO;
use File::Path;

use Bio::EnsEMBL::Hive::Utils ('dir_revhash');

use base ('Bio::EnsEMBL::Hive::Process');


=head2 param_defaults

    Description : Implements param_defaults() interface method of Bio::EnsEMBL::Hive::Process that defines module defaults for parameters.

=cut

sub param_defaults {

    return {
        'max_chunk_length'  => 100000,
        'output_prefix'     => 'my_chunk_',
        'output_suffix'     => '.fasta',
        'hash_directories'  => 0,
    };
}


=head2 fetch_input

    Description : Implements fetch_input() interface method of Bio::EnsEMBL::Hive::Process that is used to read in parameters and load data.
                    Here we only check the existence of the 'inputfile' parameter and try to open it (all other parameters have defaults).

=cut

sub fetch_input {
    my $self = shift @_;

    my $inputfile   = $self->param_required('inputfile');
    die "Cannot read '$inputfile'" unless(-r $inputfile);

        # transparently decompress gzip-compressed input by reading it through a 'gunzip -c' pipe:
    if($inputfile=~/\.(?:gz|Z)$/) {
        $inputfile = "gunzip -c $inputfile |";
    }
    my $input_seqio = Bio::SeqIO->new(-file => $inputfile)  || die "Could not open or parse '$inputfile', please investigate";

    $self->param('input_seqio', $input_seqio);
}


=head2 run

    Description : Implements run() interface method of Bio::EnsEMBL::Hive::Process that is used to perform the main bulk of the job (minus input and output).
                    Because we want to stream the data more efficiently, all of the functionality is in write_output().

=cut

sub run {
}


=head2 write_output

    Description : Implements write_output() interface method of Bio::EnsEMBL::Hive::Process that is used to deal with job's output after the execution.
                    The main bulk of this Runnable's functionality is here.
                    Iterates through all sequences in input_seqio, splits them into separate files ("chunks") using a cut-off length and dataflows one job per chunk.
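
                    Each chunk job dataflown on branch #2 carries a parameter hash of the following shape (the values shown are illustrative):

                        {
                            'chunk_name'    => 'my_chunk_1.fasta',
                            'chunk_number'  => 1,
                            'chunk_length'  => 100523,
                            'chunk_size'    => 42,
                        }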

=cut

sub write_output {
    my $self = shift @_;

    my $input_seqio         = $self->param('input_seqio');
    my $max_chunk_length    = $self->param('max_chunk_length');
    my $output_prefix       = $self->param('output_prefix');
    my $output_suffix       = $self->param('output_suffix');

    my $chunk_number = 1;   # counts the chunks
    my $chunk_length = 0;   # total length of the current chunk
    my $chunk_size   = 0;   # number of sequences in the current chunk
    my $chunk_name   = $output_prefix.$chunk_number.$output_suffix;
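
        # optionally place the chunk file in a "hashed" subdirectory (built with dir_revhash),
        # so that a very large number of chunks does not accumulate in a single flat directory: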
    if ($self->param('hash_directories')) {
        my $dir_tree = dir_revhash($chunk_number);
        if ($dir_tree ne '') {
            mkpath($dir_tree);
            $chunk_name = $dir_tree.'/'.$chunk_name;
        }
    }
        # open the first chunk file for writing in Fasta format:
    my $chunk_seqio  = Bio::SeqIO->new(-file => '>'.$chunk_name, -format => 'fasta');
    
    while (my $seq_object = $input_seqio->next_seq) {
        $chunk_seqio->write_seq( $seq_object );
        $chunk_length += $seq_object->length();
        $chunk_size   += 1;
        if ($chunk_length > $max_chunk_length) {

                # dataflow the current chunk:
            $self->dataflow_output_id( {
                'chunk_name' => $chunk_name,
                'chunk_number' => $chunk_number,
                'chunk_length' => $chunk_length,
                'chunk_size' => $chunk_size
            }, 2);

                # start writing to the next one:
            $chunk_length   = 0;
            $chunk_size     = 0;
            $chunk_number++;
            $chunk_name     = $output_prefix.$chunk_number.$output_suffix;
            if ($self->param('hash_directories')) {
                my $dir_tree = dir_revhash($chunk_number);
                if ($dir_tree ne '') {
                    mkpath($dir_tree);
                    $chunk_name = $dir_tree.'/'.$chunk_name;
                }
            }
            $chunk_seqio    = Bio::SeqIO->new(-file => '>'.$chunk_name, -format => 'fasta');
        }
    }

    if($chunk_size) {   # flush the last chunk:

        $self->dataflow_output_id( {
            'chunk_name' => $chunk_name,
            'chunk_number' => $chunk_number,
            'chunk_length' => $chunk_length,
            'chunk_size' => $chunk_size
        }, 2);
    } else {
            # nothing was written into the last chunk file, so remove it if it is empty:
        unlink $chunk_name unless (stat($chunk_name))[7];
    }
}

1;