=pod

=head1 NAME

    Bio::EnsEMBL::Hive::PipeConfig::CompressFiles_conf

=head1 SYNOPSIS

    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::CompressFiles_conf -password <your_password>

    seed_pipeline.pl -url <url> -logic_name find_files -input_id "{ 'directory' => 'dumps' }"

=head1 DESCRIPTION

    This is an example pipeline put together from two basic building blocks:

    Analysis_1: JobFactory.pm is used to turn the list of files in a given directory into jobs

        these jobs are sent down the branch #2 into the second analysis

    Analysis_2: SystemCmd.pm is used to run these compression (gzip) jobs in parallel.

=head1 LICENSE

    Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
    Copyright [2016-2020] EMBL-European Bioinformatics Institute

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License
    is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and limitations under the License.

=head1 CONTACT

    Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates

=cut


package Bio::EnsEMBL::Hive::PipeConfig::CompressFiles_conf;

use strict;
use warnings;

use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');  # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly

=head2 pipeline_analyses

    Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
                  Here it defines two analyses:

                    * 'find_files'          generates a list of all regular files found under #directory# (using 'find #directory# -type f').
                                            Each job of this analysis will dataflow (create jobs) via branch #2 into 'compress_a_file' analysis.

                    * 'compress_a_file'     actually performs the gzipping of the files in parallel

=cut

# Returns the pipeline structure as an arrayref of analysis definitions (hashrefs).
#
# Two analyses are defined:
#   * 'find_files'      - a JobFactory analysis configured with the shell command
#                         'find #directory# -type f'; its output is labelled with the
#                         single column name 'filename' and flows via branch #2 into
#                         'compress_a_file'.
#   * 'compress_a_file' - a SystemCmd analysis configured with 'gzip #filename#'.
#
# The $self argument is accepted for interface compatibility but is not used here.
#
# NOTE(review): #directory# and #filename# are placed into the shell commands unquoted;
# paths containing whitespace or shell metacharacters would presumably break the
# commands - confirm against how JobFactory/SystemCmd execute them before quoting.
sub pipeline_analyses {
    my ($self) = @_;

    return [
        {   -logic_name => 'find_files',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                'inputcmd'     => 'find #directory# -type f',   # lists every regular file under #directory#
                'column_names' => [ 'filename' ],               # name under which each result is passed on
            },
            -flow_into => {
                2 => [ 'compress_a_file' ],     # will create a fan of jobs
            },
        },

        {   -logic_name    => 'compress_a_file',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters    => {
                'cmd'       => 'gzip #filename#',
            },
            -analysis_capacity => 4,            # limit the number of workers that will be performing jobs in parallel
        },
    ];
}

1;