=pod

=head1 NAME

  Bio::EnsEMBL::Hive::PipeConfig::TableDumperZipper_conf

=head1 SYNOPSIS

    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::TableDumperZipper_conf -password <your_password> -source_dbname ncbi_taxonomy

    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::TableDumperZipper_conf -password <your_password> -source_dbname avilella_compara_homology_58 -only_tables 'protein_tree%' -with_schema 0

=head1 DESCRIPTION

    This is an example pipeline put together from basic building blocks:

    Analysis_1: JobFactory.pm is used to turn the list of tables of the given database into jobs;
        these jobs are sent down the branch #2 into the second analysis.

    Analysis_2: SystemCmd.pm is used to run these dumping+compression jobs in parallel.

=head1 CONTACT

  Please contact ehive-users@ebi.ac.uk mailing list with questions/suggestions.

=cut

package Bio::EnsEMBL::Hive::PipeConfig::TableDumperZipper_conf;

use strict;
use warnings;

use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');  # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly

=head2 default_options

    Description : Implements default_options() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options.
                  In addition to the standard things it defines four options:
                    o('with_schema')        controls whether the table definition will be dumped together with each table's data
                    o('only_tables')        defines the mysql 'LIKE' pattern to select the tables of interest
                    o('target_dir')         defines the directory where the dumped files will be deposited
                    o('dumping_capacity')   defines how many tables can be dumped and zipped in parallel
                
                  There are rules dependent on two options that do not have defaults (this makes them mandatory):
                    o('password')       your read-write password for creation and maintenance of the hive database
                                        (it is assumed to be the same as for the source database, but you can override this assumption)
                    o('source_dbname')  name of the database from which tables are to be dumped

=cut

sub default_options {
    my ($self) = @_;
    return {
        %{ $self->SUPER::default_options() },               # inherit other stuff from the base class

        'pipeline_name' => 'zip_tables',                    # name used by the beekeeper to prefix job names on the farm

            # connection parameters of the database whose tables are to be dumped:
        'source_db' => {
            -host   => 'compara2',
            -port   => 3306,
            -user   => 'ensadmin',
            -pass   => $self->o('password'),                # assumed to be the same as the hive password (override if different)
            -dbname => $self->o('source_dbname'),           # mandatory: no default, must be given on the command line
        },

        'with_schema'       => 1,                                           # include table creation statement before inserting the data
        'only_tables'       => '%',                                         # use 'protein_tree%' or 'analysis%' to only dump those tables
        'invert_selection'  => 0,                                           # use 'NOT LIKE' instead of 'LIKE'
        'target_dir'        => $self->o('ENV', 'HOME').'/'.$self->o('source_dbname'),  # where we want the compressed files to appear
        'dumping_capacity'  => 10,                                          # how many tables can be dumped in parallel
    };
}

=head2 pipeline_create_commands

    Description : Implements pipeline_create_commands() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that lists the commands that will create and set up the Hive database.
                  In addition to the standard creation of the database and populating it with Hive tables and procedures it also creates a directory for storing the output.

=cut

sub pipeline_create_commands {
    my ($self) = @_;
    return [
        @{$self->SUPER::pipeline_create_commands},  # inheriting database and hive tables' creation

        'mkdir -p '.$self->o('target_dir'),         # additionally, create the directory where the dump files will be deposited
    ];
}

=head2 pipeline_analyses

    Description : Implements pipeline_analyses() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that defines the structure of the pipeline: analyses, jobs, rules, etc.
                  Here it defines two analyses:

                    * 'get_tables'  generates a list of tables whose names match the pattern o('only_tables')
                      Each job of this analysis will dataflow (create jobs) via branch #2 into 'dumper_zipper' analysis.

                    * 'dumper_zipper'   actually does the dumping of table data (possibly with table definition) and zips the stream into an archive file.

=cut

sub pipeline_analyses {
    my ($self) = @_;
    return [
            # Analysis_1: list the matching tables of the source database and fan them out on branch #2:
        {   -logic_name => 'get_tables',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::JobFactory',
            -parameters => {
                'db_conn'    => $self->o('source_db'),
                    # select table names via LIKE (or NOT LIKE when invert_selection is set) against the chosen pattern:
                'inputquery' => 'SELECT table_name FROM information_schema.tables WHERE table_schema = "'.$self->o('source_dbname').'" AND table_name '
                    .($self->o('invert_selection')?'NOT LIKE':'LIKE').' "'.$self->o('only_tables').'"',
            },
            -input_ids => [
                { },    # the template is now implicitly defined by column_names of the query
            ],
            -flow_into => {
                2 => [ 'dumper_zipper' ],   # will create a fan of jobs
            },
        },

            # Analysis_2: dump each table (with or without schema) and pipe the stream through gzip:
        {   -logic_name    => 'dumper_zipper',
            -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters    => {
                'target_dir' => $self->o('target_dir'),
                    # '-t' makes mysqldump skip the CREATE TABLE statement when with_schema is off:
                'cmd'        => 'mysqldump '.$self->dbconn_2_mysql('source_db', 0).' '.$self->o('source_db','-dbname').($self->o('with_schema')?'':' -t').' #table_name# | gzip >#target_dir#/#table_name#.sql.gz',
            },
            -hive_capacity => $self->o('dumping_capacity'),       # allow several workers to perform identical tasks in parallel
            -input_ids     => [
                # (jobs for this analysis will be flown_into via branch-2 from 'get_tables' jobs above)
            ],
        },
    ];
}

1;