=pod 

=head1 NAME

    Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf

=head1 SYNOPSIS

        # Example 1: specifying only the mandatory option:
    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf -password <mypass>

        # Example 2: specifying the mandatory options as well as overriding some defaults:
    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf -host <myhost> -dbname <mydbname> -password <mypass>

=head1 DESCRIPTION

    Generic configuration module for all Hive pipelines with loader functionality.
    All other Hive PipeConfig modules should inherit from this module and will probably need to redefine some or all of the following interface methods:

        * default_options:                  returns a hash of (possibly multilevel) defaults for the options on which the rest of the configuration depends

        * pipeline_create_commands:         returns a list of strings that will be executed as system commands needed to create and set up the pipeline database

        * pipeline_wide_parameters:         returns a hash of pipeline-wide parameter names and their values

        * resource_classes:                 returns a hash of resource class definitions

        * pipeline_analyses:                returns a list of hash structures that define analysis objects bundled with definitions of corresponding jobs, rules and resources

        * beekeeper_extra_cmdline_options:  returns a string with command line options that you want to be passed to beekeeper.pl

    When defining anything except the keys of default_options(), a call to $self->o('myoption') can be used.
    This call means "substitute this call with the value of 'myoption' at the time the pipeline is configured".
    All option names mentioned in $self->o() calls within the interface methods above can be given non-default values from the command line.
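
    For example, a derived PipeConfig module would typically override default_options() like this
    (a minimal sketch; the option names 'my_pipeline' and 'output_dir' are purely illustrative):

        sub default_options {
            my ($self) = @_;
            return {
                %{ $self->SUPER::default_options() },                   # inherit the generic defaults
                'pipeline_name' => 'my_pipeline',                       # override an inherited default
                'output_dir'    => '/tmp/'.$self->o('pipeline_name'),   # resolved at pipeline configuration time
            };
        }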

    Please make sure you have studied the pipeline configuration examples in Bio::EnsEMBL::Hive::PipeConfig before creating your own PipeConfig modules.

=head1 LICENSE

    Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
    Copyright [2016-2018] EMBL-European Bioinformatics Institute

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License
    is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and limitations under the License.

=head1 CONTACT

    Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates

=cut


package Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf;

use strict;
use warnings;

use Bio::EnsEMBL::ApiVersion ();

use Bio::EnsEMBL::Hive::Utils::URL;
use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive::DBSQL::SqlSchemaAdaptor;
use Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor;
use Bio::EnsEMBL::Hive::Analysis;
use Bio::EnsEMBL::Hive::AnalysisStats;
use Bio::EnsEMBL::Hive::AnalysisJob;
use Bio::EnsEMBL::Hive::Extensions;
use Bio::EnsEMBL::Hive::Valley;

use base ('Bio::EnsEMBL::Hive::DependentOptions');


# ---------------------------[the following methods will be overridden by specific pipelines]-------------------------


=head2 default_options

    Description : Interface method that should return a hash of option_name->default_option_value pairs.
                  Please see existing PipeConfig modules for examples.

=cut

sub default_options {
    my ($self) = @_;
    return {
            # Please note: ENVironment variables may be "exported" to inherit from enclosing shell,
            # but if you want to *prevent* that you need to specifically say so
            #  (setting a password to empty string does exactly that - sets it to an empty string)
            #
            #   [bash]      export -n ENSEMBL_CVS_ROOT_DIR  # will stop exporting, but the value in current shell stays as it was
            #   [tcsh]      unsetenv ENSEMBL_CVS_ROOT_DIR   # will destroy the variable even in current shell, and stop exporting

        'ensembl_cvs_root_dir'  => $ENV{'ENSEMBL_CVS_ROOT_DIR'} || $self->o('ensembl_cvs_root_dir'),    # it will make sense to set this variable if you are going to use ehive with ensembl
        'ensembl_release'       => Bio::EnsEMBL::ApiVersion::software_version(),                        # snapshot of EnsEMBL Core API version. Please do not change if not sure.

        'hive_root_dir'         => $ENV{'EHIVE_ROOT_DIR'}                                               # this value is set up automatically if this code is run by init_pipeline.pl
                                    || $self->o('ensembl_cvs_root_dir').'/ensembl-hive',                # otherwise we have to rely on other means

        'hive_driver'           => 'mysql',
        'host'                  => $ENV{'EHIVE_HOST'} || 'localhost',                                   # BEWARE that 'localhost' for the mysql driver usually means a UNIX socket, not a TCP/IP socket!
                                                                                                        # If you need to connect via a TCP/IP socket, set -host => '127.0.0.1' instead.

        'port'                  => $ENV{'EHIVE_PORT'},                                                  # or remain undef, which means default for the driver
        'user'                  => $ENV{'EHIVE_USER'} || 'ensadmin',
        'password'              => $ENV{'EHIVE_PASS'} // $ENV{'ENSADMIN_PSW'} // $self->o('password'),  # people will have to make an effort NOT to insert it into config files like .bashrc etc
        'dbowner'               => $ENV{'EHIVE_USER'} || $ENV{'USER'}         || $self->o('dbowner'),   # although it is very unlikely $ENV{USER} is not set
        'pipeline_name'         => $self->pipeline_name(),

        'hive_use_triggers'     => 0,                   # there have been a few cases of big pipelines misbehaving with triggers on, let's keep the default off.
        'hive_use_param_stack'  => 0,                   # do not reconstruct the calling stack of parameters by default (yet)
        'hive_force_init'       => 0,                   # setting it to 1 will drop the database prior to creation (use with care!)

        'pipeline_db'   => {
            -driver => $self->o('hive_driver'),
            -host   => $self->o('host'),
            -port   => $self->o('port'),
            -user   => $self->o('user'),
            -pass   => $self->o('password'),
            -dbname => $self->o('dbowner').'_'.$self->o('pipeline_name'),  # example of a linked definition (resolved via saturation)
        },
    };
}


=head2 pipeline_create_commands

    Description : Interface method that should return a list of command lines to be run in order to create and set up the pipeline database.
                  Please see existing PipeConfig modules for examples.
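
                  A derived module would typically extend the inherited list, e.g. (a minimal sketch;
                  the 'output_dir' option is illustrative and would have to be defined in your default_options()):

                      sub pipeline_create_commands {
                          my ($self) = @_;
                          return [
                              @{ $self->SUPER::pipeline_create_commands },      # create and set up the database as usual
                              'mkdir -p '.$self->o('output_dir'),               # plus create a directory for the output
                          ];
                      }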

=cut

sub pipeline_create_commands {
    my $self    = shift @_;

    my $pipeline_url    = $self->pipeline_url();
    my $parsed_url      = Bio::EnsEMBL::Hive::Utils::URL::parse( $pipeline_url );
    my $driver          = $parsed_url ? $parsed_url->{'driver'} : '';

    return [
            $self->o('hive_force_init') ? $self->db_cmd('DROP DATABASE IF EXISTS') : (),
            $self->db_cmd('CREATE DATABASE'),

                # we have table definitions for all drivers:
            $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/tables.'.$driver,

                # auto-sync'ing triggers are off by default and not yet available in pgsql:
            $self->o('hive_use_triggers') && ($driver ne 'pgsql')  ? ( $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/triggers.'.$driver ) : (),

                # FOREIGN KEY constraints cannot be defined in sqlite separately from table definitions, so they are off there:
                                             ($driver ne 'sqlite') ? ( $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/foreign_keys.sql' ) : (),

                # we have procedure definitions for all drivers:
            $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/procedures.'.$driver,
    ];
}


=head2 pipeline_wide_parameters

    Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs.
                  The value doesn't have to be a scalar; it can be any Perl structure (it will be stringified and de-stringified automagically).
                  Please see existing PipeConfig modules for examples.
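
                  A derived module would usually merge its own parameters with the inherited ones,
                  e.g. (a minimal sketch; 'take_time' is an illustrative parameter name):

                      sub pipeline_wide_parameters {
                          my ($self) = @_;
                          return {
                              %{ $self->SUPER::pipeline_wide_parameters },      # keep 'schema_version' etc.
                              'take_time' => 1,                                 # visible to every analysis of the pipeline
                          };
                      }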

=cut

sub pipeline_wide_parameters {
    my ($self) = @_;
    return {
        'schema_version' => $self->o('ensembl_release'),    # keep compatibility with core API
    };
}


=head2 resource_classes

    Description : Interface method that should return a hash of resource_class_name (or old-style numeric id) -> resource_description_hash.
                  Please see existing PipeConfig modules for examples.
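
                  Each value maps a meadow type to either a single string of submission options or a
                  2-element array of [submission_cmd_args, worker_cmd_args], e.g. (a minimal sketch;
                  the LSF flags shown are illustrative):

                      sub resource_classes {
                          my ($self) = @_;
                          return {
                              %{ $self->SUPER::resource_classes },      # inherit 'default' and 'urgent'
                              'himem' => { 'LSF' => '-M16000 -R"rusage[mem=16000]"' },
                          };
                      }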

=cut

sub resource_classes {
    my ($self) = @_;
    return {
## Old style:
#        1 => { -desc => 'default',  'LSF' => '' },
#        2 => { -desc => 'urgent',   'LSF' => '-q yesterday' },
## New style:
        'default' => { 'LSF' => '' },
        'urgent'  => { 'LSF' => '-q yesterday' },
    };
}


=head2 pipeline_analyses

    Description : Interface method that should return a list of hashes that define analyses bundled with corresponding jobs, dataflow and analysis_ctrl rules and resource_id.
                  Please see existing PipeConfig modules for examples.
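
                  For example (a minimal sketch; the logic_names, the parameters and the dataflow
                  topology are purely illustrative):

                      sub pipeline_analyses {
                          my ($self) = @_;
                          return [
                              {   -logic_name => 'first_analysis',
                                  -module     => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
                                  -input_ids  => [ { 'cmd' => 'echo hello' } ],     # seed one job at init_pipeline.pl time
                                  -flow_into  => {
                                      1 => [ 'second_analysis' ],                   # dataflow on the default branch
                                  },
                              },
                              {   -logic_name => 'second_analysis',
                                  -module     => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy',
                              },
                          ];
                      }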

=cut

sub pipeline_analyses {
    my ($self) = @_;
    return [
    ];
}


=head2 beekeeper_extra_cmdline_options

    Description : Interface method that should return a string with extra parameters that you want to be passed to beekeeper.pl
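
                  For example (a minimal sketch; the specific option is illustrative):

                      sub beekeeper_extra_cmdline_options {
                          my ($self) = @_;
                          return '-sleep 0.5';      # ask the beekeeper to perform a scheduling step every 30 seconds instead of every minute
                      }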

=cut

sub beekeeper_extra_cmdline_options {
    my ($self) = @_;

    return '';
}


# ---------------------------------[now comes the interfacing stuff - feel free to call but not to modify]--------------------


sub hive_meta_table {
    my ($self) = @_;

    return {
        'hive_sql_schema_version'   => Bio::EnsEMBL::Hive::DBSQL::SqlSchemaAdaptor->get_code_sql_schema_version(),
        'hive_pipeline_name'        => $self->o('pipeline_name'),
        'hive_use_param_stack'      => $self->o('hive_use_param_stack'),
    };
}

sub pre_options {
    my $self = shift @_;

    return {
        'help!' => '',
        'job_topup!' => '',
        'analysis_topup!' => '',
        'pipeline_url' => '',
#        'hive_use_triggers' => '',
    };
}


=head2 dbconn_2_mysql

    Description : Deprecated method. Please use $self->db_cmd() instead.

=cut

sub dbconn_2_mysql {    # will save you a lot of typing
    my ($self, $db_conn, $with_db) = @_;

    warn "\nDEPRECATED: dbconn_2_mysql() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    my $port = $self->o($db_conn,'-port');

    return '--host='.$self->o($db_conn,'-host').' '
          .($port ? '--port='.$self->o($db_conn,'-port').' ' : '')
          .'--user="'.$self->o($db_conn,'-user').'" '
          .'--password="'.$self->o($db_conn,'-pass').'" '
          .($with_db ? ($self->o($db_conn,'-dbname').' ') : '');
}


=head2 dbconn_2_pgsql

    Description : Deprecated method. Please use $self->db_cmd() instead.

=cut

sub dbconn_2_pgsql {    # will save you a lot of typing
    my ($self, $db_conn, $with_db) = @_;

    warn "\nDEPRECATED: dbconn_2_pgsql() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    my $port = $self->o($db_conn,'-port');

    return '--host='.$self->o($db_conn,'-host').' '
          .($port ? '--port='.$self->o($db_conn,'-port').' ' : '')
          .'--username="'.$self->o($db_conn,'-user').'" '
          .($with_db ? ($self->o($db_conn,'-dbname').' ') : '');
}

=head2 db_connect_command

    Description : Deprecated method. Please use $self->db_cmd() instead.

=cut

sub db_connect_command {
    my ($self, $db_conn) = @_;

    warn "\nDEPRECATED: db_connect_command() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    my $driver = $self->o($db_conn, '-driver');

    return {
        'sqlite'    => 'sqlite3 '.$self->o($db_conn, '-dbname'),
        'mysql'     => 'mysql '.$self->dbconn_2_mysql($db_conn, 1),
        'pgsql'     => "env PGPASSWORD='".$self->o($db_conn,'-pass')."' psql ".$self->dbconn_2_pgsql($db_conn, 1),
    }->{ $driver };
}


=head2 db_execute_command

    Description : Deprecated method. Please use $self->db_cmd() instead.

=cut

sub db_execute_command {
    my ($self, $db_conn, $sql_command, $with_db) = @_;

    warn "\nDEPRECATED: db_execute_command() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    $with_db = 1 unless(defined($with_db));

    my $driver = $self->o($db_conn, '-driver');

    if(($driver eq 'sqlite') && !$with_db) {    # in these special cases we pretend sqlite can understand these commands
        return "rm -f $1" if($sql_command=~/DROP\s+DATABASE\s+(?:IF\s+EXISTS\s+)?(\w+)/);
        return "touch $1" if($sql_command=~/CREATE\s+DATABASE\s+(\w+)/);
    } else {
        return {
            'sqlite'    => 'sqlite3 '.$self->o($db_conn, '-dbname')." '$sql_command'",
            'mysql'     => 'mysql '.$self->dbconn_2_mysql($db_conn, $with_db)." -e '$sql_command'",
            'pgsql'     => "env PGPASSWORD='".$self->o($db_conn,'-pass')."' psql --command='$sql_command' ".$self->dbconn_2_pgsql($db_conn, $with_db),
        }->{ $driver };
    }
}


=head2 dbconn_2_url

    Description :  A convenience method used to stringify a connection-parameters hash into a 'pipeline_url' that beekeeper.pl will understand
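
                   The resulting URL looks like one of the following (illustrative values):

                        mysql://ensadmin:secret@localhost:3306/my_db_name
                        sqlite:///my_db_name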

=cut

sub dbconn_2_url {
    my ($self, $db_conn, $with_db) = @_;

    $with_db = 1 unless(defined($with_db));

    my $driver = $self->o($db_conn, '-driver');
    my $port   = $self->o($db_conn,'-port');

    return (    ($driver eq 'sqlite')
            ? $driver.':///'
            : $driver.'://'.$self->o($db_conn,'-user').':'.$self->o($db_conn,'-pass').'@'.$self->o($db_conn,'-host').($port ? ':'.$port : '').'/'
           ) . ($with_db ? $self->o($db_conn,'-dbname') : '');
}


sub pipeline_url {
    my $self = shift @_;

    return $self->root()->{'pipeline_url'} || $self->dbconn_2_url('pipeline_db', 1); # used to force vivification of the whole 'pipeline_db' structure (used in run() )
}


=head2 db_cmd

    Description :  Returns a db_cmd.pl-based command line that should work with any supported driver (mysql/pgsql/sqlite)
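
                   For example (as already used in pipeline_create_commands() above):

                        $self->db_cmd('CREATE DATABASE')                                      # run one SQL command against the pipeline database
                        $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/tables.mysql'    # just connect, piping a script in via '<'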

=cut

sub db_cmd {
    my ($self, $sql_command, $db_url) = @_;

    $db_url //= $self->pipeline_url();
    my $db_cmd_path = $self->o('hive_root_dir').'/scripts/db_cmd.pl';

    return "$db_cmd_path -url '$db_url'".($sql_command ? " -sql '$sql_command'" : '');
}


sub pipeline_name {
    my $self            = shift @_;
    my $pipeline_name   = shift @_;

    unless($pipeline_name) {    # or turn the ClassName into pipeline_name:
        $pipeline_name = ref($self);        # get the original class name
        $pipeline_name=~s/^.*:://;          # trim the leading classpath prefix
        $pipeline_name=~s/_conf$//;         # trim the optional _conf from the end
    }

    $pipeline_name=~s/([[:lower:]])([[:upper:]])/${1}_${2}/g;   # CamelCase into Camel_Case
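    # e.g. "Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf" ends up as "long_mult"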

    return lc($pipeline_name);
}


=head2 process_options

    Description : The method that does all the parameter parsing magic.
                  It makes two passes through the interface methods: the first pass collects the options, the second performs intelligent substitution.

    Caller      : init_pipeline.pl or any other script that will drive this module.

    Note        : You can override the command-line parsing by providing a hash as the argument to this method.
                  This hash should contain definitions of all the parameters you would otherwise be providing from the command line.
                  Useful if you are creating batches of hive pipelines using a script.

=cut

sub process_options {
    my $self = shift @_;

        # pre-patch definitely_used_options:
    $self->{'_extra_options'} = $self->load_cmdline_options( $self->pre_options() );
    $self->root()->{'pipeline_url'} = $self->{'_extra_options'}{'pipeline_url'};

    $self->use_cases( [ 'pipeline_create_commands', 'pipeline_wide_parameters', 'resource_classes', 'pipeline_analyses', 'beekeeper_extra_cmdline_options', 'pipeline_url', 'hive_meta_table' ] );
    return $self->SUPER::process_options();
}


=head2 run

    Description : The method that uses the Hive/EnsEMBL API to actually create all the analyses, jobs, dataflow and control rules and resource descriptions.

    Caller      : init_pipeline.pl or any other script that will drive this module.

=cut

sub run {
    my $self  = shift @_;
    my $analysis_topup  = $self->{'_extra_options'}{'analysis_topup'};
    my $job_topup       = $self->{'_extra_options'}{'job_topup'};
    my $pipeline_url    = $self->pipeline_url();
    my $pipeline_name   = $self->o('pipeline_name');

    unless($analysis_topup || $job_topup) {
        foreach my $cmd (@{$self->pipeline_create_commands}) {
            warn "Running the command:\n\t$cmd\n";
            if(my $retval = system($cmd)) {
                die "Return value = $retval, possibly an error\n";
            } else {
                warn "Done.\n\n";
            }
        }
    }

    Bio::EnsEMBL::Registry->no_version_check(1);
    my $hive_dba                     = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new( -url => $pipeline_url, -no_sql_schema_version_check => 1 );
    my $resource_class_adaptor       = $hive_dba->get_ResourceClassAdaptor;

    unless($job_topup) {
        my $meta_adaptor = $hive_dba->get_MetaAdaptor;      # the new adaptor for 'hive_meta' table
        warn "Loading hive_meta table ...\n";
        my $hive_meta_table = $self->hive_meta_table;
        while( my($meta_key, $meta_value) = each %$hive_meta_table ) {
            $meta_adaptor->store_pair( $meta_key, $meta_value );
        }

        my $meta_container = $hive_dba->get_MetaContainer;  # adaptor over core's 'meta' table for compatibility with core API
        warn "Loading pipeline-wide parameters ...\n";

        my $pipeline_wide_parameters = $self->pipeline_wide_parameters;
        while( my($meta_key, $meta_value) = each %$pipeline_wide_parameters ) {
            if($analysis_topup) {
                $meta_container->remove_all_by_meta_key($meta_key);
            }
            $meta_container->store_pair($meta_key, $meta_value);
        }
        warn "Done.\n\n";

            # pre-load resource_class and resource_description tables:
        my $resource_description_adaptor    = $hive_dba->get_ResourceDescriptionAdaptor;
        warn "Loading the Resources ...\n";

        my $resource_classes_hash = $self->resource_classes;
        my @resource_classes_order = sort { ($b eq 'default') or -($a eq 'default') or ($a cmp $b) } keys %$resource_classes_hash; # put 'default' to the front
        my %seen_resource_name = ();
        foreach my $rc_id (@resource_classes_order) {
            my $mt2param = $resource_classes_hash->{$rc_id};

            my $rc_name = delete $mt2param->{-desc};
            if($rc_id!~/^\d+$/) {
                $rc_name  = $rc_id;
                $rc_id = undef;
            }

            if(!$rc_name or $seen_resource_name{lc($rc_name)}++) {
                die "Every resource has to have a unique description, please fix the PipeConfig file";
            }

            my ($rc, $rc_newly_created) = $resource_class_adaptor->create_new(
                defined($rc_id) ? (-DBID   => $rc_id) : (),
                -NAME   => $rc_name,
                1   # check whether this ResourceClass was already present in the database
            );
            $rc_id = $rc->dbID();

            if($rc_newly_created) {
                warn "Creating resource_class $rc_name($rc_id).\n";
            } else {
                warn "Attempt to re-create and potentially redefine resource_class $rc_name($rc_id). NB: This may affect already created analyses!\n";
            }

            while( my($meadow_type, $resource_param_list) = each %$mt2param ) {
                $resource_param_list = [ $resource_param_list ] unless(ref($resource_param_list));  # expecting either a scalar or a 2-element array

                $resource_description_adaptor->create_new(
                    -resource_class_id      => $rc_id,
                    -meadow_type            => $meadow_type,
                    -submission_cmd_args    => $resource_param_list->[0],
                    -worker_cmd_args        => $resource_param_list->[1],
                );
            }
        }
        unless(my $default_rc = $resource_class_adaptor->fetch_by_name('default')) {
            warn "\tNB:'default' resource class is not in the database (did you forget to inherit from SUPER::resource_classes ?) - creating it for you\n";
            $resource_class_adaptor->create_new(-NAME => 'default');
        }
        warn "Done.\n\n";
    }

    my $analysis_adaptor            = $hive_dba->get_AnalysisAdaptor;
    my $analysis_stats_adaptor      = $hive_dba->get_AnalysisStatsAdaptor;
    my $job_adaptor                 = $hive_dba->get_AnalysisJobAdaptor;

    my $valley = Bio::EnsEMBL::Hive::Valley->new( {}, 'LOCAL' );

    my %seen_logic_name = ();

    foreach my $aha (@{$self->pipeline_analyses}) {
        my %aha_copy = %$aha;
        my ($logic_name, $module, $parameters_hash, $input_ids, $blocked, $batch_size, $hive_capacity, $failed_job_tolerance,
                $max_retry_count, $can_be_empty, $rc_id, $rc_name, $priority, $meadow_type, $analysis_capacity, $wait_for, $flow_into)
         = delete @aha_copy{qw(-logic_name -module -parameters -input_ids -blocked -batch_size -hive_capacity -failed_job_tolerance
                 -max_retry_count -can_be_empty -rc_id -rc_name -priority -meadow_type -analysis_capacity -wait_for -flow_into)};   # slicing a hash reference

         my @unparsed_attribs = keys %aha_copy;
         if(@unparsed_attribs) {
             die "Could not parse the following analysis attributes: ".join(', ',@unparsed_attribs);
         }

        unless($logic_name) {
            die "'-logic_name' must be defined in every analysis";
        }

        if($seen_logic_name{$logic_name}++) {
            die "an entry with -logic_name '$logic_name' appears at least twice in the same configuration file, probably a typo";
        }

        my $analysis = $analysis_adaptor->fetch_by_logic_name($logic_name);
        if( $analysis ) {

            if($analysis_topup) {
                warn "Skipping creation of already existing analysis '$logic_name'.\n";
                next;
            }

        } else {

            if($job_topup) {
                die "Could not fetch analysis '$logic_name'";
            }

            warn "Creating analysis '$logic_name'.\n";

            if($rc_id) {
                warn "(-rc_id => $rc_id) syntax is deprecated, please start using (-rc_name => 'your_resource_class_name')";
            } else {
                $rc_name ||= 'default';
                my $rc = $resource_class_adaptor->fetch_by_name($rc_name ) or die "Could not fetch resource with name '$rc_name', please check that resource_classes() method of your PipeConfig either contain it or inherit from the parent class";
                $rc_id = $rc->dbID();
            }

            if ($meadow_type and not exists $valley->available_meadow_hash()->{$meadow_type}) {
                die "The meadow '$meadow_type' is currently not registered (analysis '$logic_name')\n";
            }

            $parameters_hash ||= {};    # in case nothing was given
            die "'-parameters' has to be a hash" unless(ref($parameters_hash) eq 'HASH');

            $analysis = Bio::EnsEMBL::Hive::Analysis->new(
                -logic_name             => $logic_name,
                -module                 => $module,
                -parameters             => $parameters_hash,
                -resource_class_id      => $rc_id,
                -failed_job_tolerance   => $failed_job_tolerance,
                -max_retry_count        => $max_retry_count,
                -can_be_empty           => $can_be_empty,
                -priority               => $priority,
                -meadow_type            => $meadow_type,
                -analysis_capacity      => $analysis_capacity,
            );
            $analysis->get_compiled_module_name();  # check if it compiles and is named correctly
            $analysis_adaptor->store($analysis);

            my $stats = Bio::EnsEMBL::Hive::AnalysisStats->new(
                -analysis_id            => $analysis->dbID,
                -batch_size             => $batch_size,
                -hive_capacity          => $hive_capacity,
                -status                 => $blocked ? 'BLOCKED' : 'EMPTY',  # be careful, as this "soft" way of blocking may be accidentally unblocked by deep sync
                -total_job_count        => 0,
                -semaphored_job_count   => 0,
                -ready_job_count        => 0,
                -done_job_count         => 0,
                -failed_job_count       => 0,
                -num_running_workers    => 0,
                -num_required_workers   => 0,
                -behaviour              => 'STATIC',
                -input_capacity         => 4,
                -output_capacity        => 4,
                -sync_lock              => 0,
            );
            $analysis_stats_adaptor->store($stats);
        }

            # now create the corresponding jobs (if there are any):
        if($input_ids) {
            my @jobs = map { Bio::EnsEMBL::Hive::AnalysisJob->new(
                -prev_job_id    => undef,           # these jobs are created by the initialization script, not by another job
                -analysis_id    => $analysis->dbID,
                -input_id       => $_,              # input_ids are now centrally stringified in the AnalysisJob itself
            ) } @$input_ids;

            $job_adaptor->store_jobs_and_adjust_counters( \@jobs );
        }
    }

    unless($job_topup) {

            # Now, run separately through the already created analyses and link them together:
            #
        my $ctrl_rule_adaptor            = $hive_dba->get_AnalysisCtrlRuleAdaptor;
        my $dataflow_rule_adaptor        = $hive_dba->get_DataflowRuleAdaptor;

        foreach my $aha (@{$self->pipeline_analyses}) {
            my ($logic_name, $wait_for, $flow_into)
                 = @{$aha}{qw(-logic_name -wait_for -flow_into)};   # slicing a hash reference

            my $analysis = $analysis_adaptor->fetch_by_logic_name($logic_name);

            $wait_for ||= [];
            $wait_for   = [ $wait_for ] unless(ref($wait_for) eq 'ARRAY'); # force scalar into an arrayref

                # create control rules:
            foreach my $condition_url (@$wait_for) {
                unless ($condition_url =~ m{^\w*://}) {
                    my $condition_analysis = $analysis_adaptor->fetch_by_logic_name($condition_url);
                    die "Could not fetch analysis '$condition_url' to create a control rule (in '".($analysis->logic_name)."')\n" unless defined $condition_analysis;
                }
                my $c_rule = Bio::EnsEMBL::Hive::AnalysisCtrlRule->new(
                        -condition_analysis_url => $condition_url,
                        -ctrled_analysis_id     => $analysis->dbID,
                );
                $ctrl_rule_adaptor->store( $c_rule, 1 );

                warn $c_rule->toString."\n";
            }

            $flow_into ||= {};
            $flow_into   = { 1 => $flow_into } unless(ref($flow_into) eq 'HASH'); # force non-hash into a hash

            my %group_tag_to_funnel_dataflow_rule_id = ();
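
            # Branch tags may carry semaphore-group notation, e.g. in a PipeConfig's -flow_into
            # (the analysis names are illustrative):
            #   -flow_into  => {
            #       '2->A' => [ 'fan_analysis' ],       # jobs flown on branch #2 join semaphore-group 'A'
            #       'A->1' => [ 'funnel_analysis' ],    # the funnel on branch #1 waits for the whole of group 'A'
            #   },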

            my $semaphore_sign = '->';

            my @all_branch_tags = keys %$flow_into;
            foreach my $branch_tag ((grep {/^[A-Z]$semaphore_sign/} @all_branch_tags), (grep {/$semaphore_sign[A-Z]$/} @all_branch_tags), (grep {!/$semaphore_sign/} @all_branch_tags)) {

                my ($branch_name_or_code, $group_role, $group_tag);

                if($branch_tag=~/^([A-Z])$semaphore_sign(-?\w+)$/) {
                    ($branch_name_or_code, $group_role, $group_tag) = ($2, 'funnel', $1);
                } elsif($branch_tag=~/^(-?\w+)$semaphore_sign([A-Z])$/) {
                    ($branch_name_or_code, $group_role, $group_tag) = ($1, 'fan', $2);
                } elsif($branch_tag=~/^(-?\w+)$/) {
                    ($branch_name_or_code, $group_role, $group_tag) = ($1, '');
                } elsif($branch_tag=~/:/) {
                    die "Please use newer '2${semaphore_sign}A' and 'A${semaphore_sign}1' notation instead of '2:1' and '1'\n";
                } else {
                    die "Error parsing the group tag '$branch_tag'\n";
                }

                my $funnel_dataflow_rule_id = undef;    # NULL by default

                if($group_role eq 'fan') {
                    unless($funnel_dataflow_rule_id = $group_tag_to_funnel_dataflow_rule_id{$group_tag}) {
                        die "No funnel dataflow_rule defined for group '$group_tag'\n";
                    }
                }

                my $heirs = $flow_into->{$branch_tag};
                $heirs = [ $heirs ] unless(ref($heirs)); # force scalar into an arrayref first
                $heirs = { map { ($_ => undef) } @$heirs } if(ref($heirs) eq 'ARRAY'); # now force it into a hash if it wasn't

                while(my ($heir_url, $input_id_template_list) = each %$heirs) {

                    unless ($heir_url =~ m{^\w*://}) {
                        my $heir_analysis = $analysis_adaptor->fetch_by_logic_name($heir_url);
                        die "No analysis named '$heir_url' (dataflow from analysis '".($analysis->logic_name)."')\n" unless defined $heir_analysis;
                    }
                    
                    $input_id_template_list = [ $input_id_template_list ] unless(ref($input_id_template_list) eq 'ARRAY');  # allow for more than one template per analysis

                    foreach my $input_id_template (@$input_id_template_list) {

                        my $df_rule = Bio::EnsEMBL::Hive::DataflowRule->new(
                            -from_analysis              => $analysis,
                            -to_analysis_url            => $heir_url,
                            -branch_code                => $branch_name_or_code,
                            -input_id_template          => $input_id_template,
                            -funnel_dataflow_rule_id    => $funnel_dataflow_rule_id,
                        );
                        $dataflow_rule_adaptor->store( $df_rule, 1 );

                        warn $df_rule->toString."\n";

                        if($group_role eq 'funnel') {
                            if($group_tag_to_funnel_dataflow_rule_id{$group_tag}) {
                                die "More than one funnel dataflow_rule defined for group '$group_tag'\n";
                            } else {
                                $group_tag_to_funnel_dataflow_rule_id{$group_tag} = $df_rule->dbID();
                            }
                        }
                    } # /for all templates
                } # /for all heirs
            } # /for all branch_tags
        }
    }

    print "\n\n# --------------------[Useful commands]--------------------------\n";
    print "\n";
    print " # It is convenient to store the pipeline url in a variable:\n";
    print "\texport EHIVE_URL=$pipeline_url\t\t\t# bash version\n";
    print "(OR)\n";
    print "\tsetenv EHIVE_URL $pipeline_url\t\t\t# [t]csh version\n";
    print "\n";
    print " # Add a new job to the pipeline (usually done once before running, but the pipeline can be \"topped-up\" at any time) :\n";
    print "\tseed_pipeline.pl -url $pipeline_url -logic_name <analysis_name> -input_id <param_hash>\n";
    print "\n";
    print " # Synchronize the Hive (should be done before [re]starting a pipeline) :\n";
    print "\tbeekeeper.pl -url $pipeline_url -sync\n";
    print "\n";
    print " # Run the pipeline (can be interrupted and restarted) :\n";
    print "\tbeekeeper.pl -url $pipeline_url ".$self->beekeeper_extra_cmdline_options()." -loop\t\t# run in looped automatic mode (a scheduling step performed every minute)\n";
    print "(OR)\n";
    print "\tbeekeeper.pl -url $pipeline_url ".$self->beekeeper_extra_cmdline_options()." -run \t\t# run one scheduling step of the pipeline and exit (useful for debugging/learning)\n";
    print "(OR)\n";
    print "\trunWorker.pl -url $pipeline_url ".$self->beekeeper_extra_cmdline_options()."      \t\t# run exactly one Worker locally (useful for debugging/learning)\n";
    print "\n";
    print " # At any moment during or after execution you can request a pipeline diagram in an image file (desired format is set via extension) :\n";
    print "\tgenerate_graph.pl -url $pipeline_url -out $pipeline_name.png\n";
    print "\n";
    print " # If you are running the pipeline on LSF, you can collect actual resource usage statistics :\n";
    print "\tlsf_report.pl -url $pipeline_url\n";
    print "\n";
    print " # After having run lsf_report.pl, you can request a resource usage timeline in an image file (desired format is set via extension) :\n";
    print "\tgenerate_timeline.pl -url $pipeline_url -out timeline_$pipeline_name.png\n";
    print "\n";
    print " # Peek into your pipeline database with a database client (useful to have open while the pipeline is running) :\n";
    print "\tdb_cmd.pl -url $pipeline_url\n\n";
}

1;