HiveGeneric_conf.pm 32.3 KB
Newer Older
Leo Gordon's avatar
Leo Gordon committed
1 2 3 4
=pod 

=head1 NAME

5
    Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf
Leo Gordon's avatar
Leo Gordon committed
6 7 8

=head1 SYNOPSIS

9 10
        # Example 1: specifying only the mandatory option:
    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf -password <mypass>
Leo Gordon's avatar
Leo Gordon committed
11

12 13
        # Example 2: specifying the mandatory options as well as overriding some defaults:
    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf -host <myhost> -dbname <mydbname> -password <mypass>
Leo Gordon's avatar
Leo Gordon committed
14 15 16

=head1 DESCRIPTION

17 18 19 20 21 22 23 24 25 26 27 28
    Generic configuration module for all Hive pipelines with loader functionality.
    All other Hive PipeConfig modules should inherit from this module and will probably need to redefine some or all of the following interface methods:

        * default_options:                  returns a hash of (possibly multilevel) defaults for the options on which depend the rest of the configuration

        * pipeline_create_commands:         returns a list of strings that will be executed as system commands needed to create and set up the pipeline database

        * pipeline_wide_parameters:         returns a hash of pipeline-wide parameter names and their values

        * resource_classes:                 returns a hash of resource class definitions

        * pipeline_analyses:                returns a list of hash structures that define analysis objects bundled with definitions of corresponding jobs, rules and resources
Leo Gordon's avatar
Leo Gordon committed
29

30
        * beekeeper_extra_cmdline_options   returns a string with command line options that you want to be passed to the beekeeper.pl
Leo Gordon's avatar
Leo Gordon committed
31

32 33 34
    When defining anything except the keys of default_options() a call to $self->o('myoption') can be used.
    This call means "substitute this call for the value of 'myoption' at the time of configuring the pipeline".
    All option names mentioned in $self->o() calls within the five interface methods above can be given non-default values from the command line.
Leo Gordon's avatar
Leo Gordon committed
35

36
    Please make sure you have studied the pipeline configuration examples in Bio::EnsEMBL::Hive::PipeConfig before creating your own PipeConfig modules.
Leo Gordon's avatar
Leo Gordon committed
37

38
=head1 LICENSE
Leo Gordon's avatar
Leo Gordon committed
39

40
    Copyright [1999-2014] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
41

42 43
    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
Leo Gordon's avatar
Leo Gordon committed
44

45
         http://www.apache.org/licenses/LICENSE-2.0
Leo Gordon's avatar
Leo Gordon committed
46

47 48 49
    Unless required by applicable law or agreed to in writing, software distributed under the License
    is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and limitations under the License.
Leo Gordon's avatar
Leo Gordon committed
50 51 52

=head1 CONTACT

53
    Please contact ehive-users@ebi.ac.uk mailing list with questions/suggestions.
Leo Gordon's avatar
Leo Gordon committed
54 55 56

=cut

57 58 59 60 61

package Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf;

use strict;
use warnings;
62

63
use Bio::EnsEMBL::ApiVersion ();
64

65
use Bio::EnsEMBL::Hive::Utils::URL;
66
use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor;
67
use Bio::EnsEMBL::Hive::DBSQL::SqlSchemaAdaptor;
68
use Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor;
69
use Bio::EnsEMBL::Hive::Analysis;
70
use Bio::EnsEMBL::Hive::AnalysisStats;
71
use Bio::EnsEMBL::Hive::Extensions;
72
use Bio::EnsEMBL::Hive::Valley;
73

Leo Gordon's avatar
Leo Gordon committed
74 75
use base ('Bio::EnsEMBL::Hive::DependentOptions');

76

77 78
# ---------------------------[the following methods will be overridden by specific pipelines]-------------------------

79

Leo Gordon's avatar
Leo Gordon committed
80 81 82 83 84 85 86
=head2 default_options

    Description : Interface method that should return a hash of option_name->default_option_value pairs.
                  Please see existing PipeConfig modules for examples.

=cut

87 88 89
sub default_options {
    my ($self) = @_;

    # NOTE: a self-referential $self->o('xxx') (e.g. 'password' below) marks the option
    # as having no hard-coded default, so it must come from the command line or an ENV
    # variable — presumably resolved by the DependentOptions saturation pass (TODO confirm).
    return {
            # Please note: ENVironment variables may be "exported" to inherit from enclosing shell,
            # but if you want to *prevent* that you need to specifically say so
            #  (setting a password to empty string does exactly that - sets it to an empty string)
            #
            #   [bash]      export -n ENSEMBL_CVS_ROOT_DIR  # will stop exporting, but the value in current shell stays as it was
            #   [tcsh]      unsetenv ENSEMBL_CVS_ROOT_DIR   # will destroy the variable even in current shell, and stop exporting

        'ensembl_cvs_root_dir'  => $ENV{'ENSEMBL_CVS_ROOT_DIR'} || $self->o('ensembl_cvs_root_dir'),    # it will make sense to set this variable if you are going to use ehive with ensembl
        'ensembl_release'       => Bio::EnsEMBL::ApiVersion::software_version(),                        # snapshot of EnsEMBL Core API version. Please do not change if not sure.

        'hive_root_dir'         => $ENV{'EHIVE_ROOT_DIR'}                                               # this value is set up automatically if this code is run by init_pipeline.pl
                                    || $self->o('ensembl_cvs_root_dir').'/ensembl-hive',                # otherwise we have to rely on other means

        'hive_driver'           => 'mysql',
        'host'                  => $ENV{'EHIVE_HOST'} || 'localhost',                                   # BEWARE that 'localhost' for mysql driver usually means a UNIX socket, not a TCPIP socket!
                                                                                                        # If you need to connect to TCPIP socket, set  -host => '127.0.0.1' instead.

        'port'                  => $ENV{'EHIVE_PORT'},                                                  # or remain undef, which means default for the driver
        'user'                  => $ENV{'EHIVE_USER'} || 'ensadmin',
        'password'              => $ENV{'EHIVE_PASS'} // $ENV{'ENSADMIN_PSW'} // $self->o('password'),  # people will have to make an effort NOT to insert it into config files like .bashrc etc
        'dbowner'               => $ENV{'EHIVE_USER'} || $ENV{'USER'}         || $self->o('dbowner'),   # although it is very unlikely $ENV{USER} is not set
        'pipeline_name'         => $self->pipeline_name(),

        'hive_use_triggers'     => 0,                   # there have been a few cases of big pipelines misbehaving with triggers on, let's keep the default off.
        'hive_use_param_stack'  => 0,                   # do not reconstruct the calling stack of parameters by default (yet)
        'hive_force_init'       => 0,                   # setting it to 1 will drop the database prior to creation (use with care!)

            # connection parameters of the pipeline database itself, assembled from the flat options above:
        'pipeline_db'   => {
            -driver => $self->o('hive_driver'),
            -host   => $self->o('host'),
            -port   => $self->o('port'),
            -user   => $self->o('user'),
            -pass   => $self->o('password'),
            -dbname => $self->o('dbowner').'_'.$self->o('pipeline_name'),  # example of a linked definition (resolved via saturation)
        },
    };
}

128

Leo Gordon's avatar
Leo Gordon committed
129 130 131 132 133 134 135
=head2 pipeline_create_commands

    Description : Interface method that should return a list of command lines to be run in order to create and set up the pipeline database.
                  Please see existing PipeConfig modules for examples.

=cut

136
# Returns the ordered list of shell commands that create and initialise the pipeline
# database: optional DROP, CREATE, then loading of tables / triggers / foreign keys /
# procedures, each conditional on the driver parsed from the pipeline URL.
sub pipeline_create_commands {
    my $self    = shift @_;

    my $pipeline_url    = $self->pipeline_url();
    my $parsed_url      = Bio::EnsEMBL::Hive::Utils::URL::parse( $pipeline_url );
    my $driver          = $parsed_url ? $parsed_url->{'driver'} : '';   # empty string if the URL could not be parsed

    return [
            $self->o('hive_force_init') ? $self->db_cmd('DROP DATABASE IF EXISTS') : (),
            $self->db_cmd('CREATE DATABASE'),

                # we got table definitions for all drivers:
            $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/tables.'.$driver,

                # auto-sync'ing triggers are off by default and not yet available in pgsql:
            $self->o('hive_use_triggers') && ($driver ne 'pgsql')  ? ( $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/triggers.'.$driver ) : (),

                # FOREIGN KEY constraints cannot be defined in sqlite separately from table definitions, so they are off there:
                                             ($driver ne 'sqlite') ? ( $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/foreign_keys.sql' ) : (),

                # we got procedure definitions for all drivers:
            $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/procedures.'.$driver,
    ];
}

161

Leo Gordon's avatar
Leo Gordon committed
162 163 164 165 166 167 168 169
=head2 pipeline_wide_parameters

    Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs.
                  The value doesn't have to be a scalar, can be any Perl structure now (will be stringified and de-stringified automagically).
                  Please see existing PipeConfig modules for examples.

=cut

170 171 172
# Hash of parameters visible to every analysis of the pipeline.
# Subclasses normally extend this; only 'schema_version' is provided here.
sub pipeline_wide_parameters {
    my $self = shift @_;

    my %pipeline_params = (
        'schema_version' => $self->o('ensembl_release'),    # keep compatibility with core API
    );

    return \%pipeline_params;
}

177

Leo Gordon's avatar
Leo Gordon committed
178 179 180 181 182 183 184
=head2 resource_classes

    Description : Interface method that should return a hash of resource_description_id->resource_description_hash.
                  Please see existing PipeConfig modules for examples.

=cut

185 186 187
# Starter set of resource classes, keyed by class name (new-style notation);
# pipelines usually override/extend this hash with their own classes.
sub resource_classes {
    my $self = shift @_;

    my %classes = (
        'default' => { 'LSF' => '' },
        'urgent'  => { 'LSF' => '-q yesterday' },
    );

    return \%classes;
}

197

Leo Gordon's avatar
Leo Gordon committed
198 199 200 201 202 203 204
=head2 pipeline_analyses

    Description : Interface method that should return a list of hashes that define analysis bundled with corresponding jobs, dataflow and analysis_ctrl rules and resource_id.
                  Please see existing PipeConfig modules for examples.

=cut

205 206 207 208 209 210 211
# No analyses are defined at this generic level; subclasses override this
# method to return their list of analysis definition hashes.
sub pipeline_analyses {
    my $self = shift @_;

    return [];
}


212 213 214 215 216 217 218 219 220 221 222 223 224
=head2 beekeeper_extra_cmdline_options

    Description : Interface method that should return a string with extra parameters that you want to be passed to beekeeper.pl

=cut

# Extra command-line options to pass through to beekeeper.pl;
# empty by default, override in subclasses when needed.
sub beekeeper_extra_cmdline_options {
    my $self = shift @_;

    return '';
}


225 226 227
# ---------------------------------[now comes the interfacing stuff - feel free to call but not to modify]--------------------


228 229 230 231
# Key/value pairs to be stored in the 'hive_meta' table at init time:
# the code's SQL schema version, the pipeline name and the param-stack flag.
sub hive_meta_table {
    my $self = shift @_;

    my %meta = (
        'hive_sql_schema_version'   => Bio::EnsEMBL::Hive::DBSQL::SqlSchemaAdaptor->get_code_sql_schema_version(),
        'hive_pipeline_name'        => $self->o('pipeline_name'),
        'hive_use_param_stack'      => $self->o('hive_use_param_stack'),
    );

    return \%meta;
}

Leo Gordon's avatar
Leo Gordon committed
238
# Command-line options that must be parsed *before* the main option-saturation
# pass (Getopt-style specs mapped to empty defaults).
sub pre_options {
    my $self = shift @_;

    my %pre = (
        'help!'             => '',
        'job_topup!'        => '',
        'analysis_topup!'   => '',
        'pipeline_url'      => '',
#        'hive_use_triggers' => '',
    );

    return \%pre;
}

250

Leo Gordon's avatar
Leo Gordon committed
251 252
=head2 dbconn_2_mysql

253
    Description : Deprecated method. Please use $self->db_cmd() instead.
Leo Gordon's avatar
Leo Gordon committed
254 255 256

=cut

257 258 259
# Deprecated: stringify a connection-parameters hash into mysql client options.
# Fix: reuse the already-fetched $port instead of issuing a second, redundant
# $self->o($db_conn,'-port') call inside the ternary.
sub dbconn_2_mysql {    # will save you a lot of typing
    my ($self, $db_conn, $with_db) = @_;

    warn "\nDEPRECATED: dbconn_2_mysql() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    my $port = $self->o($db_conn,'-port');

    return '--host='.$self->o($db_conn,'-host').' '
          .($port ? '--port='.$port.' ' : '')                       # --port only when a port was actually given
          .'--user="'.$self->o($db_conn,'-user').'" '
          .'--password="'.$self->o($db_conn,'-pass').'" '
          .($with_db ? ($self->o($db_conn,'-dbname').' ') : '');    # append the database name only on request
}

271

272 273
=head2 dbconn_2_pgsql

274
    Description : Deprecated method. Please use $self->db_cmd() instead.
275 276 277 278 279 280

=cut

# Deprecated: stringify a connection-parameters hash into psql client options.
# Fix: reuse the already-fetched $port instead of issuing a second, redundant
# $self->o($db_conn,'-port') call inside the ternary.
# Note: the password is deliberately not emitted here (psql takes it via PGPASSWORD).
sub dbconn_2_pgsql {    # will save you a lot of typing
    my ($self, $db_conn, $with_db) = @_;

    warn "\nDEPRECATED: dbconn_2_pgsql() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    my $port = $self->o($db_conn,'-port');

    return '--host='.$self->o($db_conn,'-host').' '
          .($port ? '--port='.$port.' ' : '')                       # --port only when a port was actually given
          .'--username="'.$self->o($db_conn,'-user').'" '
          .($with_db ? ($self->o($db_conn,'-dbname').' ') : '');    # append the database name only on request
}

291 292
=head2 db_connect_command

293
    Description : Deprecated method. Please use $self->db_cmd() instead.
294 295 296 297 298 299

=cut

# Deprecated: build an interactive client command line for the given connection
# hash, dispatching on its '-driver' value. Returns undef for unknown drivers.
sub db_connect_command {
    my ($self, $db_conn) = @_;

    warn "\nDEPRECATED: db_connect_command() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    my $driver = $self->o($db_conn, '-driver');

        # anonymous dispatch table keyed by driver name:
    return {
        'sqlite'    => 'sqlite3 '.$self->o($db_conn, '-dbname'),
        'mysql'     => 'mysql '.$self->dbconn_2_mysql($db_conn, 1),
        'pgsql'     => "env PGPASSWORD='".$self->o($db_conn,'-pass')."' psql ".$self->dbconn_2_pgsql($db_conn, 1),
    }->{ $driver };
}


=head2 db_execute_command

314
    Description : Deprecated method. Please use $self->db_cmd() instead.
315 316 317 318

=cut

# Deprecated: build a shell command that executes $sql_command against the given
# connection hash. For sqlite without a database, DROP/CREATE DATABASE statements
# are emulated with 'rm -f' / 'touch' on the database file.
sub db_execute_command {
    my ($self, $db_conn, $sql_command, $with_db) = @_;

    warn "\nDEPRECATED: db_execute_command() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    $with_db = 1 unless(defined($with_db));     # connect to the named database by default

    my $driver = $self->o($db_conn, '-driver');

    if(($driver eq 'sqlite') && !$with_db) {    # in these special cases we pretend sqlite can understand these commands
        # $1 is safe here: each return is only reached when the preceding match succeeded
        return "rm -f $1" if($sql_command=~/DROP\s+DATABASE\s+(?:IF\s+EXISTS\s+)?(\w+)/);
        return "touch $1" if($sql_command=~/CREATE\s+DATABASE\s+(\w+)/);
    } else {
        # dispatch on driver; returns undef for unknown drivers
        return {
            'sqlite'    => 'sqlite3 '.$self->o($db_conn, '-dbname')." '$sql_command'",
            'mysql'     => 'mysql '.$self->dbconn_2_mysql($db_conn, $with_db)." -e '$sql_command'",
            'pgsql'     => "env PGPASSWORD='".$self->o($db_conn,'-pass')."' psql --command='$sql_command' ".$self->dbconn_2_pgsql($db_conn, $with_db),
        }->{ $driver };
    }
}


Leo Gordon's avatar
Leo Gordon committed
340 341
=head2 dbconn_2_url

342
    Description :  A convenience method used to stringify a connection-parameters hash into a 'pipeline_url' that beekeeper.pl will understand
Leo Gordon's avatar
Leo Gordon committed
343 344 345

=cut

346
# Stringify a connection-parameters hash into a driver://user:pass@host:port/dbname
# URL that beekeeper.pl understands; sqlite gets the credential-less driver:/// form.
sub dbconn_2_url {
    my ($self, $db_conn, $with_db) = @_;

    $with_db = 1 unless(defined($with_db));     # include the database name by default

    my $driver = $self->o($db_conn, '-driver');
    my $port   = $self->o($db_conn,'-port');

    return (    ($driver eq 'sqlite')
            ? $driver.':///'
            : $driver.'://'.$self->o($db_conn,'-user').':'.$self->o($db_conn,'-pass').'@'.$self->o($db_conn,'-host').($port ? ':'.$port : '').'/'
           ) . ($with_db ? $self->o($db_conn,'-dbname') : '');
}

360

Leo Gordon's avatar
Leo Gordon committed
361 362 363
# Return the pipeline database URL: an explicitly supplied -pipeline_url wins,
# otherwise one is built from 'pipeline_db' (which also forces vivification of
# the whole 'pipeline_db' structure, used in run()).
sub pipeline_url {
    my $self = shift @_;

    my $explicit_url = $self->root()->{'pipeline_url'};

    return $explicit_url || $self->dbconn_2_url('pipeline_db', 1);
}
366

367

368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
=head2 db_cmd

    Description :  Returns a db_cmd.pl-based command line that should execute by any supported driver (mysql/pgsql/sqlite)

=cut

# Build a db_cmd.pl-based command line that works with any supported driver
# (mysql/pgsql/sqlite); appends "-sql '...'" only when a command was given.
sub db_cmd {
    my ($self, $sql_command, $db_url) = @_;

    # default to this pipeline's own database when no explicit URL was supplied:
    $db_url = $self->pipeline_url() unless defined $db_url;

    my $script = $self->o('hive_root_dir').'/scripts/db_cmd.pl';

    my $cmd = "$script -url $db_url";
    $cmd .= " -sql '$sql_command'" if $sql_command;

    return $cmd;
}


384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399
# Normalise an explicit pipeline name, or derive one from the PipeConfig class
# name (strip package path and optional '_conf'), converting CamelCase to
# lower_snake_case.
sub pipeline_name {
    my ($self, $pipeline_name) = @_;

    if (!$pipeline_name) {                  # derive it from the class name instead
        $pipeline_name = ref($self);        # the original class name
        $pipeline_name =~ s/^.*:://;        # drop the leading package path
        $pipeline_name =~ s/_conf$//;       # drop the optional '_conf' suffix
    }

    $pipeline_name =~ s/([[:lower:]])([[:upper:]])/${1}_${2}/g;     # CamelCase into Camel_Case

    return lc($pipeline_name);
}


Leo Gordon's avatar
Leo Gordon committed
400 401 402 403 404 405 406 407 408 409 410 411 412
=head2 process_options

    Description : The method that does all the parameter parsing magic.
                  It is two-pass through the interface methods: first pass collects the options, second is intelligent substitution.

    Caller      : init_pipeline.pl or any other script that will drive this module.

    Note        : You can override parsing the command line bit by providing a hash as the argument to this method.
                  This hash should contain definitions of all the parameters you would otherwise be providing from the command line.
                  Useful if you are creating batches of hive pipelines using a script.

=cut

413
# Two-pass option processing entry point (called by init_pipeline.pl):
# first the pre_options are patched in from the command line, then the parent
# class runs collection + intelligent substitution over the listed use-cases.
sub process_options {
    my $self = shift @_;

        # pre-patch definitely_used_options:
    $self->{'_extra_options'} = $self->load_cmdline_options( $self->pre_options() );
    $self->root()->{'pipeline_url'} = $self->{'_extra_options'}{'pipeline_url'};

        # the interface methods whose $self->o() calls drive option collection:
    $self->use_cases( [ 'pipeline_create_commands', 'pipeline_wide_parameters', 'resource_classes', 'pipeline_analyses', 'beekeeper_extra_cmdline_options', 'pipeline_url', 'hive_meta_table' ] );
    return $self->SUPER::process_options();
}

Leo Gordon's avatar
Leo Gordon committed
424

Leo Gordon's avatar
Leo Gordon committed
425 426 427 428 429 430 431 432
=head2 run

    Description : The method that uses the Hive/EnsEMBL API to actually create all the analyses, jobs, dataflow and control rules and resource descriptions.

    Caller      : init_pipeline.pl or any other script that will drive this module.

=cut

433
sub run {
434
    my $self  = shift @_;
435 436 437
    my $analysis_topup  = $self->{'_extra_options'}{'analysis_topup'};
    my $job_topup       = $self->{'_extra_options'}{'job_topup'};
    my $pipeline_url    = $self->pipeline_url();
438
    my $pipeline_name   = $self->o('pipeline_name');
439

440
    unless($analysis_topup || $job_topup) {
441 442 443 444 445 446 447 448 449 450
        foreach my $cmd (@{$self->pipeline_create_commands}) {
            warn "Running the command:\n\t$cmd\n";
            if(my $retval = system($cmd)) {
                die "Return value = $retval, possibly an error\n";
            } else {
                warn "Done.\n\n";
            }
        }
    }

451
    Bio::EnsEMBL::Registry->no_version_check(1);
452
    my $hive_dba                     = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new( -url => $pipeline_url, -no_sql_schema_version_check => 1 );
453
    my $resource_class_adaptor       = $hive_dba->get_ResourceClassAdaptor;
454

455
    unless($job_topup) {
456 457 458 459 460 461 462 463
        my $meta_adaptor = $hive_dba->get_MetaAdaptor;      # the new adaptor for 'hive_meta' table
        warn "Loading hive_meta table ...\n";
        my $hive_meta_table = $self->hive_meta_table;
        while( my($meta_key, $meta_value) = each %$hive_meta_table ) {
            $meta_adaptor->store_pair( $meta_key, $meta_value );
        }

        my $meta_container = $hive_dba->get_MetaContainer;  # adaptor over core's 'meta' table for compatibility with core API
464 465 466 467 468
        warn "Loading pipeline-wide parameters ...\n";

        my $pipeline_wide_parameters = $self->pipeline_wide_parameters;
        while( my($meta_key, $meta_value) = each %$pipeline_wide_parameters ) {
            if($analysis_topup) {
469
                $meta_container->remove_all_by_meta_key($meta_key);
470
            }
471
            $meta_container->store_pair($meta_key, $meta_value);
472
        }
473 474
        warn "Done.\n\n";

475 476 477
            # pre-load resource_class and resource_description tables:
        my $resource_description_adaptor    = $hive_dba->get_ResourceDescriptionAdaptor;
        warn "Loading the Resources ...\n";
478

479 480
        my $resource_classes_hash = $self->resource_classes;
        my @resource_classes_order = sort { ($b eq 'default') or -($a eq 'default') or ($a cmp $b) } keys %$resource_classes_hash; # put 'default' to the front
481
        my %seen_resource_name = ();
482 483
        foreach my $rc_id (@resource_classes_order) {
            my $mt2param = $resource_classes_hash->{$rc_id};
484

485
            my $rc_name = delete $mt2param->{-desc};
486
            if($rc_id!~/^\d+$/) {
487
                $rc_name  = $rc_id;
488
                $rc_id = undef;
489 490
            }

491
            if(!$rc_name or $seen_resource_name{lc($rc_name)}++) {
492 493 494
                die "Every resource has to have a unique description, please fix the PipeConfig file";
            }

495
            my ($rc, $rc_newly_created) = $resource_class_adaptor->create_new(
496
                defined($rc_id) ? (-DBID   => $rc_id) : (),
497
                -NAME   => $rc_name,
498
                1   # check whether this ResourceClass was already present in the database
499
            );
500
            $rc_id = $rc->dbID();
501

502 503 504 505 506
            if($rc_newly_created) {
                warn "Creating resource_class $rc_name($rc_id).\n";
            } else {
                warn "Attempt to re-create and potentially redefine resource_class $rc_name($rc_id). NB: This may affect already created analyses!\n";
            }
507

508 509 510
            while( my($meadow_type, $resource_param_list) = each %$mt2param ) {
                $resource_param_list = [ $resource_param_list ] unless(ref($resource_param_list));  # expecting either a scalar or a 2-element array

511
                $resource_description_adaptor->create_new(
512 513 514 515
                    -resource_class_id      => $rc_id,
                    -meadow_type            => $meadow_type,
                    -submission_cmd_args    => $resource_param_list->[0],
                    -worker_cmd_args        => $resource_param_list->[1],
516 517
                );
            }
518
        }
519 520
        unless(my $default_rc = $resource_class_adaptor->fetch_by_name('default')) {
            warn "\tNB:'default' resource class is not in the database (did you forget to inherit from SUPER::resource_classes ?) - creating it for you\n";
521 522
            $resource_class_adaptor->create_new(-NAME => 'default');
        }
523
        warn "Done.\n\n";
524 525 526
    }

    my $analysis_adaptor             = $hive_dba->get_AnalysisAdaptor;
527
    my $analysis_stats_adaptor       = $hive_dba->get_AnalysisStatsAdaptor;
528

529 530
    my $valley = Bio::EnsEMBL::Hive::Valley->new( {}, 'LOCAL' );

531 532
    my %seen_logic_name = ();

533
    foreach my $aha (@{$self->pipeline_analyses}) {
534 535
        my ($logic_name, $module, $parameters_hash, $input_ids, $blocked, $batch_size, $hive_capacity, $failed_job_tolerance,
                $max_retry_count, $can_be_empty, $rc_id, $rc_name, $priority, $meadow_type, $analysis_capacity)
536
         = @{$aha}{qw(-logic_name -module -parameters -input_ids -blocked -batch_size -hive_capacity -failed_job_tolerance
537
                 -max_retry_count -can_be_empty -rc_id -rc_name -priority -meadow_type -analysis_capacity)};   # slicing a hash reference
538

539 540
        unless($logic_name) {
            die "logic_name' must be defined in every analysis";
541 542
        }

543 544 545
        if($seen_logic_name{$logic_name}++) {
            die "an entry with logic_name '$logic_name' appears at least twice in the configuration file, can't continue";
        }
546

547 548
        my $analysis = $analysis_adaptor->fetch_by_logic_name($logic_name);
        if( $analysis ) {
549

550 551 552 553
            if($analysis_topup) {
                warn "Skipping creation of already existing analysis '$logic_name'.\n";
                next;
            }
554 555 556

        } else {

557 558 559 560 561
            if($job_topup) {
                die "Could not fetch analysis '$logic_name'";
            }

            warn "Creating analysis '$logic_name'.\n";
562

563 564
            if($rc_id) {
                warn "(-rc_id => $rc_id) syntax is deprecated, please start using (-rc_name => 'your_resource_class_name')";
565 566 567
            } else {
                $rc_name ||= 'default';
                my $rc = $resource_class_adaptor->fetch_by_name($rc_name ) or die "Could not fetch resource with name '$rc_name', please check that resource_classes() method of your PipeConfig either contain it or inherit from the parent class";
568 569 570
                $rc_id = $rc->dbID();
            }

571 572 573 574
            if ($meadow_type and not exists $valley->available_meadow_hash()->{$meadow_type}) {
                die "The meadow '$meadow_type' is currently not registered (analysis '$logic_name')\n";
            }

575 576 577
            $parameters_hash ||= {};    # in case nothing was given
            die "'-parameters' has to be a hash" unless(ref($parameters_hash) eq 'HASH');

578
            $analysis = Bio::EnsEMBL::Hive::Analysis->new(
579 580
                -logic_name             => $logic_name,
                -module                 => $module,
581
                -parameters             => $parameters_hash,
582 583 584 585 586
                -resource_class_id      => $rc_id,
                -failed_job_tolerance   => $failed_job_tolerance,
                -max_retry_count        => $max_retry_count,
                -can_be_empty           => $can_be_empty,
                -priority               => $priority,
587
                -meadow_type            => $meadow_type,
588
                -analysis_capacity      => $analysis_capacity,
589
            );
590
            $analysis->get_compiled_module_name();  # check if it compiles and is named correctly
591 592
            $analysis_adaptor->store($analysis);

593 594 595 596
            my $stats = Bio::EnsEMBL::Hive::AnalysisStats->new(
                -analysis_id            => $analysis->dbID,
                -batch_size             => $batch_size,
                -hive_capacity          => $hive_capacity,
597
                -status                 => $blocked ? 'BLOCKED' : 'EMPTY',  # be careful, as this "soft" way of blocking may be accidentally unblocked by deep sync
598 599
            );
            $analysis_stats_adaptor->store($stats);
600
        }
601 602

            # now create the corresponding jobs (if there are any):
603
        foreach my $input_id_hash (@{$input_ids || []}) {
604 605 606 607

            Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor->CreateNewJob(
                -input_id       => $input_id_hash,  # input_ids are now centrally stringified in the AnalysisJobAdaptor
                -analysis       => $analysis,
608
                -prev_job_id    => undef, # these jobs are created by the initialization script, not by another job
609 610 611 612
            );
        }
    }

613
    unless($job_topup) {
614

615 616 617 618
            # Now, run separately through the already created analyses and link them together:
            #
        my $ctrl_rule_adaptor            = $hive_dba->get_AnalysisCtrlRuleAdaptor;
        my $dataflow_rule_adaptor        = $hive_dba->get_DataflowRuleAdaptor;
619

620
        foreach my $aha (@{$self->pipeline_analyses}) {
621 622
            my ($logic_name, $wait_for, $flow_into)
                 = @{$aha}{qw(-logic_name -wait_for -flow_into)};   # slicing a hash reference
623

624
            my $analysis = $analysis_adaptor->fetch_by_logic_name($logic_name);
625

626 627 628 629
            $wait_for ||= [];
            $wait_for   = [ $wait_for ] unless(ref($wait_for) eq 'ARRAY'); # force scalar into an arrayref

                # create control rules:
630
            foreach my $condition_url (@$wait_for) {
631 632 633
                unless ($condition_url =~ m{^\w*://}) {
                    my $condition_analysis = $analysis_adaptor->fetch_by_logic_name($condition_url);
                    die "Could not fetch analysis '$condition_url' to create a control rule (in '".($analysis->logic_name)."')\n" unless defined $condition_analysis;
634
                }
635 636 637 638 639 640 641
                my $c_rule = Bio::EnsEMBL::Hive::AnalysisCtrlRule->new(
                        -condition_analysis_url => $condition_url,
                        -ctrled_analysis_id     => $analysis->dbID,
                );
                $ctrl_rule_adaptor->store( $c_rule, 1 );

                warn $c_rule->toString."\n";
642 643
            }

644 645
            $flow_into ||= {};
            $flow_into   = { 1 => $flow_into } unless(ref($flow_into) eq 'HASH'); # force non-hash into a hash
646

647
            my %group_tag_to_funnel_dataflow_rule_id = ();
648

649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666
            my $semaphore_sign = '->';

            my @all_branch_tags = keys %$flow_into;
            foreach my $branch_tag ((grep {/^[A-Z]$semaphore_sign/} @all_branch_tags), (grep {/$semaphore_sign[A-Z]$/} @all_branch_tags), (grep {!/$semaphore_sign/} @all_branch_tags)) {

                my ($branch_name_or_code, $group_role, $group_tag);

                if($branch_tag=~/^([A-Z])$semaphore_sign(-?\w+)$/) {
                    ($branch_name_or_code, $group_role, $group_tag) = ($2, 'funnel', $1);
                } elsif($branch_tag=~/^(-?\w+)$semaphore_sign([A-Z])$/) {
                    ($branch_name_or_code, $group_role, $group_tag) = ($1, 'fan', $2);
                } elsif($branch_tag=~/^(-?\w+)$/) {
                    ($branch_name_or_code, $group_role, $group_tag) = ($1, '');
                } elsif($branch_tag=~/:/) {
                    die "Please use newer '2${semaphore_sign}A' and 'A${semaphore_sign}1' notation instead of '2:1' and '1'\n";
                } else {
                    die "Error parsing the group tag '$branch_tag'\n";
                }
667

668 669 670 671 672 673 674 675 676 677
                my $funnel_dataflow_rule_id = undef;    # NULL by default

                if($group_role eq 'fan') {
                    unless($funnel_dataflow_rule_id = $group_tag_to_funnel_dataflow_rule_id{$group_tag}) {
                        die "No funnel dataflow_rule defined for group '$group_tag'\n";
                    }
                }

                my $heirs = $flow_into->{$branch_tag};
                $heirs = [ $heirs ] unless(ref($heirs)); # force scalar into an arrayref first
678
                $heirs = { map { ($_ => undef) } @$heirs } if(ref($heirs) eq 'ARRAY'); # now force it into a hash if it wasn't
679

680
                while(my ($heir_url, $input_id_template_list) = each %$heirs) {
681 682 683 684 685

                    unless ($heir_url =~ m{^\w*://}) {
                        my $heir_analysis = $analysis_adaptor->fetch_by_logic_name($heir_url);
                        die "No analysis named '$heir_url' (dataflow from analysis '".($analysis->logic_name)."')\n" unless defined $heir_analysis;
                    }
686 687
                    
                    $input_id_template_list = [ $input_id_template_list ] unless(ref($input_id_template_list) eq 'ARRAY');  # allow for more than one template per analysis
688

689
                    foreach my $input_id_template (@$input_id_template_list) {
690

691 692 693
                        my $df_rule = Bio::EnsEMBL::Hive::DataflowRule->new(
                            -from_analysis              => $analysis,
                            -to_analysis_url            => $heir_url,
694
                            -branch_code                => $branch_name_or_code,
695 696 697 698 699 700
                            -input_id_template          => $input_id_template,
                            -funnel_dataflow_rule_id    => $funnel_dataflow_rule_id,
                        );
                        $dataflow_rule_adaptor->store( $df_rule, 1 );

                        warn $df_rule->toString."\n";
701 702 703 704 705

                        if($group_role eq 'funnel') {
                            if($group_tag_to_funnel_dataflow_rule_id{$group_tag}) {
                                die "More than one funnel dataflow_rule defined for group '$group_tag'\n";
                            } else {
706
                                $group_tag_to_funnel_dataflow_rule_id{$group_tag} = $df_rule->dbID();
707 708
                            }
                        }
709 710 711
                    } # /for all templates
                } # /for all heirs
            } # /for all branch_tags
712 713 714
        }
    }

715 716 717
    print "\n\n# --------------------[Useful commands]--------------------------\n";
    print "\n";
    print " # It is convenient to store the pipeline url in a variable:\n";
718
    print "\texport EHIVE_URL=$pipeline_url\t\t\t# bash version\n";
719
    print "(OR)\n";
720
    print "\tsetenv EHIVE_URL $pipeline_url\t\t\t# [t]csh version\n";
721 722
    print "\n";
    print " # Add a new job to the pipeline (usually done once before running, but pipeline can be \"topped-up\" at any time) :\n";
723
    print "\tseed_pipeline.pl -url $pipeline_url -logic_name <analysis_name> -input_id <param_hash>\n";
724 725
    print "\n";
    print " # Synchronize the Hive (should be done before [re]starting a pipeline) :\n";
726
    print "\tbeekeeper.pl -url $pipeline_url -sync\n";
727 728
    print "\n";
    print " # Run the pipeline (can be interrupted and restarted) :\n";
729
    print "\tbeekeeper.pl -url $pipeline_url ".$self->beekeeper_extra_cmdline_options()." -loop\t\t# run in looped automatic mode (a scheduling step performed every minute)\n";
Leo Gordon's avatar
Leo Gordon committed
730
    print "(OR)\n";
731
    print "\tbeekeeper.pl -url $pipeline_url ".$self->beekeeper_extra_cmdline_options()." -run \t\t# run one scheduling step of the pipeline and exit (useful for debugging/learning)\n";
732
    print "(OR)\n";
733
    print "\trunWorker.pl -url $pipeline_url ".$self->beekeeper_extra_cmdline_options()."      \t\t# run exactly one Worker locally (useful for debugging/learning)\n";
734 735
    print "\n";
    print " # At any moment during or after execution you can request a pipeline diagram in an image file (desired format is set via extension) :\n";
736
    print "\tgenerate_graph.pl -url $pipeline_url -out $pipeline_name.png\n";
737
    print "\n";
738 739 740 741 742 743
    print " # If you are running the pipeline on LSF, you can collect actual resource usage statistics :\n";
    print "\tlsf_report.pl -url $pipeline_url\n";
    print "\n";
    print " # After having run lsf_report.pl, you can request a resource usage timeline in an image file (desired format is set via extension) :\n";
    print "\tgenerate_timeline.pl -url $pipeline_url -out timeline_$pipeline_name.png\n";
    print "\n";
744
    print " # Peek into your pipeline database with a database client (useful to have open while the pipeline is running) :\n";
745
    print "\tdb_cmd.pl -url $pipeline_url\n\n";
746 747 748
}

1;  # a Perl module must end by returning a true value to signal successful loading