=pod 

=head1 NAME

Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf

=head1 SYNOPSIS

    # Example 1: specifying only the mandatory option:
    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf -password <mypass>

    # Example 2: specifying the mandatory options as well as overriding some defaults:
    init_pipeline.pl Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf -host <myhost> -dbname <mydbname> -password <mypass>

=head1 DESCRIPTION

Generic configuration module for all Hive pipelines with loader functionality.
All other Hive PipeConfig modules should inherit from this module and will probably need to redefine some or all of the following interface methods:

    * default_options:                  returns a hash of (possibly multilevel) defaults for the options on which the rest of the configuration depends

    * pipeline_create_commands:         returns a list of strings that will be executed as system commands needed to create and set up the pipeline database

    * pipeline_wide_parameters:         returns a hash of pipeline-wide parameter names and their values

    * resource_classes:                 returns a hash of resource class definitions

    * pipeline_analyses:                returns a list of hash structures that define analysis objects bundled with definitions of corresponding jobs, rules and resources

    * beekeeper_extra_cmdline_options:  returns a string with command line options that you want to be passed to beekeeper.pl

When defining anything except the keys of default_options(), a call to $self->o('myoption') can be used.
This call means "substitute this call for the value of 'myoption' at the time of configuring the pipeline".
All option names mentioned in $self->o() calls within the interface methods above can be given non-default values from the command line.
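
For example, one option can be defined in terms of another (an illustrative snippet; 'work_dir' and 'output_dir' are hypothetical option names):

    'output_dir'    => $self->o('work_dir') . '/output',    # resolved when the pipeline is configured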

Please make sure you have studied the pipeline configuration examples in Bio::EnsEMBL::Hive::PipeConfig before creating your own PipeConfig modules.

=head1 CONTACT

  Please contact ehive-users@ebi.ac.uk mailing list with questions/suggestions.

=cut


package Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf;

use strict;
use warnings;

use Bio::EnsEMBL::ApiVersion ();

use Bio::EnsEMBL::Hive::Utils ('stringify');
use Bio::EnsEMBL::Hive::Utils::URL;
use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive::DBSQL::SqlSchemaAdaptor;
use Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor;
use Bio::EnsEMBL::Hive::Analysis;
use Bio::EnsEMBL::Hive::AnalysisStats;
use Bio::EnsEMBL::Hive::Extensions;
use Bio::EnsEMBL::Hive::Valley;

use base ('Bio::EnsEMBL::Hive::DependentOptions');


# ---------------------------[the following methods will be overridden by specific pipelines]-------------------------

=head2 default_options

    Description : Interface method that should return a hash of option_name->default_option_value pairs.
                  Please see existing PipeConfig modules for examples.

=cut

sub default_options {
    my ($self) = @_;
    return {
            # Please note: ENVironment variables may be "exported" to inherit from enclosing shell,
            # but if you want to *prevent* that you need to specifically say so
            #  (setting a password to empty string does exactly that - sets it to an empty string)
            #
            #   [bash]      export -n ENSEMBL_CVS_ROOT_DIR  # will stop exporting, but the value in current shell stays as it was
            #   [tcsh]      unsetenv ENSEMBL_CVS_ROOT_DIR   # will destroy the variable even in current shell, and stop exporting

        'ensembl_cvs_root_dir'  => $ENV{'ENSEMBL_CVS_ROOT_DIR'} || $self->o('ensembl_cvs_root_dir'),    # it will make sense to set this variable if you are going to use ehive with ensembl
        'ensembl_release'       => Bio::EnsEMBL::ApiVersion::software_version(),                        # snapshot of EnsEMBL Core API version. Please do not change if not sure.

        'hive_root_dir'         => $ENV{'EHIVE_ROOT_DIR'}                                               # this value is set up automatically if this code is run by init_pipeline.pl
                                    || $self->o('ensembl_cvs_root_dir').'/ensembl-hive',                # otherwise we have to rely on other means

        'hive_driver'           => 'mysql',
        'host'                  => $ENV{'EHIVE_HOST'} || 'localhost',                                   # BEWARE that 'localhost' for mysql driver usually means a UNIX socket, not a TCPIP socket!
                                                                                                        # If you need to connect to TCPIP socket, set  -host => '127.0.0.1' instead.

        'port'                  => $ENV{'EHIVE_PORT'},                                                  # or remain undef, which means default for the driver
        'user'                  => $ENV{'EHIVE_USER'} || 'ensadmin',
        'password'              => $ENV{'EHIVE_PASS'} // $ENV{'ENSADMIN_PSW'} // $self->o('password'),  # people will have to make an effort NOT to insert it into config files like .bashrc etc
        'dbowner'               => $ENV{'EHIVE_USER'} || $ENV{'USER'}         || $self->o('dbowner'),   # although it is very unlikely $ENV{USER} is not set
        'pipeline_name'         => $self->pipeline_name(),

        'hive_use_triggers'     => 0,                   # there have been a few cases of big pipelines misbehaving with triggers on, let's keep the default off.
        'hive_use_param_stack'  => 0,                   # do not reconstruct the calling stack of parameters by default (yet)
        'hive_force_init'       => 0,                   # setting it to 1 will drop the database prior to creation (use with care!)

        'pipeline_db'   => {
            -driver => $self->o('hive_driver'),
            -host   => $self->o('host'),
            -port   => $self->o('port'),
            -user   => $self->o('user'),
            -pass   => $self->o('password'),
            -dbname => $self->o('dbowner').'_'.$self->o('pipeline_name'),  # example of a linked definition (resolved via saturation)
        },
    };
}

=head2 pipeline_create_commands

    Description : Interface method that should return a list of command lines to be run in order to create and set up the pipeline database.
                  Please see existing PipeConfig modules for examples.

=cut

sub pipeline_create_commands {
    my $self    = shift @_;

    my $pipeline_url    = $self->pipeline_url();
    my $parsed_url      = Bio::EnsEMBL::Hive::Utils::URL::parse( $pipeline_url );
    my $driver          = $parsed_url ? $parsed_url->{'driver'} : '';

    return [
            $self->o('hive_force_init') ? $self->db_cmd('DROP DATABASE IF EXISTS') : (),
            $self->db_cmd('CREATE DATABASE'),

                # we got table definitions for all drivers:
            $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/tables.'.$driver,

                # auto-sync'ing triggers are off by default and not yet available in pgsql:
            $self->o('hive_use_triggers') && ($driver ne 'pgsql')  ? ( $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/triggers.'.$driver ) : (),

                # FOREIGN KEY constraints cannot be defined in sqlite separately from table definitions, so they are off there:
                                             ($driver ne 'sqlite') ? ( $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/foreign_keys.sql' ) : (),

                # we got procedure definitions for all drivers:
            $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/procedures.'.$driver,
    ];
}

=head2 pipeline_wide_parameters

    Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs.
                  The value doesn't have to be a scalar; it can be any Perl structure (it will be stringified and de-stringified automagically).
                  Please see existing PipeConfig modules for examples.
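
                  For instance, a hypothetical structured parameter could look like:

                      'blast_options'  => { 'evalue' => 1e-5, 'word_size' => 3 },    # stored stringified, rebuilt on retrieval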

=cut

sub pipeline_wide_parameters {
    my ($self) = @_;
    return {
        'schema_version' => $self->o('ensembl_release'),    # keep compatibility with core API
    };
}

=head2 resource_classes

    Description : Interface method that should return a hash of resource_class_name->resource_description_hash (numeric resource_class_id keys are the deprecated old style).
                  Please see existing PipeConfig modules for examples.
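
                  An illustrative (hypothetical) entry; a 2-element array is interpreted as [ submission_cmd_args, worker_cmd_args ]:

                      'highmem' => { 'LSF' => [ '-M4000 -R"rusage[mem=4000]"', '-life_span 1200' ] },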

=cut

sub resource_classes {
    my ($self) = @_;
    return {
## Old style:
#        1 => { -desc => 'default',  'LSF' => '' },
#        2 => { -desc => 'urgent',   'LSF' => '-q yesterday' },
## New style:
        'default' => { 'LSF' => '' },
        'urgent'  => { 'LSF' => '-q yesterday' },
    };
}

=head2 pipeline_analyses

    Description : Interface method that should return a list of hashes that define analyses bundled with corresponding jobs, dataflow and analysis_ctrl rules, and resource classes.
                  Please see existing PipeConfig modules for examples.
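
                  An illustrative (hypothetical) analysis definition, using the stock SystemCmd runnable:

                      {   -logic_name    => 'do_something',
                          -module        => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
                          -parameters    => { 'cmd' => 'echo hello' },
                          -input_ids     => [ {} ],                              # seed one job with no extra parameters
                          -flow_into     => { 1 => [ 'do_something_else' ] },    # autoflow into a (hypothetical) downstream analysis
                          -rc_name       => 'default',
                      },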

=cut

sub pipeline_analyses {
    my ($self) = @_;
    return [
    ];
}


=head2 beekeeper_extra_cmdline_options

    Description : Interface method that should return a string with extra parameters that you want to be passed to beekeeper.pl
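
                  For example, returning '-sleep 0.2' (an illustrative value) would make the beekeeper loop more frequently than the default of one scheduling step per minute.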

=cut

sub beekeeper_extra_cmdline_options {
    my ($self) = @_;

    return '';
}


# ---------------------------------[now comes the interfacing stuff - feel free to call but not to modify]--------------------


sub hive_meta_table {
    my ($self) = @_;

    return {
        'hive_sql_schema_version'   => Bio::EnsEMBL::Hive::DBSQL::SqlSchemaAdaptor->get_code_sql_schema_version(),
        'hive_pipeline_name'        => $self->o('pipeline_name'),
        'hive_use_param_stack'      => $self->o('hive_use_param_stack'),
    };
}

sub pre_options {
    my $self = shift @_;

    return {
        'help!' => '',
        'job_topup!' => '',
        'analysis_topup!' => '',
        'pipeline_url' => '',
#        'hive_use_triggers' => '',
    };
}

=head2 dbconn_2_mysql

    Description : Deprecated method. Please use $self->db_cmd() instead.

=cut

sub dbconn_2_mysql {    # will save you a lot of typing
    my ($self, $db_conn, $with_db) = @_;

    warn "\nDEPRECATED: dbconn_2_mysql() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    my $port = $self->o($db_conn,'-port');

    return '--host='.$self->o($db_conn,'-host').' '
          .($port ? '--port='.$self->o($db_conn,'-port').' ' : '')
          .'--user="'.$self->o($db_conn,'-user').'" '
          .'--pass="'.$self->o($db_conn,'-pass').'" '
          .($with_db ? ($self->o($db_conn,'-dbname').' ') : '');
}

=head2 dbconn_2_pgsql

    Description : Deprecated method. Please use $self->db_cmd() instead.

=cut

sub dbconn_2_pgsql {    # will save you a lot of typing
    my ($self, $db_conn, $with_db) = @_;

    warn "\nDEPRECATED: dbconn_2_pgsql() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    my $port = $self->o($db_conn,'-port');

    return '--host='.$self->o($db_conn,'-host').' '
          .($port ? '--port='.$self->o($db_conn,'-port').' ' : '')
          .'--username="'.$self->o($db_conn,'-user').'" '
          .($with_db ? ($self->o($db_conn,'-dbname').' ') : '');
}

=head2 db_connect_command

    Description : Deprecated method. Please use $self->db_cmd() instead.

=cut

sub db_connect_command {
    my ($self, $db_conn) = @_;

    warn "\nDEPRECATED: db_connect_command() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    my $driver = $self->o($db_conn, '-driver');

    return {
        'sqlite'    => 'sqlite3 '.$self->o($db_conn, '-dbname'),
        'mysql'     => 'mysql '.$self->dbconn_2_mysql($db_conn, 1),
        'pgsql'     => "env PGPASSWORD='".$self->o($db_conn,'-pass')."' psql ".$self->dbconn_2_pgsql($db_conn, 1),
    }->{ $driver };
}


=head2 db_execute_command

    Description : Deprecated method. Please use $self->db_cmd() instead.

=cut

sub db_execute_command {
    my ($self, $db_conn, $sql_command, $with_db) = @_;

    warn "\nDEPRECATED: db_execute_command() method is no longer supported, please call db_cmd(\$sql_command) instead, it will be more portable\n\n";

    $with_db = 1 unless(defined($with_db));

    my $driver = $self->o($db_conn, '-driver');

    if(($driver eq 'sqlite') && !$with_db) {    # in these special cases we pretend sqlite can understand these commands
        return "rm -f $1" if($sql_command=~/DROP\s+DATABASE\s+(?:IF\s+EXISTS\s+)?(\w+)/);
        return "touch $1" if($sql_command=~/CREATE\s+DATABASE\s+(\w+)/);
    } else {
        return {
            'sqlite'    => 'sqlite3 '.$self->o($db_conn, '-dbname')." '$sql_command'",
            'mysql'     => 'mysql '.$self->dbconn_2_mysql($db_conn, $with_db)." -e '$sql_command'",
            'pgsql'     => "env PGPASSWORD='".$self->o($db_conn,'-pass')."' psql --command='$sql_command' ".$self->dbconn_2_pgsql($db_conn, $with_db),
        }->{ $driver };
    }
}


=head2 dbconn_2_url

    Description :  A convenience method used to stringify a connection-parameters hash into a 'pipeline_url' that beekeeper.pl will understand
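
                   e.g. a mysql-based connection hash would yield something like  mysql://user:password@host:3306/dbname  (illustrative values)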

=cut

sub dbconn_2_url {
    my ($self, $db_conn, $with_db) = @_;

    $with_db = 1 unless(defined($with_db));

    my $driver = $self->o($db_conn, '-driver');
    my $port   = $self->o($db_conn,'-port');

    return (    ($driver eq 'sqlite')
            ? $driver.':///'
            : $driver.'://'.$self->o($db_conn,'-user').':'.$self->o($db_conn,'-pass').'@'.$self->o($db_conn,'-host').($port ? ':'.$port : '').'/'
           ) . ($with_db ? $self->o($db_conn,'-dbname') : '');
}

sub pipeline_url {
    my $self = shift @_;

    return $self->root()->{'pipeline_url'} || $self->dbconn_2_url('pipeline_db', 1); # used to force vivification of the whole 'pipeline_db' structure (used in run() )
}

=head2 db_cmd

    Description :  Returns a db_cmd.pl-based command line that should be executable with any supported driver (mysql/pgsql/sqlite)
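
    Example     :  $self->db_cmd('CREATE DATABASE') would produce something like:

                       <hive_root_dir>/scripts/db_cmd.pl -url <pipeline_url> -sql 'CREATE DATABASE'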

=cut

sub db_cmd {
    my ($self, $sql_command, $db_url) = @_;

    $db_url //= $self->pipeline_url();
    my $db_cmd_path = $self->o('hive_root_dir').'/scripts/db_cmd.pl';

    return "$db_cmd_path -url $db_url".($sql_command ? " -sql '$sql_command'" : '');
}


sub pipeline_name {
    my $self            = shift @_;
    my $pipeline_name   = shift @_;

    unless($pipeline_name) {    # or turn the ClassName into pipeline_name:
        $pipeline_name = ref($self);        # get the original class name
        $pipeline_name=~s/^.*:://;          # trim the leading classpath prefix
        $pipeline_name=~s/_conf$//;         # trim the optional _conf from the end
    }

    $pipeline_name=~s/([[:lower:]])([[:upper:]])/${1}_${2}/g;   # CamelCase into Camel_Case
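    # e.g. the class name 'Bio::EnsEMBL::Hive::PipeConfig::LongMult_conf' ends up as pipeline_name 'long_mult'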

    return lc($pipeline_name);
}


=head2 process_options

    Description : The method that does all the parameter parsing magic.
                  It makes two passes through the interface methods: the first pass collects the options, the second performs intelligent substitution.

    Caller      : init_pipeline.pl or any other script that will drive this module.

    Note        : You can override the command-line parsing by providing a hash as the argument to this method.
                  This hash should contain definitions of all the parameters you would otherwise be providing from the command line.
                  Useful if you are creating batches of hive pipelines using a script.

=cut

sub process_options {
    my $self = shift @_;

        # pre-patch definitely_used_options:
    $self->{'_extra_options'} = $self->load_cmdline_options( $self->pre_options() );
    $self->root()->{'pipeline_url'} = $self->{'_extra_options'}{'pipeline_url'};

    $self->use_cases( [ 'pipeline_create_commands', 'pipeline_wide_parameters', 'resource_classes', 'pipeline_analyses', 'beekeeper_extra_cmdline_options', 'pipeline_url', 'hive_meta_table' ] );
    return $self->SUPER::process_options();
}

=head2 run

    Description : The method that uses the Hive/EnsEMBL API to actually create all the analyses, jobs, dataflow and control rules and resource descriptions.

    Caller      : init_pipeline.pl or any other script that will drive this module.

=cut

sub run {
    my $self  = shift @_;
    my $analysis_topup  = $self->{'_extra_options'}{'analysis_topup'};
    my $job_topup       = $self->{'_extra_options'}{'job_topup'};
    my $pipeline_url    = $self->pipeline_url();
    my $pipeline_name   = $self->o('pipeline_name');

    unless($analysis_topup || $job_topup) {
        foreach my $cmd (@{$self->pipeline_create_commands}) {
            warn "Running the command:\n\t$cmd\n";
            if(my $retval = system($cmd)) {
                die "Return value = $retval, possibly an error\n";
            } else {
                warn "Done.\n\n";
            }
        }
    }

    Bio::EnsEMBL::Registry->no_version_check(1);
    my $hive_dba                     = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new( -url => $pipeline_url, -no_sql_schema_version_check => 1 );
    my $resource_class_adaptor       = $hive_dba->get_ResourceClassAdaptor;

    unless($job_topup) {
        my $meta_adaptor = $hive_dba->get_MetaAdaptor;      # the new adaptor for 'hive_meta' table
        warn "Loading hive_meta table ...\n";
        my $hive_meta_table = $self->hive_meta_table;
        while( my($meta_key, $meta_value) = each %$hive_meta_table ) {
            $meta_adaptor->store_pair( $meta_key, $meta_value );
        }

        my $meta_container = $hive_dba->get_MetaContainer;  # adaptor over core's 'meta' table for compatibility with core API
        warn "Loading pipeline-wide parameters ...\n";

        my $pipeline_wide_parameters = $self->pipeline_wide_parameters;
        while( my($meta_key, $meta_value) = each %$pipeline_wide_parameters ) {
            if($analysis_topup) {
                $meta_container->remove_all_by_meta_key($meta_key);
            }
            $meta_container->store_pair($meta_key, $meta_value);
        }
        warn "Done.\n\n";

            # pre-load resource_class and resource_description tables:
        my $resource_description_adaptor    = $hive_dba->get_ResourceDescriptionAdaptor;
        warn "Loading the Resources ...\n";

        my $resource_classes_hash = $self->resource_classes;
        my @resource_classes_order = sort { ($b eq 'default') or -($a eq 'default') or ($a cmp $b) } keys %$resource_classes_hash; # put 'default' to the front
        my %seen_resource_name = ();
        foreach my $rc_id (@resource_classes_order) {
            my $mt2param = $resource_classes_hash->{$rc_id};

            my $rc_name = delete $mt2param->{-desc};
            if($rc_id!~/^\d+$/) {
                $rc_name  = $rc_id;
                $rc_id = undef;
            }

            if(!$rc_name or $seen_resource_name{lc($rc_name)}++) {
                die "Every resource has to have a unique description, please fix the PipeConfig file";
            }

            my ($rc, $rc_newly_created) = $resource_class_adaptor->create_new(
                defined($rc_id) ? (-DBID   => $rc_id) : (),
                -NAME   => $rc_name,
                1   # check whether this ResourceClass was already present in the database
            );
            $rc_id = $rc->dbID();

            if($rc_newly_created) {
                warn "Creating resource_class $rc_name($rc_id).\n";
            } else {
                warn "Attempt to re-create and potentially redefine resource_class $rc_name($rc_id). NB: This may affect already created analyses!\n";
            }

            while( my($meadow_type, $resource_param_list) = each %$mt2param ) {
                $resource_param_list = [ $resource_param_list ] unless(ref($resource_param_list));  # expecting either a scalar or a 2-element array

                $resource_description_adaptor->create_new(
                    -resource_class_id      => $rc_id,
                    -meadow_type            => $meadow_type,
                    -submission_cmd_args    => $resource_param_list->[0],
                    -worker_cmd_args        => $resource_param_list->[1],
                );
            }
        }
        unless(my $default_rc = $resource_class_adaptor->fetch_by_name('default')) {
            warn "\tNB:'default' resource class is not in the database (did you forget to inherit from SUPER::resource_classes ?) - creating it for you\n";
            $resource_class_adaptor->create_new(-NAME => 'default');
        }
        warn "Done.\n\n";
    }

    my $analysis_adaptor             = $hive_dba->get_AnalysisAdaptor;
    my $analysis_stats_adaptor       = $hive_dba->get_AnalysisStatsAdaptor;

    my $valley = Bio::EnsEMBL::Hive::Valley->new( {}, 'LOCAL' );

    my %seen_logic_name = ();

    foreach my $aha (@{$self->pipeline_analyses}) {
        my ($logic_name, $module, $parameters_hash, $input_ids, $blocked, $batch_size, $hive_capacity, $failed_job_tolerance,
                $max_retry_count, $can_be_empty, $rc_id, $rc_name, $priority, $meadow_type, $analysis_capacity)
         = @{$aha}{qw(-logic_name -module -parameters -input_ids -blocked -batch_size -hive_capacity -failed_job_tolerance
                 -max_retry_count -can_be_empty -rc_id -rc_name -priority -meadow_type -analysis_capacity)};   # slicing a hash reference

        unless($logic_name) {
            die "logic_name' must be defined in every analysis";
530 531
        }

        if($seen_logic_name{$logic_name}++) {
            die "an entry with logic_name '$logic_name' appears at least twice in the configuration file, can't continue";
        }

        my $analysis = $analysis_adaptor->fetch_by_logic_name($logic_name);
        if( $analysis ) {

            if($analysis_topup) {
                warn "Skipping creation of already existing analysis '$logic_name'.\n";
                next;
            }

        } else {

            if($job_topup) {
                die "Could not fetch analysis '$logic_name'";
            }

            warn "Creating analysis '$logic_name'.\n";

            if($rc_id) {
                warn "(-rc_id => $rc_id) syntax is deprecated, please start using (-rc_name => 'your_resource_class_name')";
            } else {
                $rc_name ||= 'default';
                my $rc = $resource_class_adaptor->fetch_by_name($rc_name ) or die "Could not fetch resource with name '$rc_name', please check that the resource_classes() method of your PipeConfig either contains it or inherits from the parent class";
                $rc_id = $rc->dbID();
            }

            if ($meadow_type and not exists $valley->available_meadow_hash()->{$meadow_type}) {
                die "The meadow '$meadow_type' is currently not registered (analysis '$logic_name')\n";
            }

            $parameters_hash ||= {};    # in case nothing was given
            die "'-parameters' has to be a hash" unless(ref($parameters_hash) eq 'HASH');

            $analysis = Bio::EnsEMBL::Hive::Analysis->new(
                -logic_name             => $logic_name,
                -module                 => $module,
                -parameters             => stringify($parameters_hash),    # have to stringify it here, because Analysis code is external wrt Hive code
                -resource_class_id      => $rc_id,
                -failed_job_tolerance   => $failed_job_tolerance,
                -max_retry_count        => $max_retry_count,
                -can_be_empty           => $can_be_empty,
                -priority               => $priority,
                -meadow_type            => $meadow_type,
                -analysis_capacity      => $analysis_capacity,
            );
            $analysis->get_compiled_module_name();  # check if it compiles and is named correctly
            $analysis_adaptor->store($analysis);

            my $stats = Bio::EnsEMBL::Hive::AnalysisStats->new(
                -analysis_id            => $analysis->dbID,
                -batch_size             => $batch_size,
                -hive_capacity          => $hive_capacity,
                -status                 => $blocked ? 'BLOCKED' : 'EMPTY',  # be careful, as this "soft" way of blocking may be accidentally unblocked by deep sync
            );
            $analysis_stats_adaptor->store($stats);
        }

            # now create the corresponding jobs (if there are any):
        foreach my $input_id_hash (@{$input_ids || []}) {

            Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor->CreateNewJob(
                -input_id       => $input_id_hash,  # input_ids are now centrally stringified in the AnalysisJobAdaptor
                -analysis       => $analysis,
                -prev_job_id    => undef, # these jobs are created by the initialization script, not by another job
            );
        }
    }

    unless($job_topup) {

            # Now, run separately through the already created analyses and link them together:
            #
        my $ctrl_rule_adaptor            = $hive_dba->get_AnalysisCtrlRuleAdaptor;
        my $dataflow_rule_adaptor        = $hive_dba->get_DataflowRuleAdaptor;

        foreach my $aha (@{$self->pipeline_analyses}) {
            my ($logic_name, $wait_for, $flow_into)
                 = @{$aha}{qw(-logic_name -wait_for -flow_into)};   # slicing a hash reference

            my $analysis = $analysis_adaptor->fetch_by_logic_name($logic_name);

            $wait_for ||= [];
            $wait_for   = [ $wait_for ] unless(ref($wait_for) eq 'ARRAY'); # force scalar into an arrayref

                # create control rules:
            foreach my $condition_url (@$wait_for) {
                unless ($condition_url =~ m{^\w*://}) {
                    my $condition_analysis = $analysis_adaptor->fetch_by_logic_name($condition_url);
                    die "Could not fetch analysis '$condition_url' to create a control rule (in '".($analysis->logic_name)."')\n" unless defined $condition_analysis;
                }
                my $c_rule = Bio::EnsEMBL::Hive::AnalysisCtrlRule->new(
                        -condition_analysis_url => $condition_url,
                        -ctrled_analysis_id     => $analysis->dbID,
                );
                $ctrl_rule_adaptor->store( $c_rule, 1 );

                warn $c_rule->toString."\n";
            }

            $flow_into ||= {};
            $flow_into   = { 1 => $flow_into } unless(ref($flow_into) eq 'HASH'); # force non-hash into a hash

            my %group_tag_to_funnel_dataflow_rule_id = ();

            my $semaphore_sign = '->';

            my @all_branch_tags = keys %$flow_into;
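                # process funnel tags (e.g. 'A->1') first, so that each fan tag (e.g. '2->A') can later look up its funnel's dbID; plain branch tags come last: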
            foreach my $branch_tag ((grep {/^[A-Z]$semaphore_sign/} @all_branch_tags), (grep {/$semaphore_sign[A-Z]$/} @all_branch_tags), (grep {!/$semaphore_sign/} @all_branch_tags)) {

                my ($branch_name_or_code, $group_role, $group_tag);

                if($branch_tag=~/^([A-Z])$semaphore_sign(-?\w+)$/) {
                    ($branch_name_or_code, $group_role, $group_tag) = ($2, 'funnel', $1);
                } elsif($branch_tag=~/^(-?\w+)$semaphore_sign([A-Z])$/) {
                    ($branch_name_or_code, $group_role, $group_tag) = ($1, 'fan', $2);
                } elsif($branch_tag=~/^(-?\w+)$/) {
                    ($branch_name_or_code, $group_role, $group_tag) = ($1, '');
                } elsif($branch_tag=~/:/) {
                    die "Please use newer '2${semaphore_sign}A' and 'A${semaphore_sign}1' notation instead of '2:1' and '1'\n";
                } else {
                    die "Error parsing the group tag '$branch_tag'\n";
                }

                my $funnel_dataflow_rule_id = undef;    # NULL by default

                if($group_role eq 'fan') {
                    unless($funnel_dataflow_rule_id = $group_tag_to_funnel_dataflow_rule_id{$group_tag}) {
                        die "No funnel dataflow_rule defined for group '$group_tag'\n";
                    }
                }

                my $heirs = $flow_into->{$branch_tag};
                $heirs = [ $heirs ] unless(ref($heirs)); # force scalar into an arrayref first
                $heirs = { map { ($_ => undef) } @$heirs } if(ref($heirs) eq 'ARRAY'); # now force it into a hash if it wasn't

                while(my ($heir_url, $input_id_template_list) = each %$heirs) {

                    unless ($heir_url =~ m{^\w*://}) {
                        my $heir_analysis = $analysis_adaptor->fetch_by_logic_name($heir_url);
                        die "No analysis named '$heir_url' (dataflow from analysis '".($analysis->logic_name)."')\n" unless defined $heir_analysis;
                    }

                    $input_id_template_list = [ $input_id_template_list ] unless(ref($input_id_template_list) eq 'ARRAY');  # allow for more than one template per analysis

                    foreach my $input_id_template (@$input_id_template_list) {

                        my $df_rule = Bio::EnsEMBL::Hive::DataflowRule->new(
                            -from_analysis              => $analysis,
                            -to_analysis_url            => $heir_url,
                            -branch_code                => $dataflow_rule_adaptor->branch_name_2_code( $branch_name_or_code ),
                            -input_id_template          => $input_id_template,
                            -funnel_dataflow_rule_id    => $funnel_dataflow_rule_id,
                        );
                        $dataflow_rule_adaptor->store( $df_rule, 1 );

                        warn $df_rule->toString."\n";

                        if($group_role eq 'funnel') {
                            if($group_tag_to_funnel_dataflow_rule_id{$group_tag}) {
                                die "More than one funnel dataflow_rule defined for group '$group_tag'\n";
                            } else {
                                $group_tag_to_funnel_dataflow_rule_id{$group_tag} = $df_rule->dbID();
                            }
                        }
                    } # /for all templates
                } # /for all heirs
            } # /for all branch_tags
        }
    }

    print "\n\n# --------------------[Useful commands]--------------------------\n";
    print "\n";
    print " # It is convenient to store the pipeline url in a variable:\n";
    print "\texport EHIVE_URL=$pipeline_url\t\t\t# bash version\n";
    print "(OR)\n";
    print "\tsetenv EHIVE_URL $pipeline_url\t\t\t# [t]csh version\n";
    print "\n";
    print " # Add a new job to the pipeline (usually done once before running, but pipeline can be \"topped-up\" at any time) :\n";
    print "\tseed_pipeline.pl -url $pipeline_url -logic_name <analysis_name> -input_id <param_hash>\n";
    print "\n";
    print " # Synchronize the Hive (should be done before [re]starting a pipeline) :\n";
    print "\tbeekeeper.pl -url $pipeline_url -sync\n";
    print "\n";
    print " # Run the pipeline (can be interrupted and restarted) :\n";
    print "\tbeekeeper.pl -url $pipeline_url ".$self->beekeeper_extra_cmdline_options()." -loop\t\t# run in looped automatic mode (a scheduling step performed every minute)\n";
    print "(OR)\n";
    print "\tbeekeeper.pl -url $pipeline_url ".$self->beekeeper_extra_cmdline_options()." -run \t\t# run one scheduling step of the pipeline and exit (useful for debugging/learning)\n";
    print "(OR)\n";
    print "\trunWorker.pl -url $pipeline_url ".$self->beekeeper_extra_cmdline_options()."      \t\t# run exactly one Worker locally (useful for debugging/learning)\n";
    print "\n";
    print " # At any moment during or after execution you can request a pipeline diagram in an image file (desired format is set via extension) :\n";
    print "\tgenerate_graph.pl -url $pipeline_url -out $pipeline_name.png\n";
    print "\n";
    print " # Peek into your pipeline database with a database client (useful to have open while the pipeline is running) :\n";
    print "\tdb_cmd.pl -url $pipeline_url\n\n";
}

1;