SystemCmd.pm 8.79 KB
Newer Older
1 2 3 4
=pod 

=head1 NAME

5
    Bio::EnsEMBL::Hive::RunnableDB::SystemCmd
6

Leo Gordon's avatar
Leo Gordon committed
7 8
=head1 SYNOPSIS

9
    standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::SystemCmd --cmd 'ls -1 ${ENSEMBL_CVS_ROOT_DIR}/ensembl-hive/modules/Bio/EnsEMBL/Hive/RunnableDB/*.pm >building_blocks.list'
Leo Gordon's avatar
Leo Gordon committed
10

11 12
=head1 DESCRIPTION

13
    This RunnableDB module acts as a wrapper for shell-level command lines. If you behave you may also use parameter substitution.
14

15 16
    The command line must be stored in the parameters() as the value corresponding to the 'cmd' key.
    Other parameters may be passed in as well and used through the parameter substitution mechanism in its full glory.
17

18 19 20 21 22 23 24 25 26 27 28 29 30
    This Runnable also allows the creation of dataflow using JSON stored in an external file.
    Each line of this file contains an optional branch number, followed by a complete JSON serialisation of the parameters (output_id)
    appearing on the same single line. For example, a line to direct dataflow on branch 2 might look like:

          2 {"parameter_name" : "parameter_value"}

    If no branch number is provided, then dataflow of those parameters will occur on the branch number
    passed to SystemCmd in the 'dataflow_branch' parameter, if given. Otherwise, it will default to
    branch 1 (autoflow).

    A sample file is provided at ${EHIVE_ROOT_DIR}/modules/Bio/EnsEMBL/Hive/Examples/SystemCmd/PipeConfig/sample_files/Inject_JSON_Dataflow_example.json

=head1 CONFIGURATION EXAMPLES
31 32 33 34 35 36 37 38 39 40 41

    # The following example shows how to configure SystemCmd in a PipeConfig module
    # to create a MySQL snapshot of the Hive database before executing a critical operation.
    #
    # It is a useful incantation when debugging pipelines, similar to setting a breakpoint/savepoint.
    # You will be able to reset your pipeline to the saved point by un-dumping this file.

        {   -logic_name => 'db_snapshot_before_critical_A',
            -module     => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
            -parameters => {
                'filename'  => $ENV{'HOME'}.'/db_snapshot_before_critical_A',
42
                'cmd'       => $self->db_cmd().' --executable mysqldump > #filename#',
43 44
            },
        },
45

46 47 48 49 50 51 52 53 54 55 56
    # The following example shows how to configure SystemCmd in a PipeConfig module
    # to generate dataflow events based on parameters stored as JSON in a file named "some_parameters.json"

        {  -logic_name => 'inject_parameters_from_file',
           -module     => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
           -parameters => {
                'dataflow_file' => 'some_parameters.json',
                'cmd'           => 'sleep 0', # a command must be provided in the cmd parameter
           },
        },

57 58
=head1 LICENSE

59
    Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
nwillhoft's avatar
nwillhoft committed
60
    Copyright [2016-2021] EMBL-European Bioinformatics Institute
61 62 63 64 65 66 67 68 69 70

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License
    is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and limitations under the License.

71
=head1 CONTACT
72

73
    Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates
74 75 76

=cut

77

78 79 80
package Bio::EnsEMBL::Hive::RunnableDB::SystemCmd;

use strict;
81
use warnings;
82

83
use base ('Bio::EnsEMBL::Hive::Process');
84

Leo Gordon's avatar
Leo Gordon committed
85

86 87 88
# Default values for the parameters this Runnable understands.
# A new hash (with a new, empty return-code map) is built on every call.
sub param_defaults {
    my %defaults = (
        # Maps some of the command's return codes to dataflow branch numbers.
        'return_codes_2_branches' => {},

        # Boolean: when true the command is run via "bash -o pipefail -c $cmd",
        # so that a failure inside a pipe is not hidden.
        'use_bash_pipefail'       => 0,

        # For commands chained with semi-colons: run with "bash -o errexit",
        # so that the first failure interrupts the whole script.
        'use_bash_errexit'        => 0,

        # Path to a file holding one dataflow event per line, as a JSON object.
        'dataflow_file'           => undef,

        # Branch used for JSON dataflows that do not name their own branch.
        'dataflow_branch'         => undef,

        # Maximum runtime of the command.
        'timeout'                 => undef,
    );

    return \%defaults;
}


Leo Gordon's avatar
Leo Gordon committed
98 99 100 101 102
=head2 run

    Description : Implements run() interface method of Bio::EnsEMBL::Hive::Process that is used to perform the main bulk of the job (minus input and output).
                  Here it actually runs the command line.

103 104 105 106 107
    param('cmd'): The recommended way of passing in the command line. It can be either a string, or an array-ref of strings. The latter is safer if some of the
                  arguments contain whitespace.

    param('*'):   Any other parameters can be freely used for parameter substitution.

Leo Gordon's avatar
Leo Gordon committed
108 109
=cut

110 111
# Runs the command held in the required 'cmd' parameter through
# run_system_command() and stores everything that call returns as job
# parameters, so that write_output() can act on them.
sub run {
    my ($self) = @_;

    # Execution options forwarded as-is to run_system_command().
    my @forwarded_names     = qw(use_bash_pipefail use_bash_errexit timeout);
    my %transferred_options = map { ($_ => $self->param($_)) } @forwarded_names;

    my $cmd = $self->param_required('cmd');

    # run_system_command() returns a list; capture it positionally by name.
    my %outcome;
    @outcome{qw(return_value stderr flat_cmd stdout runtime_msec)}
        = $self->run_system_command($cmd, \%transferred_options);

    # To be used in write_output()
    $self->param($_, $outcome{$_}) for qw(return_value stderr flat_cmd stdout);

    # Kept as the final expression so the sub's return value stays what it was.
    return $self->param('runtime_msec', $outcome{'runtime_msec'});
}

Leo Gordon's avatar
Leo Gordon committed
124

Leo Gordon's avatar
Leo Gordon committed
125 126 127
=head2 write_output

    Description : Implements write_output() interface method of Bio::EnsEMBL::Hive::Process that is used to deal with job's output after the execution.
128
                  Here we take actions based on the command's exit status.
129

Leo Gordon's avatar
Leo Gordon committed
130 131 132
=cut

# Acts on the outcome that run() stored in the 'return_value', 'stderr' and
# 'flat_cmd' parameters.
#
#   * false return value : success. If 'dataflow_file' is set, its content is
#                          flowed via dataflow_output_ids_from_json() using
#                          'dataflow_branch'; then returns.
#   * -1                 : the command could not be started; dies.
#   * -2                 : the command exceeded the allowed runtime; tries
#                          branch -2, dies if that branch is not connected.
#   * low 8 bits set     : the process was killed; waits 30 seconds, then dies.
#   * anything else      : regular non-zero exit code (upper 8 bits). Flows to
#                          the branch mapped in 'return_codes_2_branches' if
#                          any, tries branch -1 on a Java heap-space error,
#                          and dies otherwise.
#
# NOTE(review): the code relies on complete_early() not returning to its
# caller (nothing follows it in the return-code mapping case) - confirm
# against Bio::EnsEMBL::Hive::Process.
sub write_output {
    my $self = shift;

    my $return_value = $self->param('return_value');

    ## Success
    unless ($return_value) {
        # FIXME branch number
        $self->dataflow_output_ids_from_json($self->param('dataflow_file'), $self->param('dataflow_branch')) if $self->param('dataflow_file');
        return;
    }

    ## Error processing
    my $stderr = $self->param('stderr');
    my $flat_cmd = $self->param('flat_cmd');

    if ($return_value == -1) {
        # system() could not start, or wait() failed
        die sprintf( "Could not start '%s': %s\n", $flat_cmd, $stderr);

    } elsif ($return_value == -2) {
        $self->complete_early_if_branch_connected("The command was aborted because it exceeded the allowed runtime. Flowing to the -2 branch.\n", -2);
        die "The command was aborted because it exceeded the allowed runtime, but there are no dataflow-rules on branch -2.\n";

    # Lower 8 bits indicate the process has been killed and did not complete.
    } elsif ($return_value & 255) {
        # It can happen because of a MEMLIMIT / RUNLIMIT, which we
        # know are not atomic. The best is to wait a bit that LSF kills
        # the worker too
        sleep 30;
        # If we reach this point, it was killed for another reason.
        die sprintf( "'%s' was killed with code=%d\nstderr is: %s\n", $flat_cmd, $return_value, $stderr);

    } else {
        # "Normal" process exit with a non-zero code (in the upper 8 bits)
        $return_value >>= 8;

        # We create a dataflow event depending on the exit code of the process.
        if (ref($self->param('return_codes_2_branches')) and exists $self->param('return_codes_2_branches')->{$return_value}) {
            my $branch_number = $self->param('return_codes_2_branches')->{$return_value};
            $self->complete_early(sprintf("The command exited with code %d, which is mapped to a dataflow on branch #%d.\n", $return_value, $branch_number), $branch_number);
        }

        # The JVM prints the first stack frame on the line after the message
        # ("...Java heap space\n\tat ..."), so any whitespace must be accepted
        # before "at"; a single literal space only matches flattened output.
        # The dots of the class name are escaped so they match literally.
        if ($stderr =~ /Exception in thread ".*" java\.lang\.OutOfMemoryError: Java heap space\s+at/) {
            $self->complete_early_if_branch_connected("Java heap space is out of memory. A job has been dataflown to the -1 branch.\n", -1);
            die $stderr;
        }

        die sprintf( "'%s' resulted in an error code=%d\nstderr is: %s\n", $flat_cmd, $return_value, $stderr);
    }
}

184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207

######################
## Internal methods ##
######################

=head2 complete_early_if_branch_connected

  Arg[1]      : (string) message
  Arg[2]      : (integer) branch number
  Description : Wrapper around complete_early that first checks that the
                branch is connected to something.
  Returntype  : void if the branch is not connected. Otherwise doesn't return

=cut

# Calls complete_early($message, $branch_code) only when the current
# analysis has something registered for $branch_code in
# dataflow_rules_by_branch; otherwise returns nothing so that the caller can
# fall back to its own error handling.
sub complete_early_if_branch_connected {
    my $self = shift;
    my ($message, $branch_code) = @_;

    my $rules_by_branch = $self->input_job->analysis->dataflow_rules_by_branch;

    if ($rules_by_branch->{$branch_code}) {
        return $self->complete_early($message, $branch_code);
    }

    # Nothing is connected to this branch: hand control back to the caller.
    return;
}

208
1;