=pod

=head1 NAME

    Bio::EnsEMBL::Hive::Meadow::LSF

=head1 DESCRIPTION

    This is the 'LSF' implementation of Meadow

=head1 LICENSE

    Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
    Copyright [2016-2018] EMBL-European Bioinformatics Institute

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License
    is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and limitations under the License.

=head1 CONTACT

    Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates

=cut


package Bio::EnsEMBL::Hive::Meadow::LSF;

use strict;
use warnings;

use Bio::EnsEMBL::Hive::Utils ('split_for_bash');

use base ('Bio::EnsEMBL::Hive::Meadow');
our $VERSION = '1.0';       # Semantic version of the Meadow interface:
                            #   change the Major version whenever an incompatible change is introduced,
                            #   change the Minor version whenever the interface is extended, but compatibility is retained.


# Returns the LSF cluster name as reported by `lsid`, or nothing (undef in
# scalar context) if it cannot be established.
# Also called to check for availability: LSF is assumed to be available if the
# cluster name can be established.
sub name {
    my $mcni = 'My cluster name is';
    my $cmd = "lsid 2>/dev/null | grep '$mcni'";

#    warn "LSF::name() running cmd:\n\t$cmd\n";

    my $lsid_output = `$cmd`;

        # Only trust the capture if the match itself succeeded, otherwise $1 would
        # still hold whatever an earlier, unrelated match had left in it:
    if(defined($lsid_output) and $lsid_output=~/^\Q$mcni\E\s+(\S+)/) {
        return $1;
    }

    return;     # lsid is not available or its output could not be parsed
}

# Works out the process_id of the current worker from the LSB_JOBID and
# LSB_JOBINDEX environment variables.
# Returns "JOBID[INDEX]" when the index is positive and plain "JOBID" otherwise;
# dies if either of the two variables is not defined.
sub get_current_worker_process_id {
    my ($self) = @_;

    my ($job_id, $job_index) = @ENV{ 'LSB_JOBID', 'LSB_JOBINDEX' };

    unless(defined($job_id) and defined($job_index)) {
        die "Could not establish the process_id";
    }

        # a non-positive index is reported without the square-bracketed suffix:
    return ($job_index > 0)
        ? $job_id . '[' . $job_index . ']'
        : $job_id;
}

# Counts the current user's pending workers whose job names start with this
# meadow's job_name_prefix(), broken down by the part of the job name that
# sits between the prefix and the trailing "-<digits>" (optionally followed by
# an "[index]") -- presumably the resource class name, as the sub's name suggests.
# Returns a two-element list: (hashref { rc_name => count }, total_count).
sub count_pending_workers_by_rc_name {
    my ($self) = @_;

    my $jnp = $self->job_name_prefix();
    my $cmd = "bjobs -w -J '${jnp}*' 2>/dev/null";  # "-u all" has been removed to ensure one user's PEND processes
                                                    #   do not affect another user helping to run the same pipeline.

#    warn "LSF::count_pending_workers_by_rc_name() running cmd:\n\t$cmd\n";

    my %pending_this_meadow_by_rc_name = ();
    my $total_pending_this_meadow = 0;

    foreach my $line (qx/$cmd/) {
            # Look at the status column itself (3rd in bjobs' output) rather than grepping the whole line,
            # so that "PEND" appearing in a user, queue or job name cannot produce a false hit:
        my $status = (split(' ', $line))[2];
        next unless(defined($status) and $status eq 'PEND');

        if($line=~/\b\Q$jnp\E(\S+)\-\d+(?:\[\d+\])?\b/) {
            $pending_this_meadow_by_rc_name{$1}++;
            $total_pending_this_meadow++;
        }
    }

    return (\%pending_this_meadow_by_rc_name, $total_pending_this_meadow);
}

# Counts the jobs of all users whose names start with this meadow's
# job_name_prefix() and whose bjobs status is RUN.
# Returns the count as a number (0 if bjobs produced no output).
sub count_running_workers {
    my ($self) = @_;

    my $jnp = $self->job_name_prefix();
    my $cmd = "bjobs -w -J '${jnp}*' -u all 2>/dev/null";

#    warn "LSF::count_running_workers() running cmd:\n\t$cmd\n";

    my $run_count = 0;

    foreach my $line (qx/$cmd/) {
            # Compare the status column (3rd in bjobs' output) instead of grepping the whole line for "RUN",
            # which would also count e.g. a pending job whose name happens to contain "RUN":
        my $status = (split(' ', $line))[2];
        $run_count++ if(defined($status) and $status eq 'RUN');
    }

    return $run_count;
}


# Queries bjobs for the jobs of all users whose names start with this meadow's
# job_name_prefix() and returns a hashref { worker_process_id => status }.
# The header line and jobs in DONE or EXIT state are left out.
sub status_of_all_our_workers { # returns a hashref
    my ($self) = @_;

    my $jnp = $self->job_name_prefix();
    my $cmd = "bjobs -w -J '${jnp}*' -u all 2>/dev/null";

#    warn "LSF::status_of_all_our_workers() running cmd:\n\t$cmd\n";

    my %status_hash = ();
    foreach my $line (`$cmd`) {
            # Columns: JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME ...
            # split(' ') skips leading whitespace, so the columns cannot get shifted by indentation:
        my ($group_pid, $status, $job_name) = (split(' ', $line))[0, 2, 6];

        next unless(defined($group_pid) and defined($status));  # blank or truncated line
        next if(($group_pid eq 'JOBID') or ($status eq 'DONE') or ($status eq 'EXIT'));

        my $worker_pid = $group_pid;
        if(defined($job_name) and $job_name=~/(\[\d+\])$/ and $worker_pid!~/\[\d+\]$/) {   # account for the difference in LSF 9.1.1.1 vs LSF 9.1.2.0  bjobs' output
            $worker_pid .= $1;
        }

        $status_hash{$worker_pid} = $status;
    }
    return \%status_hash;
}

# Asks bjobs about the given worker's process_id, restricted to the current user.
# Returns whatever is left of bjobs' output after dropping the lines that
# contain 'not found', 'JOBID' or 'EXIT'; a non-empty (true) result means the
# job is known to LSF, belongs to this user and has not exited.
sub check_worker_is_alive_and_mine {
    my ($self, $worker) = @_;

    my $wpid = $worker->process_id();

        # fall back on the passwd entry of the real uid when $USER is not set in the environment:
    my $this_user = $ENV{'USER'};
    $this_user = getpwuid($<) unless(defined($this_user) and length($this_user));

        # Quote the arguments: an array job id such as 123[4] is a valid shell glob pattern
        # and would otherwise be expanded if a matching file existed in the current directory.
    my $cmd = qq{bjobs '$wpid' -u '$this_user' 2>&1 | grep -v 'not found' | grep -v JOBID | grep -v EXIT};

#    warn "LSF::check_worker_is_alive_and_mine() running cmd:\n\t$cmd\n";

    my $is_alive_and_mine = qx/$cmd/;
    return $is_alive_and_mine;
}

# Runs `bkill` on the process_id of the given worker (taken from the last
# argument, so the sub works both as a method and as a plain function).
# Returns the value returned by system().
sub kill_worker {
    my $worker = pop @_;

        # List form of system() bypasses the shell, so an array job id such as 123[4]
        # is handed to bkill verbatim instead of being treated as a glob pattern:
    my @cmd = ('bkill', $worker->process_id());

#    warn "LSF::kill_worker() running cmd:\n\t".join(' ', @cmd)."\n";

    return system(@cmd);
}

# Looks up in `bacct -l` why the given jobs were terminated.
# Takes a list of process_ids and returns a hashref { process_id => cause },
# where the cause is the Hive name mapped from LSF's TERM_* reason.
# A job killed for a reason missing from the mapping still gets an entry,
# with an undefined value; jobs for which no "TERM_*: job killed" record
# is found get no entry at all.
sub find_out_causes {
    my ($self, @pids) = @_;

    my %lsf_2_hive = (
        'TERM_MEMLIMIT'     => 'MEMLIMIT',
        'TERM_RUNLIMIT'     => 'RUNLIMIT',
        'TERM_OWNER'        => 'KILLED_BY_USER',    # bkill     (wait until it dies)
        'TERM_FORCE_OWNER'  => 'KILLED_BY_USER',    # bkill -r  (quick remove)
    );

    my %cod = ();

    while (my @pid_batch = splice(@pids, 0, 20)) {  # can't fit too many pids on one shell cmdline
        my $cmd = 'bacct -l ' . join(' ', map { "'$_'" } @pid_batch);

#        warn "LSF::find_out_causes() running cmd:\n\t$cmd\n";

        my $bacct_output = `$cmd`;
        next unless(defined($bacct_output));    # the command could not be run at all

            # bacct separates the per-job records by long lines of dashes:
        foreach my $section (split(/\-{10,}\s+/, $bacct_output)) {
            if($section=~/^Job <(\d+(?:\[\d+\])?)>.+(TERM_\w+): job killed/is) {
                $cod{$1} = $lsf_2_hive{$2};
            }
        }
    }

    return \%cod;
}

# Submits worker job(s) to LSF via bsub.
#
#   $worker_cmd                         the command line to be run by LSF (passed to bsub as its last argument)
#   $required_worker_count, $iteration, $rc_name
#                                       passed on to generate_job_name() to obtain the value for bsub's -J
#   $rc_specific_submission_cmd_args    extra bsub arguments as one string, split up by split_for_bash()
#   $submit_stdout_file, $submit_stderr_file
#                                       values for bsub's -o and -e; '/dev/null' is used when they are empty
#
# Further bsub arguments are taken from the 'SubmissionOptions' config entry.
# Dies if bsub cannot be run or finishes with a non-zero status.
sub submit_workers {
    my ($self, $worker_cmd, $required_worker_count, $iteration, $rc_name, $rc_specific_submission_cmd_args, $submit_stdout_file, $submit_stderr_file) = @_;

    my $job_name                            = $self->generate_job_name($required_worker_count, $iteration, $rc_name);
    my $meadow_specific_submission_cmd_args = $self->config_get('SubmissionOptions');

    $submit_stdout_file ||= '/dev/null';    # a value is required
    $submit_stderr_file ||= '/dev/null';    # a value is required

    $ENV{'LSB_STDOUT_DIRECT'} = 'y';  # unbuffer the output of the bsub command

        # The command is kept as a list, so that system() runs bsub directly, without an intermediate shell:
    my @cmd = ('bsub',
        '-o', $submit_stdout_file,
        '-e', $submit_stderr_file,
        '-J', $job_name,
        split_for_bash($rc_specific_submission_cmd_args),
        split_for_bash($meadow_specific_submission_cmd_args),
        $worker_cmd
    );

    warn "LSF::submit_workers() running cmd:\n\t".join(' ', @cmd)."\n";

    system( @cmd ) && die "Could not submit job(s): $!, $?";  # let's abort the beekeeper and let the user check the syntax
}

1;