LSF.pm 4.19 KB
Newer Older
1 2 3 4 5 6
# This is the 'LSF' implementation of Meadow

package Bio::EnsEMBL::Hive::Meadow::LSF;

use strict;

Leo Gordon's avatar
Leo Gordon committed
7
use base ('Bio::EnsEMBL::Hive::Meadow');
8

9

10
# Returns the name of the LSF cluster as printed by `lsid`, or nothing (undef in
# scalar context, an empty list in list context) when it cannot be established.
# Also called to check for availability: LSF is assumed to be available if the
# cluster name can be established.
sub name {
    my $mcni = 'My cluster name is';

    # stderr is discarded, so a missing 'lsid' simply produces no output
    my $lsid_output = `lsid 2>/dev/null | grep '$mcni' `;

    # Only trust $1 after a successful match: after a failed match it would still
    # hold whatever the caller's last successful regex captured.
    if(defined($lsid_output) and $lsid_output=~/^$mcni\s+(\S+)/) {
        return $1;
    }

    return;
}

19

20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
# Derives the process_id of the current worker from the LSB_JOBID and
# LSB_JOBINDEX environment variables.
# Returns "jobid[index]" when LSB_JOBINDEX is positive, the plain jobid otherwise.
# Dies if either of the two variables is not defined.
sub get_current_worker_process_id {
    my ($self) = @_;

    my ($job_id, $array_index) = @ENV{'LSB_JOBID', 'LSB_JOBINDEX'};

    unless(defined($job_id) and defined($array_index)) {
        die "Could not establish the process_id";
    }

    return ($array_index > 0)
        ? "$job_id\[$array_index\]"
        : $job_id;
}

37

38
# Counts the pending workers of this pipeline, grouped by resource class name.
# Lists all users' jobs whose name starts with $self->job_name_prefix() via
# 'bjobs -w' and keeps the ones whose STAT column is exactly 'PEND'.
# The rc_name is taken from the job name, which is expected to look like
# "<prefix><rc_name>-<digits>", optionally followed by "[<index>]".
# Returns a two-element list: (hashref rc_name => count, total count).
sub count_pending_workers_by_rc_name {
    my ($self) = @_;

    my $jnp = $self->job_name_prefix();
    my $cmd = "bjobs -w -J '${jnp}*' -u all 2>/dev/null";

    my %pending_this_meadow_by_rc_name = ();
    my $total_pending_this_meadow = 0;

    foreach my $line (`$cmd`) {
        # Test the status column itself rather than grepping the whole line for
        # 'PEND', which would also catch e.g. a running job whose name contains it.
        my $status = (split(' ', $line))[2];
        next unless(defined($status) and ($status eq 'PEND'));

        if($line=~/\b\Q$jnp\E(\S+)\-\d+(\[\d+\])?\b/) {
            $pending_this_meadow_by_rc_name{$1}++;
            $total_pending_this_meadow++;
        }
    }

    return (\%pending_this_meadow_by_rc_name, $total_pending_this_meadow);
}

57

58 59 60 61 62 63 64 65 66 67 68 69 70
# Counts the running workers of this pipeline: lists all users' jobs whose name
# starts with $self->job_name_prefix() via 'bjobs -w' and counts the lines whose
# STAT column is exactly 'RUN'.
# Returns the count as a plain integer (0 when bjobs prints nothing).
sub count_running_workers {
    my ($self) = @_;

    my $jnp = $self->job_name_prefix();
    my $cmd = "bjobs -w -J '${jnp}*' -u all 2>/dev/null";

    my $run_count = 0;
    foreach my $line (`$cmd`) {
        # Test the status column itself rather than grepping the whole line for
        # 'RUN', which would also catch e.g. a pending job whose name contains it.
        my $status = (split(' ', $line))[2];
        $run_count++ if(defined($status) and ($status eq 'RUN'));
    }

    return $run_count;
}


71
# Reports the status of every job whose name starts with $self->job_name_prefix(),
# as listed by 'bjobs -w' for all users.
# Returns a hashref: worker process_id => status string from the STAT column.
# The header line and jobs in DONE or EXIT state are left out.
sub status_of_all_our_workers { # returns a hashref
    my ($self) = @_;

    my $jnp = $self->job_name_prefix();
    my $cmd = "bjobs -w -J '${jnp}*' -u all 2>/dev/null";

    my %status_of = ();

    LINE: foreach my $bjobs_line (qx{$cmd}) {
        # columns used: 0 = JOBID, 2 = STAT, 6 = JOB_NAME
        my ($job_id, $state, $job_name) = (split(/\s+/, $bjobs_line))[0, 2, 6];

        next LINE if($job_id eq 'JOBID');                       # header line
        next LINE if(($state eq 'DONE') or ($state eq 'EXIT')); # already finished

        # the "[index]" part is only present in the job name column,
        # so it is appended to the job id to form the worker's process_id
        my $array_suffix = ($job_name=~/(\[\d+\])/) ? $1 : '';

        $status_of{$job_id . $array_suffix} = $state;
    }

    return \%status_of;
}

93

94
# Asks 'bjobs' about the job behind the given worker, restricted to the current user.
#   $worker : object providing process_id()
# Returns whatever is left of the bjobs output (stdout and stderr) after dropping
# the lines containing 'not found', 'JOBID' or 'EXIT'. The result is an empty
# (false) string when nothing is left and a non-empty (true) string otherwise.
sub check_worker_is_alive_and_mine {
    my ($self, $worker) = @_;

    my $wpid = $worker->process_id();

    # $USER may be missing from the environment; without a fallback '-u' would lose
    # its argument and the resulting error message would be mistaken for a live job.
    my $this_user = $ENV{'USER'} || scalar(getpwuid($<));

    # The pid is single-quoted because the shell would otherwise treat an id such as
    # 1234[5] as a glob pattern and could expand it to a matching file name.
    my $cmd = qq{bjobs '$wpid' -u '$this_user' 2>&1 | grep -v 'not found' | grep -v JOBID | grep -v EXIT};

    my $is_alive_and_mine = qx/$cmd/;
    return $is_alive_and_mine;
}

105

106
# Runs 'bkill' on the job behind the given worker.
# The worker (an object providing process_id()) is taken from the LAST argument,
# so the call works both as a method and as a plain function.
# Returns the value returned by system().
sub kill_worker {
    my $worker = pop @_;

    # List form of system() bypasses the shell, which would otherwise treat
    # an id such as 1234[5] as a glob pattern.
    return system('bkill', $worker->process_id());
}

113

114 115 116 117 118 119 120 121 122 123 124
# Looks up the cause of death of the given jobs in the output of 'bacct -l'.
# Arguments after $self are the process_ids to look up.
# Returns a hashref: job id as printed by bacct (with its "[index]" part if present)
# => one of 'MEMLIMIT', 'RUNLIMIT' or 'KILLED_BY_USER'. Jobs whose bacct section
# does not mention one of the known TERM_* reasons are left out.
sub find_out_causes {
    my ($self, @pids) = @_;

    my %lsf_2_hive = (
        'TERM_MEMLIMIT' => 'MEMLIMIT',
        'TERM_RUNLIMIT' => 'RUNLIMIT',
        'TERM_OWNER'    => 'KILLED_BY_USER',
    );

    my %cause_of_death = ();

    while(@pids) {
        # can't fit too many pids on one shell cmdline, so go in batches of 20
        my @batch       = splice(@pids, 0, 20);
        my $quoted_pids = join(' ', map { "'$_'" } @batch);

        my $bacct_output = `bacct -l $quoted_pids`;

        # per-job sections are separated by lines of at least 10 dashes
        my @sections = split(/\-{10,}\s+/, $bacct_output);

        foreach my $section (@sections) {
            next unless($section=~/^Job <(\d+(?:\[\d+\])?)>.+(TERM_MEMLIMIT|TERM_RUNLIMIT|TERM_OWNER): job killed/is);

            $cause_of_death{$1} = $lsf_2_hive{$2};
        }
    }

    return \%cause_of_death;
}

138

139
# Submits workers through 'bsub'.
#   $worker_cmd                       : command line that bsub is asked to run
#   $required_worker_count, $iteration, $rc_name
#                                     : passed on to $self->generate_job_name() to build the job name
#   $rc_specific_submission_cmd_args  : extra bsub arguments, inserted verbatim
#   $submit_stdout_file, $submit_stderr_file
#                                     : values for bsub's -o / -e options; '/dev/null' when false
# Further bsub arguments come from $self->config_get('SubmissionOptions').
# Sets $ENV{'LSB_STDOUT_DIRECT'} to 'y' and prints the command line before running it.
# Dies if system() reports a failure; returns system()'s (zero) result otherwise.
sub submit_workers {
    my ($self, $worker_cmd, $required_worker_count, $iteration, $rc_name, $rc_specific_submission_cmd_args, $submit_stdout_file, $submit_stderr_file) = @_;

    my $job_name                            = $self->generate_job_name($required_worker_count, $iteration, $rc_name);
    my $meadow_specific_submission_cmd_args = $self->config_get('SubmissionOptions');

    # bsub's -o and -e both require a value
    $submit_stdout_file = '/dev/null' unless($submit_stdout_file);
    $submit_stderr_file = '/dev/null' unless($submit_stderr_file);

    $ENV{'LSB_STDOUT_DIRECT'} = 'y';  # unbuffer the output of the bsub command

    my $cmd = qq{bsub -o $submit_stdout_file -e $submit_stderr_file -J "${job_name}" $rc_specific_submission_cmd_args $meadow_specific_submission_cmd_args $worker_cmd};

    print "SUBMITTING_CMD:\t\t$cmd\n";

    my $submission_failed = system($cmd);
    if($submission_failed) {
        # let's abort the beekeeper and let the user check the syntax
        die "Could not submit job(s): $!, $?";
    }

    return $submission_failed;
}

1;