# This is the 'LSF' implementation of the Hive Meadow interface (see Bio::EnsEMBL::Hive::Meadow).

package Bio::EnsEMBL::Hive::Meadow::LSF;

use strict;

Leo Gordon's avatar
Leo Gordon committed
7
use base ('Bio::EnsEMBL::Hive::Meadow');
8

9

10
# Returns the name of the LSF cluster as reported by the `lsid` command.
#
# Also called to check for availability: LSF is assumed to be available if
# (and only if) the cluster name can be established. A false value (empty
# string) is returned when `lsid` produces no matching line or when that
# line cannot be parsed.
sub name {
    my $mcni = 'My cluster name is';

    my $lsid_line = `lsid 2>/dev/null | grep '$mcni' `;

    # Only trust the capture when the match actually succeeded: after a failed
    # match $1 still holds whatever an earlier, unrelated regex has captured.
    if($lsid_line and $lsid_line=~/^\Q$mcni\E\s+(\w+)/) {
        return $1;
    }

    return '';
}

19

20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
# Builds the process id of the current worker from the LSB_JOBID and
# LSB_JOBINDEX environment variables.
#
# Returns 'jobid[index]' when the index is positive, the plain job id
# otherwise. Dies if either of the two variables is not defined.
sub get_current_worker_process_id {
    my ($self) = @_;

    my ($job_id, $job_index) = @ENV{'LSB_JOBID', 'LSB_JOBINDEX'};

    unless(defined($job_id) and defined($job_index)) {
        die "Could not establish the process_id";
    }

    # a positive index is appended in square brackets, otherwise the bare job id is used
    return ($job_index>0)
        ? "$job_id\[$job_index\]"
        : $job_id;
}

37

38
# Counts the pending LSF jobs whose names start with this meadow's job name prefix.
#
# Runs `bjobs` for all users, keeps the lines containing 'PEND', and groups
# them by the part of the job name found between the prefix and a trailing
# '-<digits>' (presumably the resource class name, as the method name suggests).
#
# Returns a hashref { key => number_of_matching_lines }.
sub count_pending_workers_by_rc_name {
    my ($self) = @_;

    my $jnp = $self->job_name_prefix();
    my $cmd = qq{bjobs -w -J '${jnp}*' -u all 2>/dev/null | grep PEND};

    my %count_for;

    for my $bjobs_line (`$cmd`) {
        # lines whose job name does not follow the '<prefix><key>-<digits>' pattern are ignored
        next unless $bjobs_line=~/\b\Q$jnp\E(.+)\-\d+\b/;

        $count_for{$1}++;
    }

    return \%count_for;
}

55

56
# Reports the status of every LSF job whose name starts with this meadow's
# job name prefix.
#
# Runs `bjobs` for all users and drops the lines containing 'JOBID' (the
# header), 'DONE' or 'EXIT'. For each remaining line the key is the job id
# (first column), extended with the '[index]' found in the job name (seventh
# column) if there is one; the value is the status (third column).
#
# Returns a hashref { worker_pid => status }.
sub status_of_all_our_workers { # returns a hashref
    my ($self) = @_;

    my $jnp = $self->job_name_prefix();
    my $cmd = qq{bjobs -w -J '${jnp}*' -u all 2>/dev/null | grep -v JOBID | grep -v DONE | grep -v EXIT};

    my %status_of;

    for my $bjobs_line (`$cmd`) {
        # only columns 0 (job id), 2 (status) and 6 (job name) are needed
        my ($job_id, $state, $job_name) = (split(/\s+/, $bjobs_line, 8))[0, 2, 6];

        my $array_suffix = ($job_name=~/(\[\d+\])/) ? $1 : '';

        $status_of{ $job_id . $array_suffix } = $state;
    }

    return \%status_of;
}

76

77
# Asks `bjobs` whether the given worker's job is still known to LSF and
# belongs to the current user.
#
# $worker must provide a process_id() method returning the LSF job id,
# either 'jobid' or 'jobid[index]'.
#
# Returns whatever is left of the `bjobs` output (stdout and stderr) after
# removing the lines that contain 'not found', 'JOBID' or 'EXIT'; an empty
# string therefore means "not alive or not mine".
sub check_worker_is_alive_and_mine {
    my ($self, $worker) = @_;

    my $wpid = $worker->process_id();

    # USER is not guaranteed to be set; without a value '-u' would lose its
    # argument and the resulting error message would be mistaken for a live job.
    my $this_user = $ENV{'USER'};
    unless(defined($this_user) and length($this_user)) {
        $this_user = getpwuid($<) || '';
    }

    # Single-quote both values for the shell: the '[index]' part of a job array
    # member id is a glob pattern and could otherwise be expanded against
    # the files of the current directory.
    my ($quoted_wpid, $quoted_user) = map { my $arg = $_; $arg =~ s/'/'\\''/g; "'$arg'" } ($wpid, $this_user);

    my $cmd = qq{bjobs $quoted_wpid -u $quoted_user 2>&1 | grep -v 'not found' | grep -v JOBID | grep -v EXIT};

    my $is_alive_and_mine = qx/$cmd/;
    return $is_alive_and_mine;
}

88

89
# Kills the LSF job of the given worker by running `bkill <process_id>`.
#
# The worker is taken as the last argument, so this works both as a method
# and as a plain function. Returns the return value of system().
sub kill_worker {
    my $worker = pop @_;

    # List form of system(): the process id is handed to bkill verbatim, with no
    # shell in between to treat the '[index]' suffix of a job array member
    # as a glob pattern.
    return system('bkill', $worker->process_id());
}

96

97 98 99 100 101 102 103 104 105 106 107
# Finds out why the given LSF jobs were terminated.
#
# Takes a list of process ids ('jobid' or 'jobid[index]') after $self, runs
# `bacct -l` on them in batches of at most 20 and scans each section of the
# output for one of the termination reasons listed in %lsf_2_hive.
#
# Returns a hashref { process_id => cause }, holding only the jobs for which
# one of those reasons was found.
sub find_out_causes {
    my $self = shift @_;
    my @pids = @_;

    my %lsf_2_hive = (
        'TERM_MEMLIMIT' => 'MEMLIMIT',
        'TERM_RUNLIMIT' => 'RUNLIMIT',
        'TERM_OWNER'    => 'KILLED_BY_USER',
    );

    my %cod = ();

    # can't fit too many pids on one shell cmdline;
    # testing the batch as a list (not as a joined string) keeps a lone pid of '0' from ending the loop
    while (my @pid_batch = splice(@pids, 0, 20)) {

        # Single-quote every pid for the shell: the '[index]' part of a job array
        # member id is a glob pattern and could otherwise be expanded against
        # the files of the current directory.
        my $quoted_pids = join(' ', map { my $pid = $_; $pid =~ s/'/'\\''/g; "'$pid'" } @pid_batch);

        my $bacct_output = `bacct -l $quoted_pids`;

        # sections of the output are separated by lines of at least 10 dashes
        foreach my $section (split(/\-{10,}\s+/, $bacct_output)) {
            if($section=~/^Job <(\d+(?:\[\d+\])?)>.+(TERM_MEMLIMIT|TERM_RUNLIMIT|TERM_OWNER): job killed/is) {
                # the match is case-insensitive, the lookup table is not
                $cod{$1} = $lsf_2_hive{uc($2)};
            }
        }
    }

    return \%cod;
}

121

122
# Submits workers to LSF with a single `bsub` call.
#
# The job name comes from $self->generate_job_name($required_worker_count,
# $iteration, $rc_name); $rc_parameters and the 'SubmissionOptions' config
# value are inserted into the bsub command line ahead of $worker_cmd.
# The command is printed before it is run.
#
# Dies if system() reports a non-zero status for the bsub command.
sub submit_workers {
    my ($self, $worker_cmd, $required_worker_count, $iteration, $rc_name, $rc_parameters) = @_;

    my $job_name           = $self->generate_job_name($required_worker_count, $iteration, $rc_name);
    my $submission_options = $self->config_get('SubmissionOptions');

    # unbuffer the output of the bsub command
    $ENV{'LSB_STDOUT_DIRECT'} = 'y';

    my $cmd = qq{bsub -o /dev/null -J "${job_name}" $rc_parameters $submission_options $worker_cmd};

    print "SUBMITTING_CMD:\t\t$cmd\n";

    my $exit_status = system($cmd);
    if($exit_status) {
        # let's abort the beekeeper and let the user check the syntax
        die "Could not submit job(s): $!, $?";
    }

    return $exit_status;
}

1;