LSF.pm 3.61 KB
Newer Older
1 2 3 4 5 6
# This is the 'LSF' implementation of Meadow

package Bio::EnsEMBL::Hive::Meadow::LSF;

use strict;

Leo Gordon's avatar
Leo Gordon committed
7
use base ('Bio::EnsEMBL::Hive::Meadow');
8

9 10 11 12 13 14

# Tell whether the LSF meadow can be used from the current machine.
#
# Always invoked as a class method.  Runs `which bjobs` with stderr discarded
# and returns whatever it printed: the path of the 'bjobs' executable when one
# is found on the PATH (true), nothing otherwise (false).
sub available {
    return qx{which bjobs 2>/dev/null};
}

15 16 17 18 19 20 21 22 23
# Return the name of the LSF cluster, as reported by `lsid`.
#
# Looks for the "My cluster name is <name>" line in the output of `lsid`.
#
# Returns the cluster name (a run of word characters), or an empty string
# when `lsid` prints no such line or the line cannot be parsed.
sub name {

    my $mcni        = 'My cluster name is';
    my $lsid_output = `lsid | grep '$mcni'`;

    # Only trust $1 after a successful match: after a failed match it still
    # holds whatever an earlier, unrelated regex happened to capture.
    if($lsid_output and $lsid_output =~ /^$mcni\s+(\w+)/) {
        return $1;
    }

    return '';
}

24

25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
# Work out the process_id of the worker running in the current LSF job.
#
# Reads LSB_JOBID and LSB_JOBINDEX from the environment.
#
# Returns "<jobid>[<jobindex>]" when the job index is greater than zero,
# or just "<jobid>" otherwise.
# Dies if either of the two environment variables is not defined.
sub get_current_worker_process_id {
    my ($self) = @_;

    my ($job_id, $job_index) = @ENV{'LSB_JOBID', 'LSB_JOBINDEX'};

    unless(defined($job_id) and defined($job_index)) {
        die "Could not establish the process_id";
    }

    return ($job_index > 0)
        ? $job_id . '[' . $job_index . ']'
        : $job_id;
}

42
# Count the pending LSF jobs of this pipeline, grouped by rc_id.
#
# Lists the jobs of all users whose name starts with job_name_prefix() and,
# for every job whose STAT column is PEND, takes the digits that directly
# follow the prefix on that line as the rc_id.
#
# Returns a hashref { rc_id => number_of_pending_jobs }.
sub count_pending_workers_by_rc_id {
    my ($self) = @_;

    my $jnp = $self->job_name_prefix();
    my $cmd = qq{bjobs -w -J '${jnp}*' -u all 2>/dev/null | grep PEND};

    my %pending_by_rc_id = ();

    foreach my $line (qx/$cmd/) {

        # grep matches 'PEND' anywhere on the line (e.g. inside a job name),
        # so confirm it against the STAT column: JOBID, USER, STAT come first.
        my (undef, undef, $status) = split(' ', $line);
        next unless(defined($status) and $status eq 'PEND');

        # Match against the actual prefix rather than a bare 'Hive', which
        # could also occur in a user or host name earlier on the line.
        if($line=~/\Q$jnp\E(\d+)/) {
            $pending_by_rc_id{$1}++;
        }
    }

    return \%pending_by_rc_id;
}

59
# Report the LSF status of every unfinished job of this pipeline.
#
# Lists the jobs of all users whose name starts with job_name_prefix() and
# skips the ones whose STAT column is DONE or EXIT.
#
# Returns a hashref { worker_process_id => status }, where the process id is
# the JOBID, followed by "[index]" when the job name carries an array index.
sub status_of_all_our_workers { # returns a hashref
    my ($self) = @_;

    my $jnp = $self->job_name_prefix();
    my $cmd = qq{bjobs -w -J '${jnp}*' -u all 2>/dev/null};

    my %status_hash = ();
    foreach my $line (`$cmd`) {

        # columns: JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME ...
        my ($group_pid, $status, $job_name) = (split(' ', $line))[0, 2, 6];

        # skips the header line as well as blank or malformed lines
        next unless(defined($group_pid) and $group_pid =~ /^\d+$/ and defined($status));

        # Filter on the STAT column only: filtering the whole line would also
        # hide live workers whose job name happens to contain DONE or EXIT.
        next if($status eq 'DONE' or $status eq 'EXIT');

        my $worker_pid = $group_pid;
        if(defined($job_name) and $job_name=~/(\[\d+\])/) {
            $worker_pid .= $1;
        }

        $status_hash{$worker_pid} = $status;
    }
    return \%status_hash;
}

79
# Check whether a worker's LSF job still exists and belongs to the current user.
#
# $worker must provide process_id(), the LSF job id, possibly with an
# "[index]" suffix.
#
# Returns the lines of `bjobs` output (stdout and stderr merged) that remain
# after dropping the header, 'not found' messages and EXIT-ed jobs: a
# non-empty (true) string when the job is alive and ours, an empty one otherwise.
sub check_worker_is_alive_and_mine {
    my ($self, $worker) = @_;

    my $wpid      = $worker->process_id();
    my $this_user = $ENV{'USER'} || scalar(getpwuid($<));   # USER may be unset, e.g. under cron

    # Single-quote both values for the shell: an array-job id like 123[4] is a
    # glob pattern and would otherwise be expanded against the files in the
    # current directory.
    my ($quoted_wpid, $quoted_user) = map {
        my $arg = defined($_) ? $_ : '';
        $arg =~ s/'/'\\''/g;
        "'$arg'";
    } ($wpid, $this_user);

    my $cmd = qq{bjobs $quoted_wpid -u $quoted_user 2>&1 | grep -v 'not found' | grep -v JOBID | grep -v EXIT};

    my $is_alive_and_mine = qx/$cmd/;
    return $is_alive_and_mine;
}

# Kill the LSF job of the given worker, if it is still alive and ours.
#
# $worker must provide process_id().  The decision is delegated to
# check_worker_is_alive_and_mine(); when it says no, a warning is issued and
# nothing is killed.
#
# Returns the status of the `bkill` call when one was attempted.
sub kill_worker {
    my ($self, $worker) = @_;

    my $wpid = $worker->process_id();

    if($self->check_worker_is_alive_and_mine($worker)) {
        # List form runs bkill without a shell, so an array-job id like 123[4]
        # cannot be glob-expanded against files in the current directory.
        my $rc = system('bkill', $wpid);
        if($rc != 0) {
            warn "bkill $wpid did not succeed (\$? = $rc)";
        }
        return $rc;
    } else {
        warn 'Cannot kill worker '.$wpid.' because it is not running';
    }
}

101 102 103 104 105 106 107 108 109 110 111
# Find out why the given LSF jobs were killed.
#
# Arguments (after the invocant): a list of LSF process ids, each either
# "<jobid>" or "<jobid>[<index>]".
#
# Runs `bacct -l` on the ids, twenty at a time, and scans each job's section
# of the output for a TERM_MEMLIMIT / TERM_RUNLIMIT / TERM_OWNER termination
# reason.
#
# Returns a hashref { process_id => cause }, where cause is one of
# 'MEMLIMIT', 'RUNLIMIT' or 'KILLED_BY_USER'.  Jobs with no recognised
# reason are left out.
sub find_out_causes {
    my $self = shift @_;
    my @pids = @_;

    my %lsf_2_hive = (
        'TERM_MEMLIMIT' => 'MEMLIMIT',
        'TERM_RUNLIMIT' => 'RUNLIMIT',
        'TERM_OWNER'    => 'KILLED_BY_USER',
    );

    my %cod = ();

    while (my @pid_batch = splice(@pids, 0, 20)) {  # can't fit too many pids on one cmdline

        # List-form pipe open runs bacct without a shell, so an array-job id
        # like 123[4] cannot be glob-expanded against files in the current directory.
        my $bacct_fh;
        unless(open($bacct_fh, '-|', 'bacct', '-l', @pid_batch)) {
            warn "Could not run bacct: $!";
            next;
        }
        my $bacct_output = do { local $/; <$bacct_fh> };    # slurp the whole report
        close($bacct_fh);
        next unless defined($bacct_output);

        # bacct separates the per-job sections by a line of dashes
        foreach my $section (split(/\-{10,}\s+/, $bacct_output)) {
            if($section=~/^Job <(\d+(?:\[\d+\])?)>.+(TERM_MEMLIMIT|TERM_RUNLIMIT|TERM_OWNER): job killed/is) {
                $cod{$1} = $lsf_2_hive{$2};
            }
        }
    }

    return \%cod;
}

125
# Submit a batch of workers to LSF with `bsub`.
#
# Arguments:
#   $iteration, $worker_count, $rc_id - passed to generate_job_name() to build the job name
#   $worker_cmd                       - the command line each worker will run
#   $rc_id                            - also appended to the worker command as '-rc_id'
#   $rc_parameters                    - extra bsub options, inserted verbatim (optional)
#
# The result of meadow_options() is inserted verbatim into the bsub command line too.
# Sets LSB_STDOUT_DIRECT=y in the environment of the current process.
#
# Returns 0 on success; dies if bsub could not be executed or exited with a
# non-zero status, so that the beekeeper aborts and the user can check the syntax.
sub submit_workers {
    my ($self, $iteration, $worker_cmd, $worker_count, $rc_id, $rc_parameters) = @_;

    my $job_name       = $self->generate_job_name($worker_count, $iteration, $rc_id);
    my $meadow_options = $self->meadow_options();

    # both are optional pieces of the command line
    $rc_parameters  = '' unless defined($rc_parameters);
    $meadow_options = '' unless defined($meadow_options);

    $ENV{'LSB_STDOUT_DIRECT'} = 'y';  # unbuffer the output of the bsub command

    my $cmd = qq{bsub -o /dev/null -J "${job_name}" $rc_parameters $meadow_options $worker_cmd -rc_id $rc_id};

    print "SUBMITTING_CMD:\t\t$cmd\n";

    my $rc = system($cmd);
    if($rc == -1) {
        # $! is only meaningful when the command could not be started at all
        die "Could not submit job(s): failed to execute bsub: $!";
    } elsif($rc != 0) {
        die sprintf("Could not submit job(s): bsub exited with status %d (\$? = %d)", $rc >> 8, $rc);
    }

    return $rc;
}

1;