Commit 654f6481 authored by Leo Gordon
Browse files

do all baccts in one go - should save some time

parent cbd22c2e
......@@ -67,12 +67,6 @@ sub kill_worker {
die "Please use a derived method";
}
# Generic placeholder for diagnosing why a worker died.
# The parent class takes an agnostic stance: it accepts the worker's
# process id but makes no attempt to determine a cause.
#
# Arguments: $self, $worker_pid (accepted, not used here)
# Returns:   nothing (undef in scalar context, empty list in list context)
sub find_out_cause {
    my $self       = shift;
    my $worker_pid = shift;    # part of the interface, not used by this version

    return;
}
# --------------[(combinable) means of adjusting the number of submitted workers]----------------------
sub total_running_workers_limit { # if set and ->can('count_running_workers'),
......
......@@ -77,22 +77,29 @@ sub kill_worker {
}
}
sub find_out_cause {
my ($self, $worker_pid) = @_;
my $diagnostic_output = `bacct -l '$worker_pid'`;
if($diagnostic_output=~/TERM_MEMLIMIT: job killed/i) {
return 'MEMLIMIT';
} elsif($diagnostic_output=~/TERM_RUNLIMIT: job killed/i) {
return 'RUNLIMIT';
} elsif($diagnostic_output=~/TERM_OWNER: job killed/i) {
return 'KILLED_BY_USER';
# Work out why a set of LSF workers died by parsing `bacct -l` output.
#
# Arguments: $self, followed by a list of LSF process ids. Both plain job
#            ids ("1234") and job-array elements ("1234[5]") are recognised
#            in the bacct output.
# Returns:   a hashref mapping process id (exactly as it appears inside
#            "Job <...>" in the bacct output) to a Hive cause of death:
#            'MEMLIMIT', 'RUNLIMIT' or 'KILLED_BY_USER'. Ids whose section
#            shows none of the recognised TERM_* reasons are simply absent,
#            so callers can supply their own default.
sub find_out_causes {
    my ($self, @worker_pids) = @_;

    my %lsf_2_hive = (
        'TERM_MEMLIMIT' => 'MEMLIMIT',
        'TERM_RUNLIMIT' => 'RUNLIMIT',
        'TERM_OWNER'    => 'KILLED_BY_USER',
    );

    my %cod = ();

    # Query in batches so that a large number of lost workers cannot produce
    # an over-long command line. An empty pid list never enters the loop, so
    # bacct is not invoked without any job ids.
    my $batch_size = 100;
    while (my @batch = splice(@worker_pids, 0, $batch_size)) {

        # Single-quote every id: array-element ids such as 1234[5] contain
        # shell glob characters and must reach bacct verbatim.
        my $pid_batch = join(' ', map { "'$_'" } @batch);
        my $ba_out    = `bacct -l $pid_batch`;
        next unless defined $ba_out;    # the command could not be run at all

        # Sections of the report are separated by lines of 10 or more dashes.
        foreach my $section (split(/\-{10,}\s+/, $ba_out)) {
            # The "[index]" suffix is optional, so non-array jobs match too.
            if ($section =~ /^Job <(\d+(?:\[\d+\])?)>.+(TERM_MEMLIMIT|TERM_RUNLIMIT|TERM_OWNER): job killed/is) {
                $cod{$1} = $lsf_2_hive{$2};
            }
        }
    }

    return \%cod;
}
sub submit_workers {
my ($self, $iteration, $worker_cmd, $worker_count, $rc_id, $rc_parameters) = @_;
......
......@@ -207,6 +207,8 @@ sub check_for_dead_workers {
print "====== Live workers according to Queen:".scalar(@$queen_worker_list).", Meadow:".scalar(keys %$worker_status_hash)."\n";
my %gc_wpid_to_worker = ();
foreach my $worker (@$queen_worker_list) {
next unless($meadow->responsible_for_worker($worker));
......@@ -216,13 +218,28 @@ sub check_for_dead_workers {
} else {
$worker_status_summary{'AWOL'}++;
my $cod = $meadow->find_out_cause($worker_pid) || 'FATALITY';
$worker->cause_of_death( $cod );
$self->register_worker_death($worker);
$gc_wpid_to_worker{$worker_pid} = $worker;
}
}
print "\t".join(', ', map { "$_:$worker_status_summary{$_}" } keys %worker_status_summary)."\n\n";
if(my $total_lost = scalar(keys %gc_wpid_to_worker)) {
warn "GarbageCollector: Discovered $total_lost lost workers\n";
my $wpid_to_cod = {};
if(UNIVERSAL::can($meadow, 'find_out_causes')) {
$wpid_to_cod = $meadow->find_out_causes( keys %gc_wpid_to_worker );
my $lost_with_known_cod = scalar(keys %$wpid_to_cod);
warn "GarbageCollector: Found why $lost_with_known_cod of them died\n";
}
warn "GarbageCollector: Releasing the jobs\n";
while(my ($worker_pid, $worker) = each %gc_wpid_to_worker) {
$worker->cause_of_death( $wpid_to_cod->{$worker_pid} || 'FATALITY');
$self->register_worker_death($worker);
}
}
if($check_buried_in_haste) {
print "====== Checking for workers buried in haste... ";
my $buried_in_haste_list = $self->fetch_dead_workers_with_jobs();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment