Commit c9b4cacb authored by Leo Gordon's avatar Leo Gordon
Browse files

try to detect MEMLIMIT or RUNLIMIT states

parent 3f17d66c
......@@ -535,7 +535,7 @@ sub claim_jobs_for_worker {
Description: If a worker has died some of its jobs need to be reset back to 'READY'
so they can be rerun.
Jobs in state CLAIMED are simply reset back to READY.
If a job was in a 'working' state (COMPILATION, GET_INPUT, RUN, WRITE_OUTPUT)
the retry_count is increased and the status set back to READY.
If the retry_count >= $max_retry_count (3 by default) the job is set
to 'FAILED' and not rerun again.
......@@ -545,47 +545,36 @@ sub claim_jobs_for_worker {
=cut
# Put the jobs owned by a dead worker back into circulation.
#
#   Arg [1]    : $worker - the dead worker object; must respond to
#                ->worker_id and ->analysis->stats->max_retry_count
#   Exceptions : throws "must define worker" when no worker is given
#
# Three UPDATEs are issued against analysis_job, all restricted to this
# worker's worker_id:
#   1. CLAIMED jobs go back to READY without touching retry_count;
#   2. jobs caught in a working state with retry_count below the limit go
#      back to READY with retry_count incremented;
#   3. jobs caught in a working state with retry_count at or above the limit
#      become FAILED (retry_count is incremented as well).
sub reset_dead_jobs_for_worker {
    my ($self, $worker) = @_;

    throw("must define worker") unless($worker);

    # Both values are fetched once and interpolated into every statement below.
    my $max_retry_count = $worker->analysis->stats->max_retry_count();
    my $worker_id       = $worker->worker_id();

    # added worker_id index to analysis_job table which made this operation much faster

    # first just reset the claimed jobs, these don't need a retry_count index increment
    $self->dbc->do( qq{
        UPDATE analysis_job
           SET job_claim='', status='READY'
         WHERE status='CLAIMED'
           AND worker_id='$worker_id'
    } );

    # jobs interrupted mid-flight that still have retries left: count the
    # attempt and make them READY again
    $self->dbc->do( qq{
        UPDATE analysis_job SET job_claim='', status='READY', retry_count=retry_count+1
         WHERE status in ('COMPILATION','GET_INPUT','RUN','WRITE_OUTPUT')
           AND retry_count<$max_retry_count
           AND worker_id='$worker_id'
    } );

    # jobs interrupted mid-flight that have used up their retries: mark FAILED
    $self->dbc->do( qq{
        UPDATE analysis_job SET status='FAILED', retry_count=retry_count+1
         WHERE status in ('COMPILATION','GET_INPUT','RUN','WRITE_OUTPUT')
           AND retry_count>=$max_retry_count
           AND worker_id='$worker_id'
    } );
}
......
......@@ -67,6 +67,12 @@ sub kill_worker {
die "Please use a derived method";
}
# Default for the parent class: it takes an agnostic stance and reports no
# cause of death. Returns an empty list (undef in scalar context).
sub find_out_cause {
    my $self       = shift;
    my $worker_pid = shift;    # accepted but not consulted by this default

    return;
}
# --------------[(combinable) means of adjusting the number of submitted workers]----------------------
sub total_running_workers_limit { # if set and ->can('count_running_workers'),
......
......@@ -77,6 +77,22 @@ sub kill_worker {
}
}
# Ask LSF's accounting tool why a worker's job ended.
#
# Runs `bacct -l <worker_pid>` and scans what it prints for a known
# termination marker.
#
#   Arg [1]    : $worker_pid - the job identifier handed to `bacct -l`
#   Returntype : 'MEMLIMIT', 'RUNLIMIT' or 'KILLED_BY_USER' when the matching
#                TERM_* marker appears in the output; otherwise nothing
#                (empty list / undef), also when bacct cannot be run
sub find_out_cause {
    my ($self, $worker_pid) = @_;

    # An absent or empty id cannot identify this worker's job, so don't query at all.
    return unless defined($worker_pid) && length($worker_pid);

    # List-form pipe open: the id reaches bacct as a single argument and never
    # passes through a shell, so characters such as [ ] ; * are not interpreted.
    open(my $bacct_fh, '-|', 'bacct', '-l', $worker_pid)
        or return;
    my $diagnostic_output = do { local $/; <$bacct_fh> };
    close($bacct_fh);    # bacct's exit status is deliberately not treated as an error

    return unless defined $diagnostic_output;

    if($diagnostic_output=~/TERM_MEMLIMIT: job killed/i) {
        return 'MEMLIMIT';
    } elsif($diagnostic_output=~/TERM_RUNLIMIT: job killed/i) {
        return 'RUNLIMIT';
    } elsif($diagnostic_output=~/TERM_OWNER: job killed/i) {
        return 'KILLED_BY_USER';
    }
    return;
}
sub submit_workers {
my ($self, $iteration, $worker_cmd, $worker_count, $rc_id, $rc_parameters) = @_;
......
......@@ -212,7 +212,9 @@ sub check_for_dead_workers {
$worker_status_summary{$status}++;
} else {
$worker_status_summary{'AWOL'}++;
$worker->cause_of_death('FATALITY');
my $cause = $meadow->find_out_cause($worker_pid) || 'FATALITY';
$worker->cause_of_death( $cause );
$self->register_worker_death($worker);
}
}
......
## Widen hive.cause_of_death with the new states ('MEMLIMIT', 'RUNLIMIT')
## that *may* be detected when LSF is interrogated in time.
ALTER TABLE hive
    MODIFY COLUMN cause_of_death
        enum('', 'NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'MEMLIMIT', 'RUNLIMIT', 'FATALITY')
        DEFAULT '' NOT NULL;
......@@ -23,7 +23,7 @@ CREATE TABLE hive (
born datetime NOT NULL,
last_check_in datetime NOT NULL,
died datetime DEFAULT NULL,
cause_of_death enum('', 'NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'FATALITY') DEFAULT '' NOT NULL,
cause_of_death enum('', 'NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'MEMLIMIT', 'RUNLIMIT', 'FATALITY') DEFAULT '' NOT NULL,
PRIMARY KEY (worker_id),
INDEX analysis_status (analysis_id, status)
) ENGINE=InnoDB;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment