Commit 3f17d66c authored by Leo Gordon
Browse files

The system must be able to differentiate between user- and farm-sanctioned killing of workers

parent f486d5ba
......@@ -164,9 +164,7 @@ sub register_worker_death {
return unless($worker);
# if called without a defined cause_of_death, assume catastrophic failure
$worker->cause_of_death('FATALITY') unless(defined($worker->cause_of_death));
unless ($worker->cause_of_death() eq "HIVE_OVERLOAD") {
unless ($worker->cause_of_death() eq 'HIVE_OVERLOAD') {
## HIVE_OVERLOAD occurs after a successful update of the analysis_stats table. (c.f. Worker.pm)
$worker->analysis->stats->adaptor->decrease_running_workers($worker->analysis->stats->analysis_id);
}
......@@ -181,11 +179,10 @@ sub register_worker_death {
$sth->execute();
$sth->finish;
if($worker->cause_of_death eq "NO_WORK") {
$self->db->get_AnalysisStatsAdaptor->update_status($worker->analysis->dbID, "ALL_CLAIMED");
if($worker->cause_of_death eq 'NO_WORK') {
$self->db->get_AnalysisStatsAdaptor->update_status($worker->analysis->dbID, 'ALL_CLAIMED');
}
if($worker->cause_of_death eq "FATALITY") {
#print("FATAL DEATH Arrrrgggghhhhhhhh (worker_id=",$worker->worker_id,")\n");
if($worker->cause_of_death eq 'FATALITY') {
$self->db->get_AnalysisJobAdaptor->reset_dead_jobs_for_worker($worker);
}
......@@ -215,6 +212,7 @@ sub check_for_dead_workers {
$worker_status_summary{$status}++;
} else {
$worker_status_summary{'AWOL'}++;
$worker->cause_of_death('FATALITY');
$self->register_worker_death($worker);
}
}
......
......@@ -612,8 +612,6 @@ sub run
}
} while (!$self->cause_of_death); # /Worker's lifespan loop
$self->queen->dbc->do("UPDATE hive SET status = 'DEAD' WHERE worker_id = ".$self->worker_id);
if($self->perform_global_cleanup) {
#have runnable cleanup any global/process files/data it may have created
$self->cleanup_worker_process_temp_directory;
......
......@@ -200,7 +200,9 @@ sub main {
$worker->analysis->logic_name, $worker->analysis->dbID);
$self->{'meadow'}->kill_worker($worker);
$worker->cause_of_death('KILLED_BY_USER');
$queen->register_worker_death($worker);
# what about clean-up? Should we do it here or not?
}
}
......
......@@ -198,33 +198,14 @@ if(defined $self->{'retry_throwing_jobs'}) {
}
$worker->print_worker();
my $return_value = 0;
eval { $worker->run(); };
if($@) {
# try to capture it ASAP:
$return_value = ($! || $?>>8 || 1);
#worker threw an exception so it had a problem:
if($worker->perform_global_cleanup) {
#have runnable cleanup any global/process files/data it may have created
$worker->cleanup_worker_process_temp_directory;
}
print("\n$@");
$queen->register_worker_death($worker);
}
$worker->run();
if($self->{'show_analysis_stats'}) {
$queen->print_analysis_status;
$queen->get_num_needed_workers(); # apparently run not for the return value, but for the side-effects
}
printf("dbc %d disconnect cycles\n", $DBA->dbc->disconnect_count);
print("total jobs completes : ", $worker->work_done, "\n");
exit($return_value);
exit 0;
#######################
#
......
## Schema patch: redefine hive.cause_of_death so that its enum includes 'KILLED_BY_USER'.
## By adding the 'KILLED_BY_USER' cause_of_death we make it clear that the only case of 'FATALITY'
## is when the process gets lost on the farm or is killed by the farm.
ALTER TABLE hive MODIFY COLUMN cause_of_death enum('', 'NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'FATALITY') DEFAULT '' NOT NULL;
......@@ -23,7 +23,7 @@ CREATE TABLE hive (
born datetime NOT NULL,
last_check_in datetime NOT NULL,
died datetime DEFAULT NULL,
cause_of_death enum('', 'NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'FATALITY') DEFAULT '' NOT NULL,
cause_of_death enum('', 'NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'FATALITY') DEFAULT '' NOT NULL,
PRIMARY KEY (worker_id),
INDEX analysis_status (analysis_id, status)
) ENGINE=InnoDB;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment