Commit 230227dc authored by Leo Gordon's avatar Leo Gordon
Browse files

capture Worker's death message during the new 'SPECIALIZATION' status in...

capture Worker's death message during the new 'SPECIALIZATION' status in job_message/msg (thanks, Thomas!)
parent c6ecd991
...@@ -240,6 +240,7 @@ sub specialize_new_worker { ...@@ -240,6 +240,7 @@ sub specialize_new_worker {
unless($special_batch or $force) { # do we really need to run this analysis? unless($special_batch or $force) { # do we really need to run this analysis?
if($self->get_hive_current_load() >= 1.1) { if($self->get_hive_current_load() >= 1.1) {
$worker->cause_of_death('HIVE_OVERLOAD');
die "Hive is overloaded, can't specialize a worker"; die "Hive is overloaded, can't specialize a worker";
} }
if($stats->status eq 'BLOCKED') { if($stats->status eq 'BLOCKED') {
......
...@@ -514,11 +514,22 @@ sub run { ...@@ -514,11 +514,22 @@ sub run {
$self->get_stderr_redirector->push( $worker_log_dir.'/worker.err' ); $self->get_stderr_redirector->push( $worker_log_dir.'/worker.err' );
} }
eval {
$self->enter_status('SPECIALIZATION');
$self->adaptor->specialize_new_worker( $self, @spec_args ); $self->adaptor->specialize_new_worker( $self, @spec_args );
$self->print_worker(); $self->print_worker();
1;
} or do {
my $msg = "Could not specialize worker:\n\t".$@;
warn "$msg\n";
$self->adaptor->db->get_JobMessageAdaptor()->store_worker_message($self->dbID, $msg, 1 );
if( $self->compile_module_once() ) { $self->cause_of_death('SEE_MSG') unless($self->cause_of_death()); # some specific causes could have been set prior to die "...";
};
my $min_batch_time;
if(!$self->cause_of_death() and $self->compile_module_once() ) {
eval { eval {
$self->enter_status('COMPILATION'); $self->enter_status('COMPILATION');
my $runnable_object = $self->analysis->process or die "Unknown compilation error"; my $runnable_object = $self->analysis->process or die "Unknown compilation error";
...@@ -529,19 +540,19 @@ sub run { ...@@ -529,19 +540,19 @@ sub run {
$self->runnable_object( $runnable_object ); $self->runnable_object( $runnable_object );
$self->enter_status('READY'); $self->enter_status('READY');
$min_batch_time = $self->analysis->stats->min_batch_time();
$self->adaptor->db->dbc->disconnect_when_inactive(0);
1; 1;
} or do { } or do {
my $msg = "Could not compile Runnable '".$self->analysis->module."' :\n\t".$@; my $msg = "Could not compile Runnable '".$self->analysis->module."' :\n\t".$@;
warn "$msg\n"; warn "$msg\n";
$self->adaptor->db->get_JobMessageAdaptor()->store_worker_message($self->dbID, $msg, 1 ); $self->adaptor->db->get_JobMessageAdaptor()->store_worker_message($self->dbID, $msg, 1 );
$self->cause_of_death('CONTAMINATED'); $self->cause_of_death('SEE_MSG');
} };
} }
$self->adaptor->db->dbc->disconnect_when_inactive(0);
my $min_batch_time = $self->analysis->stats->min_batch_time();
my $job_adaptor = $self->adaptor->db->get_AnalysisJobAdaptor; my $job_adaptor = $self->adaptor->db->get_AnalysisJobAdaptor;
while (!$self->cause_of_death) { # Worker's lifespan loop (ends only when the worker dies for any reason) while (!$self->cause_of_death) { # Worker's lifespan loop (ends only when the worker dies for any reason)
......
# Schema patch accompanying the new 'SPECIALIZATION' worker status and the 'SEE_MSG' cause of death.
# Each statement restates the complete enum list for its column, so the order of the values below matters.
# NOTE(review): this is MySQL enum syntax; the SQLite schema stores these columns as TEXT -- confirm no SQLite patch is needed.

# add 'SPECIALIZATION' to worker.status, and add both 'SPECIALIZATION' and 'READY' to job_message.status
# (job_message now also records messages from jobless workers, so it has to accept worker-only statuses) :
ALTER TABLE worker MODIFY COLUMN status enum('SPECIALIZATION','COMPILATION','READY','PRE_CLEANUP','FETCH_INPUT','RUN','WRITE_OUTPUT','POST_CLEANUP','DEAD') DEFAULT 'READY' NOT NULL;
ALTER TABLE job_message MODIFY COLUMN status enum('UNKNOWN','SPECIALIZATION','COMPILATION','READY','PRE_CLEANUP','FETCH_INPUT','RUN','WRITE_OUTPUT','POST_CLEANUP','PASSED_ON') DEFAULT 'UNKNOWN';
# add 'SEE_MSG' cause_of_death for causes that cannot be expressed in one word
# (the worker code stores the full explanation via store_worker_message() before setting it) :
ALTER TABLE worker MODIFY COLUMN cause_of_death enum('NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'MEMLIMIT', 'RUNLIMIT', 'SEE_MSG', 'UNKNOWN') DEFAULT NULL;
...@@ -80,11 +80,11 @@ CREATE TABLE worker ( ...@@ -80,11 +80,11 @@ CREATE TABLE worker (
analysis_id int(10) unsigned DEFAULT NULL, analysis_id int(10) unsigned DEFAULT NULL,
work_done int(11) DEFAULT '0' NOT NULL, work_done int(11) DEFAULT '0' NOT NULL,
status enum('READY','COMPILATION','PRE_CLEANUP','FETCH_INPUT','RUN','WRITE_OUTPUT','POST_CLEANUP','DEAD') DEFAULT 'READY' NOT NULL, status enum('SPECIALIZATION','COMPILATION','READY','PRE_CLEANUP','FETCH_INPUT','RUN','WRITE_OUTPUT','POST_CLEANUP','DEAD') DEFAULT 'READY' NOT NULL,
born timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, born timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
last_check_in datetime NOT NULL, last_check_in datetime NOT NULL,
died datetime DEFAULT NULL, died datetime DEFAULT NULL,
cause_of_death enum('NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'MEMLIMIT', 'RUNLIMIT', 'UNKNOWN') DEFAULT NULL, cause_of_death enum('NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'MEMLIMIT', 'RUNLIMIT', 'SEE_MSG', 'UNKNOWN') DEFAULT NULL,
log_dir varchar(255) DEFAULT NULL, log_dir varchar(255) DEFAULT NULL,
PRIMARY KEY (worker_id), PRIMARY KEY (worker_id),
...@@ -221,11 +221,11 @@ CREATE TABLE job ( ...@@ -221,11 +221,11 @@ CREATE TABLE job (
-- --
-- semantics: -- semantics:
-- job_message_id - an autoincremented primary id of the message -- job_message_id - an autoincremented primary id of the message
-- job_id - the id of the job that threw the message -- job_id - the id of the job that threw the message (or NULL if it was thrown outside of a job)
-- worker_id - the worker in charge of the job at the moment -- worker_id - the 'current' worker
-- time - when the message was thrown -- time - when the message was thrown
-- retry - retry_count of the job when the message was thrown -- retry - retry_count of the job when the message was thrown (or NULL if no job)
-- status - of the job when the message was thrown -- status - of the job or worker when the message was thrown
-- msg - string that contains the message -- msg - string that contains the message
-- is_error - binary flag -- is_error - binary flag
...@@ -235,7 +235,7 @@ CREATE TABLE job_message ( ...@@ -235,7 +235,7 @@ CREATE TABLE job_message (
worker_id int(10) unsigned NOT NULL, worker_id int(10) unsigned NOT NULL,
time timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, time timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
retry int(10) DEFAULT NULL, retry int(10) DEFAULT NULL,
status enum('UNKNOWN','COMPILATION','PRE_CLEANUP','FETCH_INPUT','RUN','WRITE_OUTPUT','POST_CLEANUP','PASSED_ON') DEFAULT 'UNKNOWN', status enum('UNKNOWN','SPECIALIZATION','COMPILATION','READY','PRE_CLEANUP','FETCH_INPUT','RUN','WRITE_OUTPUT','POST_CLEANUP','PASSED_ON') DEFAULT 'UNKNOWN',
msg text, msg text,
is_error TINYINT, is_error TINYINT,
......
...@@ -73,11 +73,11 @@ CREATE TABLE worker ( ...@@ -73,11 +73,11 @@ CREATE TABLE worker (
analysis_id INTEGER DEFAULT NULL, analysis_id INTEGER DEFAULT NULL,
work_done int(11) DEFAULT '0' NOT NULL, work_done int(11) DEFAULT '0' NOT NULL,
status TEXT DEFAULT 'READY' NOT NULL, /* enum('READY','COMPILATION','FETCH_INPUT','RUN','WRITE_OUTPUT','DEAD') DEFAULT 'READY' NOT NULL, */ status TEXT DEFAULT 'READY' NOT NULL, /* enum('SPECIALIZATION','COMPILATION','READY','PRE_CLEANUP','FETCH_INPUT','RUN','WRITE_OUTPUT','POST_CLEANUP','DEAD') DEFAULT 'READY' NOT NULL */
born timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, born timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
last_check_in datetime NOT NULL, last_check_in datetime NOT NULL,
died datetime DEFAULT NULL, died datetime DEFAULT NULL,
cause_of_death TEXT DEFAULT NULL, /* enum('NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'MEMLIMIT', 'RUNLIMIT', 'UNKNOWN') DEFAULT NULL */ cause_of_death TEXT DEFAULT NULL, /* enum('NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'MEMLIMIT', 'RUNLIMIT', 'SEE_MSG', 'UNKNOWN') DEFAULT NULL */
log_dir varchar(80) DEFAULT NULL log_dir varchar(80) DEFAULT NULL
); );
...@@ -205,11 +205,11 @@ CREATE INDEX IF NOT EXISTS worker_idx ON job (worker_id); ...@@ -205,11 +205,11 @@ CREATE INDEX IF NOT EXISTS worker_idx ON job (worker_id);
-- --
-- semantics: -- semantics:
-- job_message_id - an autoincremented primary id of the message -- job_message_id - an autoincremented primary id of the message
-- job_id - the id of the job that threw the message -- job_id - the id of the job that threw the message (or NULL if it was thrown outside of a job)
-- worker_id - the worker in charge of the job at the moment -- worker_id - the 'current' worker
-- time - when the message was thrown -- time - when the message was thrown
-- retry - retry_count of the job when the message was thrown -- retry - retry_count of the job when the message was thrown (or NULL if no job)
-- status - of the job when the message was thrown -- status - of the job or worker when the message was thrown
-- msg - string that contains the message -- msg - string that contains the message
-- is_error - binary flag -- is_error - binary flag
...@@ -219,7 +219,7 @@ CREATE TABLE job_message ( ...@@ -219,7 +219,7 @@ CREATE TABLE job_message (
worker_id INTEGER NOT NULL, worker_id INTEGER NOT NULL,
time timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, time timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
retry int(10) DEFAULT NULL, retry int(10) DEFAULT NULL,
status TEXT DEFAULT 'UNKNOWN', /* enum('UNKNOWN', 'COMPILATION', 'FETCH_INPUT', 'RUN', 'WRITE_OUTPUT', 'PASSED_ON') DEFAULT 'UNKNOWN', */ status TEXT DEFAULT 'UNKNOWN', /* enum('UNKNOWN','SPECIALIZATION','COMPILATION','READY','PRE_CLEANUP','FETCH_INPUT','RUN','WRITE_OUTPUT','POST_CLEANUP','PASSED_ON') DEFAULT 'UNKNOWN' */
msg TEXT, msg TEXT,
is_error BOOLEAN is_error BOOLEAN
); );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment