Commit df8280ab authored by Leo Gordon's avatar Leo Gordon
Browse files

default behaviour on whether to retry failing jobs is now centrally-controllable

parent ca8a48cb
......@@ -290,6 +290,14 @@ sub last_check_in {
return $self->{'_last_check_in'};
}
# Setter/getter for the Worker-wide default policy applied when a job throws
# and did not itself specify transient_error: retry the job (1) or not (0).
sub retry_throwing_jobs {
    my( $self, $value ) = @_;

        # Use defined() so an explicit 0 ("never retry") can be stored;
        # the original truthiness test if($value) silently dropped a 0 argument.
    $self->{'_retry_throwing_jobs'} = $value if(defined $value);

        # Default to 0 (do not retry) when the policy has never been set:
    return $self->{'_retry_throwing_jobs'} || 0;
}
=head2 hive_output_dir
Arg [1] : (optional) string directory path
......@@ -515,7 +523,12 @@ sub run
my $job_status_when_died = $job->status();
warn "Job with id=$job_id died in status '$job_status_when_died' for the following reason: $error_msg\n";
$self->db()->get_JobErrorAdaptor()->register_error($job_id, $error_msg);
if($job->transient_error) {
# If the job specifically said what to do next, respect that last wish.
# Otherwise follow the default behaviour set by the beekeeper in $worker:
#
my $attempt_to_retry_this_job = defined($job->transient_error) ? $job->transient_error : $self->retry_throwing_jobs;
if($attempt_to_retry_this_job) {
$job->adaptor->reset_dead_job_by_dbID($job_id);
} else {
$job->update_status('FAILED');
......
......@@ -61,6 +61,7 @@ sub main {
$self->{'verbose_stats'} = 1;
$self->{'reg_name'} = 'hive';
$self->{'maximise_concurrency'} = 0;
$self->{'retry_throwing_jobs'} = undef;
$self->{'hive_output_dir'} = undef;
GetOptions(
......@@ -94,8 +95,9 @@ sub main {
'batch_size=i' => \$self->{'batch_size'},
'lifespan=i' => \$self->{'lifespan'},
'logic_name=s' => \$self->{'logic_name'},
'maximise_concurrency' => \$self->{'maximise_concurrency'},
'hive_output_dir=s' => \$self->{'hive_output_dir'},
'maximise_concurrency=i' => \$self->{'maximise_concurrency'},
'retry_throwing_jobs=i' => \$self->{'retry_throwing_jobs'},
# other commands/options
'h|help' => \$help,
......@@ -318,11 +320,12 @@ sub generate_worker_cmd {
if ($self->{'run_job_id'}) {
$worker_cmd .= " -job_id ".$self->{'run_job_id'};
} else {
$worker_cmd .= ((defined $self->{'job_limit'}) ? (' -limit ' .$self->{'job_limit'}) : '')
. ((defined $self->{'batch_size'}) ? (' -batch_size '.$self->{'batch_size'}) : '')
. ((defined $self->{'lifespan'}) ? (' -lifespan '.$self->{'lifespan'}) : '')
. ((defined $self->{'logic_name'}) ? (' -logic_name '.$self->{'logic_name'}) : '')
$worker_cmd .= (defined($self->{'job_limit'}) ? " -limit $self->{'job_limit'}" : '')
. (defined($self->{'batch_size'}) ? " -batch_size $self->{'batch_size'}" : '')
. (defined($self->{'lifespan'}) ? " -lifespan $self->{'lifespan'}" : '')
. (defined($self->{'logic_name'}) ? " -logic_name $self->{'logic_name'}" : '')
. ($self->{'maximise_concurrency'} ? ' -maximise_concurrency 1' : '')
. (defined($self->{'retry_throwing_jobs'}) ? " -retry_throwing_jobs $self->{'retry_throwing_jobs'}" : '')
. ($self->{'hive_output_dir'} ? " -hive_output_dir $self->{'hive_output_dir'}" : '');
}
......@@ -500,12 +503,13 @@ __DATA__
=head2 Worker control
-jlimit <num> : #jobs to run before worker can die naturally
-batch_size <num> : #jobs a worker can claim at once
-lifespan <num> : lifespan limit for each worker
-logic_name <string> : restrict the pipeline stat/runs to this analysis logic_name
-maximise_concurrency 1 : try to run more different analyses at the same time
-hive_output_dir <path> : directory where stdout/stderr of the hive is redirected
-jlimit <num> : #jobs to run before worker can die naturally
-batch_size <num> : #jobs a worker can claim at once
-lifespan <num> : lifespan limit for each worker
-logic_name <string> : restrict the pipeline stat/runs to this analysis logic_name
-maximise_concurrency 1 : try to run more different analyses at the same time
-retry_throwing_jobs 0|1 : if a job dies *knowingly*, should we retry it by default?
-hive_output_dir <path> : directory where stdout/stderr of the hive is redirected
=head2 Other commands/options
......
......@@ -40,6 +40,7 @@ $self->{'process_id'} = undef;
$self->{'debug'} = undef;
$self->{'no_write'} = undef;
$self->{'maximise_concurrency'} = undef;
$self->{'retry_throwing_jobs'} = undef;
my $conf_file;
my ($help, $adaptor, $url);
......@@ -75,7 +76,8 @@ GetOptions(
'analysis_stats' => \$self->{'show_analysis_stats'},
'no_write' => \$self->{'no_write'},
'nowrite' => \$self->{'no_write'},
'maximise_concurrency' => \$self->{'maximise_concurrency'},
'maximise_concurrency=i'=> \$self->{'maximise_concurrency'},
'retry_throwing_jobs=i' => \$self->{'retry_throwing_jobs'},
# Other commands
'h|help' => \$help,
......@@ -191,6 +193,9 @@ if($self->{'lifespan'}) {
if($self->{'no_global_cleanup'}) {
$worker->perform_global_cleanup(0);
}
if(defined $self->{'retry_throwing_jobs'}) {
$worker->retry_throwing_jobs($self->{'retry_throwing_jobs'});
}
$worker->print_worker();
......@@ -306,19 +311,20 @@ __DATA__
=head2 Job/Analysis control parameters:
-analysis_id <id> : analysis_id in db
-logic_name <string> : logic_name of analysis to make this worker
-batch_size <num> : #jobs to claim at a time
-limit <num> : #jobs to run before worker can die naturally
-lifespan <num> : number of minutes this worker is allowed to run
-hive_output_dir <path> : directory where stdout/stderr of the hive is redirected
-bk <string> : beekeeper identifier (deprecated and ignored)
-pid <string> : externally set process_id descriptor (e.g. lsf job_id, array_id)
-input_id <string> : test input_id on specified analysis (analysis_id or logic_name)
-job_id <id> : run specific job defined by analysis_job_id
-analysis_stats : show status of each analysis in hive
-no_cleanup : don't perform global_cleanup when worker exits
-no_write : don't write_output or auto_dataflow input_job
-analysis_id <id> : analysis_id in db
-logic_name <string> : logic_name of analysis to make this worker
-batch_size <num> : #jobs to claim at a time
-limit <num> : #jobs to run before worker can die naturally
-lifespan <num> : number of minutes this worker is allowed to run
-hive_output_dir <path> : directory where stdout/stderr of the hive is redirected
-bk <string> : beekeeper identifier (deprecated and ignored)
-pid <string> : externally set process_id descriptor (e.g. lsf job_id, array_id)
-input_id <string> : test input_id on specified analysis (analysis_id or logic_name)
-job_id <id> : run specific job defined by analysis_job_id
-analysis_stats : show status of each analysis in hive
-no_cleanup : don't perform global_cleanup when worker exits
-no_write : don't write_output or auto_dataflow input_job
-retry_throwing_jobs 0|1 : if a job dies *knowingly*, should we retry it by default?
=head2 Other options:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment