Commit f4531084 authored by Javier Herrero's avatar Javier Herrero
Browse files

Implement new max_retry_count and failed_job_tolerance parameters in the analysis_stats table

parent eb9c00c3
......@@ -127,6 +127,20 @@ sub failed_job_count {
return $self->{'_failed_job_count'};
}
sub max_retry_count {
my $self = shift;
$self->{'_max_retry_count'} = shift if(@_);
$self->{'_max_retry_count'} = 3 unless(defined($self->{'_max_retry_count'}));
return $self->{'_max_retry_count'};
}
sub failed_job_tolerance {
my $self = shift;
$self->{'_failed_job_tolerance'} = shift if(@_);
$self->{'_failed_job_tolerance'} = 0 unless(defined($self->{'_failed_job_tolerance'}));
return $self->{'_failed_job_tolerance'};
}
sub running_job_count {
my $self = shift;
return $self->total_job_count
......@@ -164,9 +178,30 @@ sub determine_status {
my $self = shift;
if($self->status ne 'BLOCKED') {
if($self->unclaimed_job_count == 0 and
$self->total_job_count == $self->done_job_count + $self->failed_job_count) {
$self->status('DONE');
if ($self->unclaimed_job_count == 0 and
$self->total_job_count == $self->done_job_count + $self->failed_job_count) {
my $failure_percentage = 0;
if ($self->total_job_count) {
$failure_percentage = $self->failed_job_count * 100 / $self->total_job_count;
}
if ($failure_percentage > $self->failed_job_tolerance) {
$self->status('FAILED');
print
"\n",
"##################################################\n",
"##################################################\n",
"## ##\n";
printf
"## ERROR: %-35s ##\n", $self->get_analysis->logic_name." failed!";
printf
"## %4.1f%% jobs failed (tolerance: %3d%%) ##\n", $failure_percentage, $self->failed_job_tolerance;
print
"## ##\n",
"##################################################\n",
"##################################################\n\n";
} else {
$self->status('DONE');
}
}
if($self->total_job_count == $self->unclaimed_job_count) {
$self->status('READY');
......
......@@ -52,7 +52,7 @@ use Bio::EnsEMBL::Utils::Exception;
our @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
our $max_retry_count = 7;
# our $max_retry_count = 7;
###############################################################################
#
......@@ -501,8 +501,8 @@ sub claim_jobs_for_worker {
Jobs in state CLAIMED as simply reset back to READY.
If jobs was in a 'working' state (GET_INPUT, RUN, WRITE_OUTPUT))
the retry_count is incremented and the status set back to READY.
If the retry_count >= $max_retry_count (7) the job is set to 'FAILED'
and not rerun again.
If the retry_count >= $max_retry_count (3 by default) the job is set
to 'FAILED' and not rerun again.
Exceptions : $worker must be defined
Caller : Bio::EnsEMBL::Hive::Queen
......@@ -516,6 +516,7 @@ sub reset_dead_jobs_for_worker {
#added hive_id index to analysis_job table which made this operation much faster
my ($sql, $sth);
my $max_retry_count = $worker->analysis->stats->max_retry_count();
#first just reset the claimed jobs, these don't need a retry_count index increment
$sql = "UPDATE analysis_job SET job_claim='', status='READY'".
" WHERE status='CLAIMED'".
......@@ -569,19 +570,25 @@ sub reset_dead_job_by_dbID {
# an update with select on status and hive_id took 4seconds per worker to complete,
# while doing a select followed by update on analysis_job_id returned almost instantly
$sql = "UPDATE analysis_job SET job_claim='', status='READY'".
" ,retry_count=retry_count+1".
" WHERE status in ('GET_INPUT','RUN','WRITE_OUTPUT')".
" AND retry_count<$max_retry_count".
" AND analysis_job_id=$job_id";
$sql = "
UPDATE analysis_job, analysis_stats
SET job_claim='', analysis_job.status='READY', retry_count=retry_count+1
WHERE
analysis_job.status in ('GET_INPUT','RUN','WRITE_OUTPUT')
AND analysis_job.analysis_id = analysis_stats.analysis_id
AND retry_count < max_retry_count
AND analysis_job_id=$job_id";
#print("$sql\n");
$self->dbc->do($sql);
$sql = "UPDATE analysis_job SET status='FAILED'".
" ,retry_count=retry_count+1".
" WHERE status in ('GET_INPUT','RUN','WRITE_OUTPUT')".
" AND retry_count>=$max_retry_count".
" AND analysis_job_id=$job_id";
$sql = "
UPDATE analysis_job, analysis_stats
SET job_claim='', analysis_job.status='FAILED', retry_count=retry_count+1
WHERE
analysis_job.status in ('GET_INPUT','RUN','WRITE_OUTPUT')
AND analysis_job.analysis_id = analysis_stats.analysis_id
AND retry_count >= max_retry_count
AND analysis_job_id=$job_id";
#print("$sql\n");
$self->dbc->do($sql);
......
......@@ -146,7 +146,9 @@ sub update {
$sql .= ",total_job_count=" . $stats->total_job_count();
$sql .= ",unclaimed_job_count=" . $stats->unclaimed_job_count();
$sql .= ",done_job_count=" . $stats->done_job_count();
$sql .= ",max_retry_count=" . $stats->max_retry_count();
$sql .= ",failed_job_count=" . $stats->failed_job_count();
$sql .= ",failed_job_tolerance=" . $stats->failed_job_tolerance();
$sql .= ",num_required_workers=" . $stats->num_required_workers();
$sql .= ",last_update=NOW()";
$sql .= ",sync_lock=''";
......@@ -313,7 +315,9 @@ sub _columns {
ast.total_job_count
ast.unclaimed_job_count
ast.done_job_count
ast.max_retry_count
ast.failed_job_count
ast.failed_job_tolerance
ast.num_required_workers
ast.last_update
ast.sync_lock
......@@ -342,7 +346,9 @@ sub _objs_from_sth {
$analStats->total_job_count($column{'total_job_count'});
$analStats->unclaimed_job_count($column{'unclaimed_job_count'});
$analStats->done_job_count($column{'done_job_count'});
$analStats->max_retry_count($column{'max_retry_count'});
$analStats->failed_job_count($column{'failed_job_count'});
$analStats->failed_job_tolerance($column{'failed_job_tolerance'});
$analStats->num_required_workers($column{'num_required_workers'});
$analStats->seconds_since_last_update($column{'seconds_since_last_update'});
$analStats->adaptor($self);
......
......@@ -540,6 +540,35 @@ sub check_blocking_control_rules_for_AnalysisStats
}
sub get_num_failed_analyses
{
my $self = shift;
my $analysis = shift;
my $statsDBA = $self->db->get_AnalysisStatsAdaptor;
my $failed_analyses = $statsDBA->fetch_by_status('FAILED');
if ($analysis) {
foreach my $this_failed_analysis (@$failed_analyses) {
if ($this_failed_analysis->analysis_id == $analysis->dbID) {
print "#########################################################\n",
" Too many jobs failed for analysis ".$analysis->logic_name.". FAIL!!\n",
"#########################################################\n\n";
return 1;
}
}
return 0;
}
if (@$failed_analyses) {
print "##################################################\n",
" Too many failed jobs. FAIL!!\n",
"##################################################\n";
}
return scalar(@$failed_analyses);
}
sub get_hive_current_load {
my $self = shift;
my $sql = "SELECT sum(1/analysis_stats.hive_capacity) FROM hive, analysis_stats ".
......@@ -744,6 +773,9 @@ sub _pick_best_analysis_for_new_worker {
}
# ok so no analyses 'need' workers.
if ($self->get_num_failed_analyses()) {
return undef;
}
# see if anything needs an update, in case there are
# hidden jobs that haven't made it into the summary stats
......
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Hive::RunnableDB::Test
=cut
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::DBAdaptor->new($locator);
my $repmask = Bio::EnsEMBL::Hive::RunnableDB::Dummy->new (
-db => $db,
-input_id => $input_id
-analysis => $analysis );
$repmask->fetch_input(); #reads from DB
$repmask->run();
$repmask->output();
$repmask->write_output(); #writes to DB
=cut
=head1 DESCRIPTION
This object is used to test failure of jobs in the hive system.
It is intended for development purposes only!!
It parses the analysis.parameters and analysis_job.input_id as
(string representing) hasrefs and extracts the divisor and the value.
If the modulo (value % divisor) is 0, the job will fail.
=cut
=head1 CONTACT
ensembl-dev@ebi.ac.uk
=cut
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package Bio::EnsEMBL::Hive::RunnableDB::Test;
use strict;
use Bio::EnsEMBL::Hive::Process;
our @ISA = qw(Bio::EnsEMBL::Hive::Process);
=head2 fetch_input
Implementation of the Bio::EnsEMBL::Hive::Process interface
=cut
sub fetch_input {
my $self = shift;
$self->get_params($self->parameters);
$self->get_params($self->input_id);
return 1;
}
=head2 run
Implementation of the Bio::EnsEMBL::Hive::Process interface
=cut
sub run
{
my $self = shift;
my $divisor = $self->divisor();
my $value = $self->value();
if (!$divisor or !defined($value)) {
die "Wrong parameters: divisor = $divisor and value = $value\n";
} elsif ($value % $divisor == 0) {
die "$value % $divisor is 0 => die!\n";
}
return 1;
}
=head2 write_output
Implementation of the Bio::EnsEMBL::Hive::Process interface
=cut
sub write_output {
my $self = shift;
return 1;
}
=head2 divisor
Arg [1] : (optional) $divisor
Example : $object->divisor($divisor);
Example : $divisor = $object->divisor();
Description : Getter/setter for the divisor attribute
Returntype :
Exceptions : none
Caller : general
Status : Stable
=cut
sub divisor {
my $self = shift;
if (@_) {
$self->{_divisor} = shift;
}
return $self->{_divisor};
}
=head2 value
Arg [1] : (optional) $value
Example : $object->value($value);
Example : $value = $object->value();
Description : Getter/setter for the value attribute
Returntype :
Exceptions : none
Caller : general
Status : Stable
=cut
sub value {
my $self = shift;
if (@_) {
$self->{_value} = shift;
}
return $self->{_value};
}
=head2 get_params
=cut
sub get_params {
my $self = shift;
my $param_string = shift;
return unless($param_string);
print("parsing parameter string : ",$param_string,"\n");
my $params = eval($param_string);
return unless($params);
if(defined($params->{'divisor'})) {
$self->divisor($params->{'divisor'});
}
if(defined($params->{'value'})) {
$self->value($params->{'value'});
}
}
1;
......@@ -471,11 +471,16 @@ sub run_autonomously {
$queen->synchronize_hive();
}
$count = $queen->get_num_needed_workers($analysis);
if($count==0 && $analysis) {
printf("Nothing left to do for analysis ".$analysis->logic_name.". DONE!!\n\n");
my $num_failed_analyses = $queen->get_num_failed_analyses($analysis);
if ($count==0 && $analysis) {
if (!$num_failed_analyses) {
printf("Nothing left to do for analysis ".$analysis->logic_name.". DONE!!\n\n");
}
$loopit=0;
} elsif ($count == 0) {
printf("Nothing left to do. DONE!!\n\n");
if (!$num_failed_analyses) {
print "Nothing left to do. DONE!!\n\n";
}
$loopit=0;
}
}
......
ALTER TABLE analysis_stats ADD COLUMN max_retry_count int(10) DEFAULT 3 NOT NULL AFTER done_job_count;
ALTER TABLE analysis_stats ADD COLUMN failed_job_tolerance int(10) DEFAULT 0 NOT NULL AFTER max_retry_count;
......@@ -197,12 +197,13 @@ CREATE TABLE analysis_data (
-- when to unblock other analyses. Also provides
--
-- semantics:
-- analysis_id - foreign key to analysis table
-- status - overview status of the analysis_jobs (cached state)
-- analysis_id - foreign key to analysis table
-- status - overview status of the analysis_jobs (cached state)
-- failed_job_tolerance - % of tolerated failed jobs
CREATE TABLE analysis_stats (
analysis_id int(10) NOT NULL,
status enum('BLOCKED', 'LOADING', 'SYNCHING', 'READY', 'WORKING', 'ALL_CLAIMED', 'DONE')
status enum('BLOCKED', 'LOADING', 'SYNCHING', 'READY', 'WORKING', 'ALL_CLAIMED', 'DONE', 'FAILED')
DEFAULT 'READY' NOT NULL,
batch_size int(10) default 1 NOT NULL,
avg_msec_per_job int(10) default 0 NOT NULL,
......@@ -210,7 +211,9 @@ CREATE TABLE analysis_stats (
total_job_count int(10) NOT NULL,
unclaimed_job_count int(10) NOT NULL,
done_job_count int(10) NOT NULL,
max_retry_count int(10) default 3 NOT NULL,
failed_job_count int(10) NOT NULL,
failed_job_tolerance int(10) default 0 NOT NULL,
num_required_workers int(10) NOT NULL,
last_update datetime NOT NULL,
sync_lock int(10) default 0 NOT NULL,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment