Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Open sidebar
ensembl-gh-mirror
ensembl-hive
Commits
f4531084
Commit
f4531084
authored
Nov 16, 2007
by
Javier Herrero
Browse files
Implement new max_retry_count and failed_job_tolerance parameters in the analysis_stats table
parent
eb9c00c3
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
286 additions
and
22 deletions
+286
-22
modules/Bio/EnsEMBL/Hive/AnalysisStats.pm
modules/Bio/EnsEMBL/Hive/AnalysisStats.pm
+38
-3
modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm
modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm
+20
-13
modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisStatsAdaptor.pm
modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisStatsAdaptor.pm
+6
-0
modules/Bio/EnsEMBL/Hive/Queen.pm
modules/Bio/EnsEMBL/Hive/Queen.pm
+32
-0
modules/Bio/EnsEMBL/Hive/RunnableDB/Test.pm
modules/Bio/EnsEMBL/Hive/RunnableDB/Test.pm
+174
-0
scripts/beekeeper.pl
scripts/beekeeper.pl
+8
-3
sql/patch_2007-11-16.sql
sql/patch_2007-11-16.sql
+2
-0
sql/tables.sql
sql/tables.sql
+6
-3
No files found.
modules/Bio/EnsEMBL/Hive/AnalysisStats.pm
View file @
f4531084
...
...
@@ -127,6 +127,20 @@ sub failed_job_count {
return
$self
->
{'
_failed_job_count
'};
}
sub
max_retry_count
{
my
$self
=
shift
;
$self
->
{'
_max_retry_count
'}
=
shift
if
(
@
_
);
$self
->
{'
_max_retry_count
'}
=
3
unless
(
defined
(
$self
->
{'
_max_retry_count
'}));
return
$self
->
{'
_max_retry_count
'};
}
sub
failed_job_tolerance
{
my
$self
=
shift
;
$self
->
{'
_failed_job_tolerance
'}
=
shift
if
(
@
_
);
$self
->
{'
_failed_job_tolerance
'}
=
0
unless
(
defined
(
$self
->
{'
_failed_job_tolerance
'}));
return
$self
->
{'
_failed_job_tolerance
'};
}
sub
running_job_count
{
my
$self
=
shift
;
return
$self
->
total_job_count
...
...
@@ -164,9 +178,30 @@ sub determine_status {
my
$self
=
shift
;
if
(
$self
->
status
ne
'
BLOCKED
')
{
if
(
$self
->
unclaimed_job_count
==
0
and
$self
->
total_job_count
==
$self
->
done_job_count
+
$self
->
failed_job_count
)
{
$self
->
status
('
DONE
');
if
(
$self
->
unclaimed_job_count
==
0
and
$self
->
total_job_count
==
$self
->
done_job_count
+
$self
->
failed_job_count
)
{
my
$failure_percentage
=
0
;
if
(
$self
->
total_job_count
)
{
$failure_percentage
=
$self
->
failed_job_count
*
100
/
$self
->
total_job_count
;
}
if
(
$failure_percentage
>
$self
->
failed_job_tolerance
)
{
$self
->
status
('
FAILED
');
print
"
\n
",
"
##################################################
\n
",
"
##################################################
\n
",
"
## ##
\n
";
printf
"
## ERROR: %-35s ##
\n
",
$self
->
get_analysis
->
logic_name
.
"
failed!
";
printf
"
## %4.1f%% jobs failed (tolerance: %3d%%) ##
\n
",
$failure_percentage
,
$self
->
failed_job_tolerance
;
print
"
## ##
\n
",
"
##################################################
\n
",
"
##################################################
\n\n
";
}
else
{
$self
->
status
('
DONE
');
}
}
if
(
$self
->
total_job_count
==
$self
->
unclaimed_job_count
)
{
$self
->
status
('
READY
');
...
...
modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm
View file @
f4531084
...
...
@@ -52,7 +52,7 @@ use Bio::EnsEMBL::Utils::Exception;
our
@ISA
=
qw(Bio::EnsEMBL::DBSQL::BaseAdaptor)
;
our
$max_retry_count
=
7
;
#
our $max_retry_count = 7;
###############################################################################
#
...
...
@@ -501,8 +501,8 @@ sub claim_jobs_for_worker {
Jobs in state CLAIMED as simply reset back to READY.
If jobs was in a 'working' state (GET_INPUT, RUN, WRITE_OUTPUT))
the retry_count is incremented and the status set back to READY.
If the retry_count >= $max_retry_count (
7
) the job is set
to 'FAILED'
and not rerun again.
If the retry_count >= $max_retry_count (
3 by default
) the job is set
to 'FAILED'
and not rerun again.
Exceptions : $worker must be defined
Caller : Bio::EnsEMBL::Hive::Queen
...
...
@@ -516,6 +516,7 @@ sub reset_dead_jobs_for_worker {
#added hive_id index to analysis_job table which made this operation much faster
my
(
$sql
,
$sth
);
my
$max_retry_count
=
$worker
->
analysis
->
stats
->
max_retry_count
();
#first just reset the claimed jobs, these don't need a retry_count index increment
$sql
=
"
UPDATE analysis_job SET job_claim='', status='READY'
"
.
"
WHERE status='CLAIMED'
"
.
...
...
@@ -569,19 +570,25 @@ sub reset_dead_job_by_dbID {
# an update with select on status and hive_id took 4seconds per worker to complete,
# while doing a select followed by update on analysis_job_id returned almost instantly
$sql
=
"
UPDATE analysis_job SET job_claim='', status='READY'
"
.
"
,retry_count=retry_count+1
"
.
"
WHERE status in ('GET_INPUT','RUN','WRITE_OUTPUT')
"
.
"
AND retry_count<
$max_retry_count
"
.
"
AND analysis_job_id=
$job_id
";
$sql
=
"
UPDATE analysis_job, analysis_stats
SET job_claim='', analysis_job.status='READY', retry_count=retry_count+1
WHERE
analysis_job.status in ('GET_INPUT','RUN','WRITE_OUTPUT')
AND analysis_job.analysis_id = analysis_stats.analysis_id
AND retry_count < max_retry_count
AND analysis_job_id=
$job_id
";
#print("$sql\n");
$self
->
dbc
->
do
(
$sql
);
$sql
=
"
UPDATE analysis_job SET status='FAILED'
"
.
"
,retry_count=retry_count+1
"
.
"
WHERE status in ('GET_INPUT','RUN','WRITE_OUTPUT')
"
.
"
AND retry_count>=
$max_retry_count
"
.
"
AND analysis_job_id=
$job_id
";
$sql
=
"
UPDATE analysis_job, analysis_stats
SET job_claim='', analysis_job.status='FAILED', retry_count=retry_count+1
WHERE
analysis_job.status in ('GET_INPUT','RUN','WRITE_OUTPUT')
AND analysis_job.analysis_id = analysis_stats.analysis_id
AND retry_count >= max_retry_count
AND analysis_job_id=
$job_id
";
#print("$sql\n");
$self
->
dbc
->
do
(
$sql
);
...
...
modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisStatsAdaptor.pm
View file @
f4531084
...
...
@@ -146,7 +146,9 @@ sub update {
$sql
.=
"
,total_job_count=
"
.
$stats
->
total_job_count
();
$sql
.=
"
,unclaimed_job_count=
"
.
$stats
->
unclaimed_job_count
();
$sql
.=
"
,done_job_count=
"
.
$stats
->
done_job_count
();
$sql
.=
"
,max_retry_count=
"
.
$stats
->
max_retry_count
();
$sql
.=
"
,failed_job_count=
"
.
$stats
->
failed_job_count
();
$sql
.=
"
,failed_job_tolerance=
"
.
$stats
->
failed_job_tolerance
();
$sql
.=
"
,num_required_workers=
"
.
$stats
->
num_required_workers
();
$sql
.=
"
,last_update=NOW()
";
$sql
.=
"
,sync_lock=''
";
...
...
@@ -313,7 +315,9 @@ sub _columns {
ast
.
total_job_count
ast
.
unclaimed_job_count
ast
.
done_job_count
ast
.
max_retry_count
ast
.
failed_job_count
ast
.
failed_job_tolerance
ast
.
num_required_workers
ast
.
last_update
ast
.
sync_lock
...
...
@@ -342,7 +346,9 @@ sub _objs_from_sth {
$analStats
->
total_job_count
(
$column
{'
total_job_count
'});
$analStats
->
unclaimed_job_count
(
$column
{'
unclaimed_job_count
'});
$analStats
->
done_job_count
(
$column
{'
done_job_count
'});
$analStats
->
max_retry_count
(
$column
{'
max_retry_count
'});
$analStats
->
failed_job_count
(
$column
{'
failed_job_count
'});
$analStats
->
failed_job_tolerance
(
$column
{'
failed_job_tolerance
'});
$analStats
->
num_required_workers
(
$column
{'
num_required_workers
'});
$analStats
->
seconds_since_last_update
(
$column
{'
seconds_since_last_update
'});
$analStats
->
adaptor
(
$self
);
...
...
modules/Bio/EnsEMBL/Hive/Queen.pm
View file @
f4531084
...
...
@@ -540,6 +540,35 @@ sub check_blocking_control_rules_for_AnalysisStats
}
sub
get_num_failed_analyses
{
my
$self
=
shift
;
my
$analysis
=
shift
;
my
$statsDBA
=
$self
->
db
->
get_AnalysisStatsAdaptor
;
my
$failed_analyses
=
$statsDBA
->
fetch_by_status
('
FAILED
');
if
(
$analysis
)
{
foreach
my
$this_failed_analysis
(
@$failed_analyses
)
{
if
(
$this_failed_analysis
->
analysis_id
==
$analysis
->
dbID
)
{
print
"
#########################################################
\n
",
"
Too many jobs failed for analysis
"
.
$analysis
->
logic_name
.
"
. FAIL!!
\n
",
"
#########################################################
\n\n
";
return
1
;
}
}
return
0
;
}
if
(
@$failed_analyses
)
{
print
"
##################################################
\n
",
"
Too many failed jobs. FAIL!!
\n
",
"
##################################################
\n
";
}
return
scalar
(
@$failed_analyses
);
}
sub
get_hive_current_load
{
my
$self
=
shift
;
my
$sql
=
"
SELECT sum(1/analysis_stats.hive_capacity) FROM hive, analysis_stats
"
.
...
...
@@ -744,6 +773,9 @@ sub _pick_best_analysis_for_new_worker {
}
# ok so no analyses 'need' workers.
if
(
$self
->
get_num_failed_analyses
())
{
return
undef
;
}
# see if anything needs an update, in case there are
# hidden jobs that haven't made it into the summary stats
...
...
modules/Bio/EnsEMBL/Hive/RunnableDB/Test.pm
0 → 100644
View file @
f4531084
#
# You may distribute this module under the same terms as perl itself
#
# POD documentation - main docs before the code
=pod
=head1 NAME
Bio::EnsEMBL::Hive::RunnableDB::Test
=cut
=head1 SYNOPSIS
my $db = Bio::EnsEMBL::DBAdaptor->new($locator);
my $repmask = Bio::EnsEMBL::Hive::RunnableDB::Dummy->new (
-db => $db,
-input_id => $input_id
-analysis => $analysis );
$repmask->fetch_input(); #reads from DB
$repmask->run();
$repmask->output();
$repmask->write_output(); #writes to DB
=cut
=head1 DESCRIPTION
This object is used to test failure of jobs in the hive system.
It is intended for development purposes only!!
It parses the analysis.parameters and analysis_job.input_id as
(string representing) hasrefs and extracts the divisor and the value.
If the modulo (value % divisor) is 0, the job will fail.
=cut
=head1 CONTACT
ensembl-dev@ebi.ac.uk
=cut
=head1 APPENDIX
The rest of the documentation details each of the object methods.
Internal methods are usually preceded with a _
=cut
package
Bio::EnsEMBL::Hive::RunnableDB::
Test
;
use
strict
;
use
Bio::EnsEMBL::Hive::
Process
;
our
@ISA
=
qw(Bio::EnsEMBL::Hive::Process)
;
=head2 fetch_input
Implementation of the Bio::EnsEMBL::Hive::Process interface
=cut
sub
fetch_input
{
my
$self
=
shift
;
$self
->
get_params
(
$self
->
parameters
);
$self
->
get_params
(
$self
->
input_id
);
return
1
;
}
=head2 run
Implementation of the Bio::EnsEMBL::Hive::Process interface
=cut
sub
run
{
my
$self
=
shift
;
my
$divisor
=
$self
->
divisor
();
my
$value
=
$self
->
value
();
if
(
!
$divisor
or
!
defined
(
$value
))
{
die
"
Wrong parameters: divisor =
$divisor
and value =
$value
\n
";
}
elsif
(
$value
%
$divisor
==
0
)
{
die
"
$value
%
$divisor
is 0 => die!
\n
";
}
return
1
;
}
=head2 write_output
Implementation of the Bio::EnsEMBL::Hive::Process interface
=cut
sub
write_output
{
my
$self
=
shift
;
return
1
;
}
=head2 divisor
Arg [1] : (optional) $divisor
Example : $object->divisor($divisor);
Example : $divisor = $object->divisor();
Description : Getter/setter for the divisor attribute
Returntype :
Exceptions : none
Caller : general
Status : Stable
=cut
sub
divisor
{
my
$self
=
shift
;
if
(
@
_
)
{
$self
->
{
_divisor
}
=
shift
;
}
return
$self
->
{
_divisor
};
}
=head2 value
Arg [1] : (optional) $value
Example : $object->value($value);
Example : $value = $object->value();
Description : Getter/setter for the value attribute
Returntype :
Exceptions : none
Caller : general
Status : Stable
=cut
sub
value
{
my
$self
=
shift
;
if
(
@
_
)
{
$self
->
{
_value
}
=
shift
;
}
return
$self
->
{
_value
};
}
=head2 get_params
=cut
sub
get_params
{
my
$self
=
shift
;
my
$param_string
=
shift
;
return
unless
(
$param_string
);
print
("
parsing parameter string :
",
$param_string
,"
\n
");
my
$params
=
eval
(
$param_string
);
return
unless
(
$params
);
if
(
defined
(
$params
->
{'
divisor
'}))
{
$self
->
divisor
(
$params
->
{'
divisor
'});
}
if
(
defined
(
$params
->
{'
value
'}))
{
$self
->
value
(
$params
->
{'
value
'});
}
}
1
;
scripts/beekeeper.pl
View file @
f4531084
...
...
@@ -471,11 +471,16 @@ sub run_autonomously {
$queen
->
synchronize_hive
();
}
$count
=
$queen
->
get_num_needed_workers
(
$analysis
);
if
(
$count
==
0
&&
$analysis
)
{
printf
("
Nothing left to do for analysis
"
.
$analysis
->
logic_name
.
"
. DONE!!
\n\n
");
my
$num_failed_analyses
=
$queen
->
get_num_failed_analyses
(
$analysis
);
if
(
$count
==
0
&&
$analysis
)
{
if
(
!
$num_failed_analyses
)
{
printf
("
Nothing left to do for analysis
"
.
$analysis
->
logic_name
.
"
. DONE!!
\n\n
");
}
$loopit
=
0
;
}
elsif
(
$count
==
0
)
{
printf
("
Nothing left to do. DONE!!
\n\n
");
if
(
!
$num_failed_analyses
)
{
print
"
Nothing left to do. DONE!!
\n\n
";
}
$loopit
=
0
;
}
}
...
...
sql/patch_2007-11-16.sql
0 → 100644
View file @
f4531084
ALTER
TABLE
analysis_stats
ADD
COLUMN
max_retry_count
int
(
10
)
DEFAULT
3
NOT
NULL
AFTER
done_job_count
;
ALTER
TABLE
analysis_stats
ADD
COLUMN
failed_job_tolerance
int
(
10
)
DEFAULT
0
NOT
NULL
AFTER
max_retry_count
;
sql/tables.sql
View file @
f4531084
...
...
@@ -197,12 +197,13 @@ CREATE TABLE analysis_data (
-- when to unblock other analyses. Also provides
--
-- semantics:
-- analysis_id - foreign key to analysis table
-- status - overview status of the analysis_jobs (cached state)
-- analysis_id - foreign key to analysis table
-- status - overview status of the analysis_jobs (cached state)
-- failed_job_tolerance - % of tolerated failed jobs
CREATE
TABLE
analysis_stats
(
analysis_id
int
(
10
)
NOT
NULL
,
status
enum
(
'BLOCKED'
,
'LOADING'
,
'SYNCHING'
,
'READY'
,
'WORKING'
,
'ALL_CLAIMED'
,
'DONE'
)
status
enum
(
'BLOCKED'
,
'LOADING'
,
'SYNCHING'
,
'READY'
,
'WORKING'
,
'ALL_CLAIMED'
,
'DONE'
,
'FAILED'
)
DEFAULT
'READY'
NOT
NULL
,
batch_size
int
(
10
)
default
1
NOT
NULL
,
avg_msec_per_job
int
(
10
)
default
0
NOT
NULL
,
...
...
@@ -210,7 +211,9 @@ CREATE TABLE analysis_stats (
total_job_count
int
(
10
)
NOT
NULL
,
unclaimed_job_count
int
(
10
)
NOT
NULL
,
done_job_count
int
(
10
)
NOT
NULL
,
max_retry_count
int
(
10
)
default
3
NOT
NULL
,
failed_job_count
int
(
10
)
NOT
NULL
,
failed_job_tolerance
int
(
10
)
default
0
NOT
NULL
,
num_required_workers
int
(
10
)
NOT
NULL
,
last_update
datetime
NOT
NULL
,
sync_lock
int
(
10
)
default
0
NOT
NULL
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment