Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
ensembl-gh-mirror
ensembl-hive
Commits
c9b4cacb
Commit
c9b4cacb
authored
Sep 01, 2010
by
Leo Gordon
Browse files
try to detect MEMLIMIT or RUNLIMIT states
parent
3f17d66c
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
54 additions
and
36 deletions
+54
-36
modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm
modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm
+23
-34
modules/Bio/EnsEMBL/Hive/Meadow.pm
modules/Bio/EnsEMBL/Hive/Meadow.pm
+6
-0
modules/Bio/EnsEMBL/Hive/Meadow/LSF.pm
modules/Bio/EnsEMBL/Hive/Meadow/LSF.pm
+16
-0
modules/Bio/EnsEMBL/Hive/Queen.pm
modules/Bio/EnsEMBL/Hive/Queen.pm
+3
-1
sql/patch_2010-09-01.sql
sql/patch_2010-09-01.sql
+5
-0
sql/tables.sql
sql/tables.sql
+1
-1
No files found.
modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm
View file @
c9b4cacb
...
...
@@ -535,7 +535,7 @@ sub claim_jobs_for_worker {
Description: If a worker has died some of its jobs need to be reset back to 'READY'
so they can be rerun.
Jobs in state CLAIMED are simply reset back to READY.
If jobs was in a 'working' state (GET_INPUT, RUN, WRITE_OUTPUT)
)
If a job was in a 'working' state (
COMPILATION,
GET_INPUT, RUN, WRITE_OUTPUT)
the retry_count is increased and the status set back to READY.
If the retry_count >= $max_retry_count (3 by default) the job is set
to 'FAILED' and not rerun again.
...
...
@@ -545,47 +545,36 @@ sub claim_jobs_for_worker {
=cut
# Put the jobs of a dead worker back into circulation.
#
# Arguments : $worker - the worker object whose jobs are to be released; it must
#                       provide worker_id() and analysis->stats->max_retry_count().
# Effects   : three UPDATEs on analysis_job, all restricted to this worker's worker_id:
#               1. CLAIMED jobs go back to READY (no retry_count increment);
#               2. jobs caught in a 'working' state (COMPILATION, GET_INPUT, RUN,
#                  WRITE_OUTPUT) with retry_count below the limit go back to READY
#                  and have retry_count incremented;
#               3. the remaining 'working' jobs (retry_count at or above the limit)
#                  are marked FAILED and have retry_count incremented.
# Returns   : nothing.
sub reset_dead_jobs_for_worker {
    my ($self, $worker) = @_;

    # added worker_id index to analysis_job table which made this operation much faster

    my $max_retry_count = $worker->analysis->stats->max_retry_count();
    my $worker_id       = $worker->worker_id();

    # Each entry is [ SQL with placeholders, bind values... ].
    # The order matters: statement 2 must run before statement 3, because it
    # increments retry_count of the jobs it resets and moves them out of the
    # 'working' states that statement 3 selects on.
    my @statements = (

        # first just reset the claimed jobs, these don't need a retry_count index increment
        [   q{
                UPDATE analysis_job
                   SET job_claim='', status='READY'
                 WHERE status='CLAIMED'
                   AND worker_id=?
            },
            $worker_id,
        ],

        # an update with select on status and worker_id took 4seconds per worker to complete,
        # while doing a select followed by update on analysis_job_id returned almost instantly
        [   q{
                UPDATE analysis_job
                   SET job_claim='', status='READY', retry_count=retry_count+1
                 WHERE status in ('COMPILATION','GET_INPUT','RUN','WRITE_OUTPUT')
                   AND retry_count<?
                   AND worker_id=?
            },
            $max_retry_count, $worker_id,
        ],

        [   q{
                UPDATE analysis_job
                   SET status='FAILED', retry_count=retry_count+1
                 WHERE status in ('COMPILATION','GET_INPUT','RUN','WRITE_OUTPUT')
                   AND retry_count>=?
                   AND worker_id=?
            },
            $max_retry_count, $worker_id,
        ],
    );

    foreach my $statement (@statements) {
        my ($sql, @bind_values) = @{$statement};

        # Values are bound rather than interpolated into the SQL text.
        my $sth = $self->prepare($sql);
        $sth->execute(@bind_values);
        $sth->finish;
    }

    return;
}
...
...
modules/Bio/EnsEMBL/Hive/Meadow.pm
View file @
c9b4cacb
...
...
@@ -67,6 +67,12 @@ sub kill_worker {
die
"
Please use a derived method
";
}
# Default hook for diagnosing why a worker died.
# The parent class takes an agnostic stance: it accepts the worker's pid but
# reports no cause, returning nothing (undef in scalar context, an empty list
# in list context).
sub find_out_cause {
    my $self       = shift;
    my $worker_pid = shift;    # part of the interface; not consulted here

    return;
}
# --------------[(combinable) means of adjusting the number of submitted workers]----------------------
sub
total_running_workers_limit
{
# if set and ->can('count_running_workers'),
...
...
modules/Bio/EnsEMBL/Hive/Meadow/LSF.pm
View file @
c9b4cacb
...
...
@@ -77,6 +77,22 @@ sub kill_worker {
}
}
# Ask LSF's accounting tool why a worker's job was terminated.
#
# Arguments : $worker_pid - the job identifier handed to `bacct -l`.
# Returns   : 'MEMLIMIT', 'RUNLIMIT' or 'KILLED_BY_USER' when the bacct output
#             contains the corresponding "TERM_...: job killed" line
#             (case-insensitive; checked in that order);
#             nothing (undef / empty list) when no known reason is found, when
#             no pid was given, or when bacct could not be run.
sub find_out_cause {
    my ($self, $worker_pid) = @_;

    # Without a job id, `bacct -l` would not be reporting on this worker at all.
    return unless defined $worker_pid && length $worker_pid;

    # List-form pipe open runs bacct directly, without a shell, so characters
    # in the pid (e.g. the brackets of an array job index) are passed literally.
    open(my $bacct_fh, '-|', 'bacct', '-l', $worker_pid)
        or return;
    my $diagnostic_output = do { local $/; <$bacct_fh> };
    close($bacct_fh);    # bacct's exit status is deliberately not treated as an error

    return unless defined $diagnostic_output;

    if ($diagnostic_output =~ /TERM_MEMLIMIT: job killed/i) {
        return 'MEMLIMIT';
    }
    elsif ($diagnostic_output =~ /TERM_RUNLIMIT: job killed/i) {
        return 'RUNLIMIT';
    }
    elsif ($diagnostic_output =~ /TERM_OWNER: job killed/i) {
        return 'KILLED_BY_USER';
    }

    return;
}
sub
submit_workers
{
my
(
$self
,
$iteration
,
$worker_cmd
,
$worker_count
,
$rc_id
,
$rc_parameters
)
=
@_
;
...
...
modules/Bio/EnsEMBL/Hive/Queen.pm
View file @
c9b4cacb
...
...
@@ -212,7 +212,9 @@ sub check_for_dead_workers {
$worker_status_summary
{
$status
}
++
;
}
else
{
$worker_status_summary
{'
AWOL
'}
++
;
$worker
->
cause_of_death
('
FATALITY
');
my
$cause
=
$meadow
->
find_out_cause
(
$worker_pid
)
||
'
FATALITY
';
$worker
->
cause_of_death
(
$cause
);
$self
->
register_worker_death
(
$worker
);
}
}
...
...
sql/patch_2010-09-01.sql
0 → 100644
View file @
c9b4cacb
## New cause_of_death states ('MEMLIMIT', 'RUNLIMIT') that *may* be detected by timely LSF interrogation.
## The column stays NOT NULL with an empty-string default.
ALTER TABLE hive
    MODIFY cause_of_death
        ENUM(
            '',
            'NO_WORK',
            'JOB_LIMIT',
            'HIVE_OVERLOAD',
            'LIFESPAN',
            'CONTAMINATED',
            'KILLED_BY_USER',
            'MEMLIMIT',
            'RUNLIMIT',
            'FATALITY'
        )
        NOT NULL DEFAULT '';
sql/tables.sql
View file @
c9b4cacb
...
...
@@ -23,7 +23,7 @@ CREATE TABLE hive (
born
datetime
NOT
NULL
,
last_check_in
datetime
NOT
NULL
,
died
datetime
DEFAULT
NULL
,
cause_of_death
enum
(
''
,
'NO_WORK'
,
'JOB_LIMIT'
,
'HIVE_OVERLOAD'
,
'LIFESPAN'
,
'CONTAMINATED'
,
'KILLED_BY_USER'
,
'FATALITY'
)
DEFAULT
''
NOT
NULL
,
cause_of_death
enum
(
''
,
'NO_WORK'
,
'JOB_LIMIT'
,
'HIVE_OVERLOAD'
,
'LIFESPAN'
,
'CONTAMINATED'
,
'KILLED_BY_USER'
,
'MEMLIMIT'
,
'RUNLIMIT'
,
'FATALITY'
)
DEFAULT
''
NOT
NULL
,
PRIMARY
KEY
(
worker_id
),
INDEX
analysis_status
(
analysis_id
,
status
)
)
ENGINE
=
InnoDB
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment