Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
ensembl-gh-mirror
ensembl-hive
Commits
efaa8c4c
Commit
efaa8c4c
authored
Apr 16, 2014
by
Leo Gordon
Browse files
record the completion datetime when registering worker death
parent
0b75aa0d
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
46 additions
and
42 deletions
+46
-42
modules/Bio/EnsEMBL/Hive/Meadow/LSF.pm
modules/Bio/EnsEMBL/Hive/Meadow/LSF.pm
+20
-21
modules/Bio/EnsEMBL/Hive/Queen.pm
modules/Bio/EnsEMBL/Hive/Queen.pm
+26
-21
No files found.
modules/Bio/EnsEMBL/Hive/Meadow/LSF.pm
View file @
efaa8c4c
...
...
@@ -201,10 +201,10 @@ sub parse_report_source_line {
if
(
my
(
$process_id
)
=
$lines
[
0
]
=~
/^Job <(\d+(?:\[\d+\])?)>/
)
{
my
(
$exit_status
,
$exception_status
)
=
(''
x
2
);
my
(
$
completion_datetime
,
$cause_of_death
);
my
(
$
died
,
$cause_of_death
);
foreach
(
@lines
)
{
if
(
/^(\w+\s+\w+\s+\d+\s+\d+:\d+:\d+):\s+Completed\s<(\w+)>(?:\.|;\s+(\w+))/
)
{
$
completion_datetime
=
_yearless_2_datetime
(
$
1
);
$
died
=
_yearless_2_datetime
(
$
1
);
$cause_of_death
=
$status_2_cod
{
$
3
};
$exit_status
=
$
2
.
(
$
3
?
"
/$3
"
:
'');
}
...
...
@@ -224,15 +224,18 @@ sub parse_report_source_line {
my
(
$swap_in_units
,
$swap_unit
)
=
$usage
{'
SWAP
'}
=~
/^([\d\.]+)([KMGT])$/
;
$report_entry
{
$process_id
}
=
{
'
completion_datetime
'
=>
$completion_datetime
,
'
cause_of_death
'
=>
$cause_of_death
,
'
exit_status
'
=>
$exit_status
,
'
exception_status
'
=>
$exception_status
,
'
mem_megs
'
=>
$mem_in_units
*
$units_2_megs
{
$mem_unit
},
'
swap_megs
'
=>
$swap_in_units
*
$units_2_megs
{
$swap_unit
},
'
pending_sec
'
=>
$usage
{'
WAIT
'},
'
cpu_sec
'
=>
$usage
{'
CPU_T
'},
'
lifespan_sec
'
=>
$usage
{'
TURNAROUND
'},
# entries for 'worker' table:
'
died
'
=>
$died
,
'
cause_of_death
'
=>
$cause_of_death
,
# entries for 'worker_resource_usage' table:
'
exit_status
'
=>
$exit_status
,
'
exception_status
'
=>
$exception_status
,
'
mem_megs
'
=>
$mem_in_units
*
$units_2_megs
{
$mem_unit
},
'
swap_megs
'
=>
$swap_in_units
*
$units_2_megs
{
$swap_unit
},
'
pending_sec
'
=>
$usage
{'
WAIT
'},
'
cpu_sec
'
=>
$usage
{'
CPU_T
'},
'
lifespan_sec
'
=>
$usage
{'
TURNAROUND
'},
};
}
}
...
...
@@ -242,26 +245,22 @@ sub parse_report_source_line {
}
sub
find_out_cause
s
{
sub
get_report_entries_for_process_id
s
{
my
$self
=
shift
@_
;
my
%c
auses_of_death
=
();
my
%c
ombined_report_entries
=
();
while
(
my
$pid_batch
=
join
('
',
map
{
"
'
$_
'
"
}
splice
(
@
_
,
0
,
20
)))
{
# can't fit too many pids on one shell cmdline
my
$cmd
=
"
bacct -l
$pid_batch
|
";
# warn "LSF::
f
ind_
out_causes
() running cmd:\n\t$cmd\n";
# warn "LSF::
get_comb
in
e
d_
report
() running cmd:\n\t$cmd\n";
my
$report_entries
=
parse_report_source_line
(
$cmd
);
my
$
batch_of_
report_entries
=
parse_report_source_line
(
$cmd
);
while
(
my
(
$process_id
,
$report_entry
)
=
each
%$report_entries
)
{
if
(
my
$cause_of_death
=
$report_entry
->
{'
cause_of_death
'})
{
$causes_of_death
{
$process_id
}
=
$cause_of_death
;
}
}
%combined_report_entries
=
(
%combined_report_entries
,
%$batch_of_report_entries
);
}
return
\
%c
auses_of_death
;
return
\
%c
ombined_report_entries
;
}
...
...
modules/Bio/EnsEMBL/Hive/Queen.pm
View file @
efaa8c4c
...
...
@@ -315,15 +315,16 @@ sub register_worker_death {
return
unless
(
$worker
);
my
$cod
=
$worker
->
cause_of_death
()
||
'
UNKNOWN
';
# make sure we do not attempt to insert a void
# FIXME: make it possible to set the 'died' timestamp if we have detected it from logs:
my
$sql
=
qq{UPDATE worker SET died=CURRENT_TIMESTAMP
}
.
(
$self_burial
?
'
,last_check_in=CURRENT_TIMESTAMP
'
:
'')
.
qq{
,status='DEAD'
,work_done='}
.
$worker
->
work_done
.
qq{'
,cause_of_death='$cod'
WHERE worker_id='}
.
$worker
->
dbID
.
qq{'}
;
my
$worker_id
=
$worker
->
dbID
;
my
$work_done
=
$worker
->
work_done
;
my
$cause_of_death
=
$worker
->
cause_of_death
||
'
UNKNOWN
';
# make sure we do not attempt to insert a void
my
$died
=
$worker
->
died
;
my
$sql
=
"
UPDATE worker SET status='DEAD', work_done='
$work_done
', cause_of_death='
$cause_of_death
'
"
.
(
$self_burial
?
'
, last_check_in=CURRENT_TIMESTAMP
'
:
''
)
.
(
$died
?
"
, died='
$died
'
"
:
'
, died=CURRENT_TIMESTAMP
'
)
.
"
WHERE worker_id='
$worker_id
'
";
$self
->
dbc
->
do
(
$sql
);
if
(
my
$analysis_id
=
$worker
->
analysis_id
)
{
...
...
@@ -333,10 +334,10 @@ sub register_worker_death {
$analysis_stats_adaptor
->
decrease_running_workers
(
$worker
->
analysis_id
);
}
unless
(
$c
od
eq
'
NO_WORK
'
or
$c
od
eq
'
JOB_LIMIT
'
or
$c
od
eq
'
HIVE_OVERLOAD
'
or
$c
od
eq
'
LIFESPAN
'
unless
(
$c
ause_of_death
eq
'
NO_WORK
'
or
$c
ause_of_death
eq
'
JOB_LIMIT
'
or
$c
ause_of_death
eq
'
HIVE_OVERLOAD
'
or
$c
ause_of_death
eq
'
LIFESPAN
'
)
{
$self
->
db
->
get_AnalysisJobAdaptor
->
release_undone_jobs_from_worker
(
$worker
);
}
...
...
@@ -351,7 +352,7 @@ sub register_worker_death {
}
sub
check_for_dead_workers
{
# scans the whole Valley for lost Workers (but ignores unreacha
g
le ones)
sub
check_for_dead_workers
{
# scans the whole Valley for lost Workers (but ignores unreacha
b
le ones)
my
(
$self
,
$valley
,
$check_buried_in_haste
)
=
@_
;
warn
"
GarbageCollector:
\t
Checking for lost Workers...
\n
";
...
...
@@ -396,10 +397,14 @@ sub check_for_dead_workers { # scans the whole Valley for lost Workers (but i
if
(
my
$lost_this_meadow
=
scalar
(
keys
%$pid_to_lost_worker
)
)
{
warn
"
GarbageCollector:
\t
Discovered
$lost_this_meadow
lost
$meadow_type
Workers
\n
";
my
$wpid_to_cod
=
{};
my
$report_entries
=
{};
if
(
$this_meadow
->
can
('
find_out_causes
'))
{
$wpid_to_cod
=
$this_meadow
->
find_out_causes
(
keys
%$pid_to_lost_worker
);
my
$lost_with_known_cod
=
scalar
(
keys
%$wpid_to_cod
);
die
"
Your Meadow::
$meadow_type
driver now has to support get_report_entries_for_process_ids() method instead of find_out_causes(). Please update it.
\n
";
}
elsif
(
$this_meadow
->
can
('
get_report_entries_for_process_ids
'))
{
$report_entries
=
$this_meadow
->
get_report_entries_for_process_ids
(
keys
%$pid_to_lost_worker
);
my
$lost_with_known_cod
=
scalar
(
grep
{
$_
->
{'
cause_of_death
'}
}
values
%$report_entries
);
warn
"
GarbageCollector:
\t
Found why
$lost_with_known_cod
of
$meadow_type
Workers died
\n
";
}
else
{
warn
"
GarbageCollector:
\t
$meadow_type
meadow does not support post-mortem examination
\n
";
...
...
@@ -407,8 +412,9 @@ sub check_for_dead_workers { # scans the whole Valley for lost Workers (but i
warn
"
GarbageCollector:
\t
Releasing the jobs
\n
";
while
(
my
(
$process_id
,
$worker
)
=
each
%$pid_to_lost_worker
)
{
$worker
->
cause_of_death
(
$wpid_to_cod
->
{
$process_id
}
||
'
UNKNOWN
');
$self
->
register_worker_death
(
$worker
);
$worker
->
died
(
$report_entries
->
{
$process_id
}{'
died
'}
);
$worker
->
cause_of_death
(
$report_entries
->
{
$process_id
}{'
cause_of_death
'}
);
$self
->
register_worker_death
(
$worker
);
}
}
}
...
...
@@ -741,8 +747,7 @@ sub register_all_workers_dead {
my
$all_workers_considered_alive
=
$self
->
fetch_all
(
"
status!='DEAD'
"
);
foreach
my
$worker
(
@
{
$all_workers_considered_alive
})
{
$worker
->
cause_of_death
(
'
UNKNOWN
'
);
# well, maybe we could have investigated further...
$self
->
register_worker_death
(
$worker
);
$self
->
register_worker_death
(
$worker
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment