Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
ensembl-gh-mirror
ensembl-hive
Commits
654f6481
Commit
654f6481
authored
Sep 03, 2010
by
Leo Gordon
Browse files
do all baccts in one go - should save some time
parent
cbd22c2e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
39 additions
and
21 deletions
+39
-21
modules/Bio/EnsEMBL/Hive/Meadow.pm
modules/Bio/EnsEMBL/Hive/Meadow.pm
+0
-6
modules/Bio/EnsEMBL/Hive/Meadow/LSF.pm
modules/Bio/EnsEMBL/Hive/Meadow/LSF.pm
+19
-12
modules/Bio/EnsEMBL/Hive/Queen.pm
modules/Bio/EnsEMBL/Hive/Queen.pm
+20
-3
No files found.
modules/Bio/EnsEMBL/Hive/Meadow.pm
View file @
654f6481
...
...
@@ -67,12 +67,6 @@ sub kill_worker {
die
"
Please use a derived method
";
}
# Base-class placeholder for diagnosing why a worker died.
#
# Arguments: the meadow object and the process/job id of the dead worker.
# Returns:   nothing (undef in scalar context, an empty list in list context),
#            i.e. the generic meadow takes no position on the cause of death.
sub find_out_cause {
    my $self       = $_[0];
    my $worker_pid = $_[1];    # accepted for interface compatibility, not inspected here

    # parent assumes agnostic stance
    return;
}
# --------------[(combinable) means of adjusting the number of submitted workers]----------------------
sub
total_running_workers_limit
{
# if set and ->can('count_running_workers'),
...
...
modules/Bio/EnsEMBL/Hive/Meadow/LSF.pm
View file @
654f6481
...
...
@@ -77,22 +77,29 @@ sub kill_worker {
}
}
sub
find_out_cause
{
my
(
$self
,
$worker_pid
)
=
@_
;
my
$diagnostic_output
=
`
bacct -l '
$worker_pid
'
`;
if
(
$diagnostic_output
=~
/TERM_MEMLIMIT: job killed/i
)
{
return
'
MEMLIMIT
';
}
elsif
(
$diagnostic_output
=~
/TERM_RUNLIMIT: job killed/i
)
{
return
'
RUNLIMIT
';
}
elsif
(
$diagnostic_output
=~
/TERM_OWNER: job killed/i
)
{
return
'
KILLED_BY_USER
';
# Ask LSF's accounting tool (bacct) why a set of workers died, querying many jobs per call.
#
# Arguments: $self, followed by a list of LSF job ids. An id is either a plain
#            number ('1234') or a job-array element ('1234[5]').
# Returns:   a hashref mapping job id => cause of death, where the cause is one of
#            'MEMLIMIT', 'RUNLIMIT' or 'KILLED_BY_USER'. Jobs for which bacct reports
#            none of the recognised termination reasons are absent from the hash.
#            An empty id list yields an empty hashref without running bacct.
sub find_out_causes {
    my ($self, @worker_pids) = @_;

    # translation of LSF termination reasons into Hive causes of death
    my %lsf_2_hive = (
        'TERM_MEMLIMIT' => 'MEMLIMIT',
        'TERM_RUNLIMIT' => 'RUNLIMIT',
        'TERM_OWNER'    => 'KILLED_BY_USER',
    );

    # keep each bacct command line short: an unbounded list of ids may not fit on one
    my $batch_size = 20;

    my %cod = ();

    while (my @pid_batch = splice(@worker_pids, 0, $batch_size)) {

        # List-form pipe open bypasses the shell, so the '[' and ']' of array-job ids
        # are passed to bacct verbatim instead of being treated as glob characters.
        my $bacct_fh;
        unless (open($bacct_fh, '-|', 'bacct', '-l', @pid_batch)) {
            warn "Could not run bacct: $!\n";
            next;    # best effort: causes for this batch simply stay unknown
        }
        my $ba_out = do { local $/; <$bacct_fh> };
        close($bacct_fh);
        next unless defined($ba_out);

        # bacct separates per-job reports by long lines of dashes
        foreach my $section (split(/\-{10,}\s+/, $ba_out)) {
            # the job-array index '[N]' is optional: plain jobs have none
            if ($section =~ /^Job <(\d+(?:\[\d+\])?)>.+(TERM_MEMLIMIT|TERM_RUNLIMIT|TERM_OWNER): job killed/is) {
                $cod{$1} = $lsf_2_hive{$2};
            }
        }
    }

    return \%cod;
}
sub
submit_workers
{
my
(
$self
,
$iteration
,
$worker_cmd
,
$worker_count
,
$rc_id
,
$rc_parameters
)
=
@_
;
...
...
modules/Bio/EnsEMBL/Hive/Queen.pm
View file @
654f6481
...
...
@@ -207,6 +207,8 @@ sub check_for_dead_workers {
print
"
====== Live workers according to Queen:
"
.
scalar
(
@$queen_worker_list
)
.
"
, Meadow:
"
.
scalar
(
keys
%$worker_status_hash
)
.
"
\n
";
my
%gc_wpid_to_worker
=
();
foreach
my
$worker
(
@$queen_worker_list
)
{
next
unless
(
$meadow
->
responsible_for_worker
(
$worker
));
...
...
@@ -216,13 +218,28 @@ sub check_for_dead_workers {
}
else
{
$worker_status_summary
{'
AWOL
'}
++
;
my
$cod
=
$meadow
->
find_out_cause
(
$worker_pid
)
||
'
FATALITY
';
$worker
->
cause_of_death
(
$cod
);
$self
->
register_worker_death
(
$worker
);
$gc_wpid_to_worker
{
$worker_pid
}
=
$worker
;
}
}
print
"
\t
"
.
join
('
,
',
map
{
"
$_
:
$worker_status_summary
{
$_
}
"
}
keys
%worker_status_summary
)
.
"
\n\n
";
if
(
my
$total_lost
=
scalar
(
keys
%gc_wpid_to_worker
))
{
warn
"
GarbageCollector: Discovered
$total_lost
lost workers
\n
";
my
$wpid_to_cod
=
{};
if
(
UNIVERSAL::
can
(
$meadow
,
'
find_out_causes
'))
{
$wpid_to_cod
=
$meadow
->
find_out_causes
(
keys
%gc_wpid_to_worker
);
my
$lost_with_known_cod
=
scalar
(
keys
%$wpid_to_cod
);
warn
"
GarbageCollector: Found why
$lost_with_known_cod
of them died
\n
";
}
warn
"
GarbageCollector: Releasing the jobs
\n
";
while
(
my
(
$worker_pid
,
$worker
)
=
each
%gc_wpid_to_worker
)
{
$worker
->
cause_of_death
(
$wpid_to_cod
->
{
$worker_pid
}
||
'
FATALITY
');
$self
->
register_worker_death
(
$worker
);
}
}
if
(
$check_buried_in_haste
)
{
print
"
====== Checking for workers buried in haste...
";
my
$buried_in_haste_list
=
$self
->
fetch_dead_workers_with_jobs
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment