Commit fd570c6a authored by Jessica Severin's avatar Jessica Severin
Browse files

Added a show_overdue_workers method so that, when run without other options,

it syncs and displays a full summary of the state of the hive including
what workers were overdue, how many are needed, and what workers are running.
Also changed the check_for_dead to use the LSF bjobs command since I now
store an LSF job_id and array_index in the process_id for LSF workers.
Also changed the overdue time limit to 75 minutes since the expected lifetime
is 60 minutes.
parent fb980e48
...@@ -74,8 +74,8 @@ my $queen = $DBA->get_Queen; ...@@ -74,8 +74,8 @@ my $queen = $DBA->get_Queen;
if($self->{'all_dead'}) { register_all_workers_dead($self, $queen); } if($self->{'all_dead'}) { register_all_workers_dead($self, $queen); }
if($loopit) { if($loopit) {
run_autonomously($self, $queen); run_autonomously($self, $queen);
} else { } else {
if($self->{'check_for_dead'}) { check_for_dead_workers($self, $queen); } if($self->{'check_for_dead'}) { check_for_dead_workers($self, $queen); }
...@@ -85,9 +85,12 @@ if($loopit) { ...@@ -85,9 +85,12 @@ if($loopit) {
$queen->get_num_needed_workers(); $queen->get_num_needed_workers();
run_next_worker_clutch($self, $queen); run_next_worker_clutch($self, $queen) if($self->{'run'});
show_overdue_workers($self, $queen);
} }
Bio::EnsEMBL::Hive::URLFactory->cleanup; Bio::EnsEMBL::Hive::URLFactory->cleanup;
exit(0); exit(0);
...@@ -177,12 +180,13 @@ sub check_for_dead_workers { ...@@ -177,12 +180,13 @@ sub check_for_dead_workers {
my $self = shift; my $self = shift;
my $queen = shift; my $queen = shift;
print("===== check for dead workers\n");
my $overdueWorkers = $queen->fetch_overdue_workers(75*60); #overdue by 75 minutes my $overdueWorkers = $queen->fetch_overdue_workers(75*60); #overdue by 75 minutes
print(scalar(@{$overdueWorkers}), " overdue workers\n"); print(scalar(@{$overdueWorkers}), " overdue workers\n");
foreach my $worker (@{$overdueWorkers}) { foreach my $worker (@{$overdueWorkers}) {
if($worker->beekeeper eq 'LSF') { if($worker->beekeeper eq 'LSF') {
printf("%10d %20s analysis_id=%d : ", $worker->hive_id,$worker->host, $worker->analysis->dbID); printf("%10d %35s %15s %20s(%d) : ", $worker->hive_id,$worker->host,$worker->process_id, $worker->analysis->logic_name, $worker->analysis->dbID);
my $cmd = "ssh -x ". $worker->host . " ps -p ". $worker->process_id . "|grep -v PID"; my $cmd = "bjobs ". $worker->process_id . " 2>&1 | grep -v 'not found' | grep -v JOBID";
#print(" check worker with : $cmd\n"); #print(" check worker with : $cmd\n");
my $check = qx/$cmd/; my $check = qx/$cmd/;
...@@ -209,6 +213,18 @@ sub register_all_workers_dead { ...@@ -209,6 +213,18 @@ sub register_all_workers_dead {
} }
# show_overdue_workers($self, $queen)
#
# Prints a "===== overdue workers" banner to STDOUT, then one formatted line
# per worker returned by $queen->fetch_overdue_workers(75*60): hive_id, host,
# process_id, and the analysis logic_name with its dbID in parentheses.
#
#   $self  - script options hash; not read by this sub
#   $queen - object providing fetch_overdue_workers(), which is expected to
#            return an array reference of worker objects
sub show_overdue_workers {
  my ($self, $queen) = @_;

  print "===== overdue workers\n";

  # 75*60: presumably a limit in seconds (i.e. 75 minutes) -- confirm against
  # the queen's fetch_overdue_workers implementation.
  my $late_workers = $queen->fetch_overdue_workers(75 * 60);

  for my $late (@$late_workers) {
    printf(
      "%10d %35s %15s %20s(%d)\n",
      $late->hive_id,
      $late->host,
      $late->process_id,
      $late->analysis->logic_name,
      $late->analysis->dbID
    );
  }
}
sub run_autonomously { sub run_autonomously {
my $self = shift; my $self = shift;
my $queen = shift; my $queen = shift;
...@@ -217,6 +233,7 @@ sub run_autonomously { ...@@ -217,6 +233,7 @@ sub run_autonomously {
my $loopCount=1; my $loopCount=1;
while($loopit) { while($loopit) {
print("\n=======lsf_beekeeper loop ** $loopCount **==========\n"); print("\n=======lsf_beekeeper loop ** $loopCount **==========\n");
check_for_dead_workers($self, $queen); check_for_dead_workers($self, $queen);
$queen->update_analysis_stats(); $queen->update_analysis_stats();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment