Commit 0be9a5be authored by Leo Gordon
Browse files

experimental: detect and bury the UNKWN workers -- the user takes...

experimental: detect and bury the UNKWN workers -- the user takes responsibility by running -unkwn flag
parent c2c0e260
......@@ -123,7 +123,7 @@ sub check_worker_is_alive_and_mine {
sub kill_worker {
my $worker = pop @_;
my ($self, $worker, $fast) = @_;
my $cmd = 'kill -9 '.$worker->process_id();
system($cmd);
......
......@@ -160,9 +160,11 @@ sub check_worker_is_alive_and_mine {
sub kill_worker {
my $worker = pop @_;
my ($self, $worker, $fast) = @_;
my $cmd = 'bkill '.$worker->process_id();
my $fast_flag = $fast ? '-r ' : '';
my $cmd = "bkill $fast_flag".$worker->process_id();
# warn "LSF::kill_worker() running cmd:\n\t$cmd\n";
......
......@@ -335,7 +335,7 @@ sub meadow_type_2_name_2_users_of_running_workers {
sub check_for_dead_workers { # scans the whole Valley for lost Workers (but ignores unreachable ones)
my ($self, $valley, $check_buried_in_haste) = @_;
my ($self, $valley, $check_buried_in_haste, $bury_unkwn_workers) = @_;
my $last_few_seconds = 5; # FIXME: It is probably a good idea to expose this parameter for easier tuning.
......@@ -374,7 +374,22 @@ sub check_for_dead_workers { # scans the whole Valley for lost Workers (but i
my $meadow_type = $worker->meadow_type;
my $process_id = $worker->process_id;
if(my $status = $pid_to_worker_status->{$process_id}) { # can be RUN|PEND|xSUSP
my $status = $pid_to_worker_status->{$process_id};
if($bury_unkwn_workers and ($status eq 'UNKWN')) {
if( my $meadow = $valley->find_available_meadow_responsible_for_worker( $worker ) ) {
if($meadow->can('kill_worker')) {
if($worker->meadow_user eq $ENV{'USER'}) { # if I'm actually allowed to kill the worker...
warn "GarbageCollector:\tKilling/forgetting the UNKWN worker by process_id $process_id";
# $meadow->kill_worker($worker, 1);
# $status = ''; # make it look like LOST
}
}
}
}
if($status) { # can be RUN|PEND|xSUSP
$meadow_status_counts{$meadow_signature}{$status}++;
# only prepare once at most:
......
......@@ -50,6 +50,7 @@ sub main {
my $force = undef;
my $keep_alive = 0; # ==1 means run even when there is nothing to do
my $check_for_dead = 0;
my $bury_unkwn_workers = 0;
my $all_dead = 0;
my $balance_semaphores = 0;
my $job_id_for_output = 0;
......@@ -118,6 +119,7 @@ sub main {
'v|versions!' => \$report_versions,
'sync!' => \$sync,
'dead!' => \$check_for_dead,
'unkwn!' => \$bury_unkwn_workers,
'killworker=i' => \$kill_worker_id,
'alldead!' => \$all_dead,
'balance_semaphores'=> \$balance_semaphores,
......@@ -283,6 +285,7 @@ sub main {
if($all_dead) { $queen->register_all_workers_dead(); }
if($check_for_dead) { $queen->check_for_dead_workers($valley, 1); }
if($bury_unkwn_workers) { $queen->check_for_dead_workers($valley, 1, 1); }
if($balance_semaphores) { $self->{'dba'}->get_AnalysisJobAdaptor->balance_semaphores( $list_of_analyses ); }
if ($max_loops) { # positive $max_loop means limited, negative means unlimited
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment