Commit efaa8c4c authored by Leo Gordon's avatar Leo Gordon
Browse files

record the completion datetime when registering worker death

parent 0b75aa0d
......@@ -201,10 +201,10 @@ sub parse_report_source_line {
if( my ($process_id) = $lines[0]=~/^Job <(\d+(?:\[\d+\])?)>/) {
my ($exit_status, $exception_status) = ('' x 2);
my ($completion_datetime, $cause_of_death);
my ($died, $cause_of_death);
foreach (@lines) {
if( /^(\w+\s+\w+\s+\d+\s+\d+:\d+:\d+):\s+Completed\s<(\w+)>(?:\.|;\s+(\w+))/ ) {
$completion_datetime = _yearless_2_datetime($1);
$died = _yearless_2_datetime($1);
$cause_of_death = $status_2_cod{$3};
$exit_status = $2 . ($3 ? "/$3" : '');
}
......@@ -224,15 +224,18 @@ sub parse_report_source_line {
my ($swap_in_units, $swap_unit) = $usage{'SWAP'} =~ /^([\d\.]+)([KMGT])$/;
$report_entry{ $process_id } = {
'completion_datetime' => $completion_datetime,
'cause_of_death' => $cause_of_death,
'exit_status' => $exit_status,
'exception_status' => $exception_status,
'mem_megs' => $mem_in_units * $units_2_megs{$mem_unit},
'swap_megs' => $swap_in_units * $units_2_megs{$swap_unit},
'pending_sec' => $usage{'WAIT'},
'cpu_sec' => $usage{'CPU_T'},
'lifespan_sec' => $usage{'TURNAROUND'},
# entries for 'worker' table:
'died' => $died,
'cause_of_death' => $cause_of_death,
# entries for 'worker_resource_usage' table:
'exit_status' => $exit_status,
'exception_status' => $exception_status,
'mem_megs' => $mem_in_units * $units_2_megs{$mem_unit},
'swap_megs' => $swap_in_units * $units_2_megs{$swap_unit},
'pending_sec' => $usage{'WAIT'},
'cpu_sec' => $usage{'CPU_T'},
'lifespan_sec' => $usage{'TURNAROUND'},
};
}
}
......@@ -242,26 +245,22 @@ sub parse_report_source_line {
}
sub find_out_causes {
sub get_report_entries_for_process_ids {
my $self = shift @_;
my %causes_of_death = ();
my %combined_report_entries = ();
while (my $pid_batch = join(' ', map { "'$_'" } splice(@_, 0, 20))) { # can't fit too many pids on one shell cmdline
my $cmd = "bacct -l $pid_batch |";
# warn "LSF::find_out_causes() running cmd:\n\t$cmd\n";
# warn "LSF::get_combined_report() running cmd:\n\t$cmd\n";
my $report_entries = parse_report_source_line( $cmd );
my $batch_of_report_entries = parse_report_source_line( $cmd );
while( my ($process_id, $report_entry) = each %$report_entries ) {
if(my $cause_of_death = $report_entry->{'cause_of_death'}) {
$causes_of_death{ $process_id } = $cause_of_death;
}
}
%combined_report_entries = (%combined_report_entries, %$batch_of_report_entries);
}
return \%causes_of_death;
return \%combined_report_entries;
}
......
......@@ -315,15 +315,16 @@ sub register_worker_death {
return unless($worker);
my $cod = $worker->cause_of_death() || 'UNKNOWN'; # make sure we do not attempt to insert a void
# FIXME: make it possible to set the 'died' timestamp if we have detected it from logs:
my $sql = qq{UPDATE worker SET died=CURRENT_TIMESTAMP
} . ( $self_burial ? ',last_check_in=CURRENT_TIMESTAMP ' : '') . qq{
,status='DEAD'
,work_done='}. $worker->work_done . qq{'
,cause_of_death='$cod'
WHERE worker_id='}. $worker->dbID . qq{'};
my $worker_id = $worker->dbID;
my $work_done = $worker->work_done;
my $cause_of_death = $worker->cause_of_death || 'UNKNOWN'; # make sure we do not attempt to insert a void
my $died = $worker->died;
my $sql = "UPDATE worker SET status='DEAD', work_done='$work_done', cause_of_death='$cause_of_death'"
. ( $self_burial ? ', last_check_in=CURRENT_TIMESTAMP ' : '' )
. ( $died ? ", died='$died'" : ', died=CURRENT_TIMESTAMP' )
. " WHERE worker_id='$worker_id' ";
$self->dbc->do( $sql );
if(my $analysis_id = $worker->analysis_id) {
......@@ -333,10 +334,10 @@ sub register_worker_death {
$analysis_stats_adaptor->decrease_running_workers($worker->analysis_id);
}
unless( $cod eq 'NO_WORK'
or $cod eq 'JOB_LIMIT'
or $cod eq 'HIVE_OVERLOAD'
or $cod eq 'LIFESPAN'
unless( $cause_of_death eq 'NO_WORK'
or $cause_of_death eq 'JOB_LIMIT'
or $cause_of_death eq 'HIVE_OVERLOAD'
or $cause_of_death eq 'LIFESPAN'
) {
$self->db->get_AnalysisJobAdaptor->release_undone_jobs_from_worker($worker);
}
......@@ -351,7 +352,7 @@ sub register_worker_death {
}
sub check_for_dead_workers { # scans the whole Valley for lost Workers (but ignores unreachagle ones)
sub check_for_dead_workers { # scans the whole Valley for lost Workers (but ignores unreachable ones)
my ($self, $valley, $check_buried_in_haste) = @_;
warn "GarbageCollector:\tChecking for lost Workers...\n";
......@@ -396,10 +397,14 @@ sub check_for_dead_workers { # scans the whole Valley for lost Workers (but i
if(my $lost_this_meadow = scalar(keys %$pid_to_lost_worker) ) {
warn "GarbageCollector:\tDiscovered $lost_this_meadow lost $meadow_type Workers\n";
my $wpid_to_cod = {};
my $report_entries = {};
if($this_meadow->can('find_out_causes')) {
$wpid_to_cod = $this_meadow->find_out_causes( keys %$pid_to_lost_worker );
my $lost_with_known_cod = scalar(keys %$wpid_to_cod);
die "Your Meadow::$meadow_type driver now has to support get_report_entries_for_process_ids() method instead of find_out_causes(). Please update it.\n";
} elsif($this_meadow->can('get_report_entries_for_process_ids')) {
$report_entries = $this_meadow->get_report_entries_for_process_ids( keys %$pid_to_lost_worker );
my $lost_with_known_cod = scalar( grep { $_->{'cause_of_death'} } values %$report_entries);
warn "GarbageCollector:\tFound why $lost_with_known_cod of $meadow_type Workers died\n";
} else {
warn "GarbageCollector:\t$meadow_type meadow does not support post-mortem examination\n";
......@@ -407,8 +412,9 @@ sub check_for_dead_workers { # scans the whole Valley for lost Workers (but i
warn "GarbageCollector:\tReleasing the jobs\n";
while(my ($process_id, $worker) = each %$pid_to_lost_worker) {
$worker->cause_of_death( $wpid_to_cod->{$process_id} || 'UNKNOWN');
$self->register_worker_death($worker);
$worker->died( $report_entries->{$process_id}{'died'} );
$worker->cause_of_death( $report_entries->{$process_id}{'cause_of_death'} );
$self->register_worker_death( $worker );
}
}
}
......@@ -741,8 +747,7 @@ sub register_all_workers_dead {
my $all_workers_considered_alive = $self->fetch_all( "status!='DEAD'" );
foreach my $worker (@{$all_workers_considered_alive}) {
$worker->cause_of_death( 'UNKNOWN' ); # well, maybe we could have investigated further...
$self->register_worker_death($worker);
$self->register_worker_death( $worker );
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment