Commit efaa8c4c authored by Leo Gordon's avatar Leo Gordon
Browse files

record the completion datetime when registering worker death

parent 0b75aa0d
...@@ -201,10 +201,10 @@ sub parse_report_source_line { ...@@ -201,10 +201,10 @@ sub parse_report_source_line {
if( my ($process_id) = $lines[0]=~/^Job <(\d+(?:\[\d+\])?)>/) { if( my ($process_id) = $lines[0]=~/^Job <(\d+(?:\[\d+\])?)>/) {
my ($exit_status, $exception_status) = ('' x 2); my ($exit_status, $exception_status) = ('' x 2);
my ($completion_datetime, $cause_of_death); my ($died, $cause_of_death);
foreach (@lines) { foreach (@lines) {
if( /^(\w+\s+\w+\s+\d+\s+\d+:\d+:\d+):\s+Completed\s<(\w+)>(?:\.|;\s+(\w+))/ ) { if( /^(\w+\s+\w+\s+\d+\s+\d+:\d+:\d+):\s+Completed\s<(\w+)>(?:\.|;\s+(\w+))/ ) {
$completion_datetime = _yearless_2_datetime($1); $died = _yearless_2_datetime($1);
$cause_of_death = $status_2_cod{$3}; $cause_of_death = $status_2_cod{$3};
$exit_status = $2 . ($3 ? "/$3" : ''); $exit_status = $2 . ($3 ? "/$3" : '');
} }
...@@ -224,15 +224,18 @@ sub parse_report_source_line { ...@@ -224,15 +224,18 @@ sub parse_report_source_line {
my ($swap_in_units, $swap_unit) = $usage{'SWAP'} =~ /^([\d\.]+)([KMGT])$/; my ($swap_in_units, $swap_unit) = $usage{'SWAP'} =~ /^([\d\.]+)([KMGT])$/;
$report_entry{ $process_id } = { $report_entry{ $process_id } = {
'completion_datetime' => $completion_datetime, # entries for 'worker' table:
'cause_of_death' => $cause_of_death, 'died' => $died,
'exit_status' => $exit_status, 'cause_of_death' => $cause_of_death,
'exception_status' => $exception_status,
'mem_megs' => $mem_in_units * $units_2_megs{$mem_unit}, # entries for 'worker_resource_usage' table:
'swap_megs' => $swap_in_units * $units_2_megs{$swap_unit}, 'exit_status' => $exit_status,
'pending_sec' => $usage{'WAIT'}, 'exception_status' => $exception_status,
'cpu_sec' => $usage{'CPU_T'}, 'mem_megs' => $mem_in_units * $units_2_megs{$mem_unit},
'lifespan_sec' => $usage{'TURNAROUND'}, 'swap_megs' => $swap_in_units * $units_2_megs{$swap_unit},
'pending_sec' => $usage{'WAIT'},
'cpu_sec' => $usage{'CPU_T'},
'lifespan_sec' => $usage{'TURNAROUND'},
}; };
} }
} }
...@@ -242,26 +245,22 @@ sub parse_report_source_line { ...@@ -242,26 +245,22 @@ sub parse_report_source_line {
} }
sub find_out_causes { sub get_report_entries_for_process_ids {
my $self = shift @_; my $self = shift @_;
my %causes_of_death = (); my %combined_report_entries = ();
while (my $pid_batch = join(' ', map { "'$_'" } splice(@_, 0, 20))) { # can't fit too many pids on one shell cmdline while (my $pid_batch = join(' ', map { "'$_'" } splice(@_, 0, 20))) { # can't fit too many pids on one shell cmdline
my $cmd = "bacct -l $pid_batch |"; my $cmd = "bacct -l $pid_batch |";
# warn "LSF::find_out_causes() running cmd:\n\t$cmd\n"; # warn "LSF::get_combined_report() running cmd:\n\t$cmd\n";
my $report_entries = parse_report_source_line( $cmd ); my $batch_of_report_entries = parse_report_source_line( $cmd );
while( my ($process_id, $report_entry) = each %$report_entries ) { %combined_report_entries = (%combined_report_entries, %$batch_of_report_entries);
if(my $cause_of_death = $report_entry->{'cause_of_death'}) {
$causes_of_death{ $process_id } = $cause_of_death;
}
}
} }
return \%causes_of_death; return \%combined_report_entries;
} }
......
...@@ -315,15 +315,16 @@ sub register_worker_death { ...@@ -315,15 +315,16 @@ sub register_worker_death {
return unless($worker); return unless($worker);
my $cod = $worker->cause_of_death() || 'UNKNOWN'; # make sure we do not attempt to insert a void my $worker_id = $worker->dbID;
my $work_done = $worker->work_done;
# FIXME: make it possible to set the 'died' timestamp if we have detected it from logs: my $cause_of_death = $worker->cause_of_death || 'UNKNOWN'; # make sure we do not attempt to insert a void
my $sql = qq{UPDATE worker SET died=CURRENT_TIMESTAMP my $died = $worker->died;
} . ( $self_burial ? ',last_check_in=CURRENT_TIMESTAMP ' : '') . qq{
,status='DEAD' my $sql = "UPDATE worker SET status='DEAD', work_done='$work_done', cause_of_death='$cause_of_death'"
,work_done='}. $worker->work_done . qq{' . ( $self_burial ? ', last_check_in=CURRENT_TIMESTAMP ' : '' )
,cause_of_death='$cod' . ( $died ? ", died='$died'" : ', died=CURRENT_TIMESTAMP' )
WHERE worker_id='}. $worker->dbID . qq{'}; . " WHERE worker_id='$worker_id' ";
$self->dbc->do( $sql ); $self->dbc->do( $sql );
if(my $analysis_id = $worker->analysis_id) { if(my $analysis_id = $worker->analysis_id) {
...@@ -333,10 +334,10 @@ sub register_worker_death { ...@@ -333,10 +334,10 @@ sub register_worker_death {
$analysis_stats_adaptor->decrease_running_workers($worker->analysis_id); $analysis_stats_adaptor->decrease_running_workers($worker->analysis_id);
} }
unless( $cod eq 'NO_WORK' unless( $cause_of_death eq 'NO_WORK'
or $cod eq 'JOB_LIMIT' or $cause_of_death eq 'JOB_LIMIT'
or $cod eq 'HIVE_OVERLOAD' or $cause_of_death eq 'HIVE_OVERLOAD'
or $cod eq 'LIFESPAN' or $cause_of_death eq 'LIFESPAN'
) { ) {
$self->db->get_AnalysisJobAdaptor->release_undone_jobs_from_worker($worker); $self->db->get_AnalysisJobAdaptor->release_undone_jobs_from_worker($worker);
} }
...@@ -351,7 +352,7 @@ sub register_worker_death { ...@@ -351,7 +352,7 @@ sub register_worker_death {
} }
sub check_for_dead_workers { # scans the whole Valley for lost Workers (but ignores unreachagle ones) sub check_for_dead_workers { # scans the whole Valley for lost Workers (but ignores unreachable ones)
my ($self, $valley, $check_buried_in_haste) = @_; my ($self, $valley, $check_buried_in_haste) = @_;
warn "GarbageCollector:\tChecking for lost Workers...\n"; warn "GarbageCollector:\tChecking for lost Workers...\n";
...@@ -396,10 +397,14 @@ sub check_for_dead_workers { # scans the whole Valley for lost Workers (but i ...@@ -396,10 +397,14 @@ sub check_for_dead_workers { # scans the whole Valley for lost Workers (but i
if(my $lost_this_meadow = scalar(keys %$pid_to_lost_worker) ) { if(my $lost_this_meadow = scalar(keys %$pid_to_lost_worker) ) {
warn "GarbageCollector:\tDiscovered $lost_this_meadow lost $meadow_type Workers\n"; warn "GarbageCollector:\tDiscovered $lost_this_meadow lost $meadow_type Workers\n";
my $wpid_to_cod = {}; my $report_entries = {};
if($this_meadow->can('find_out_causes')) { if($this_meadow->can('find_out_causes')) {
$wpid_to_cod = $this_meadow->find_out_causes( keys %$pid_to_lost_worker ); die "Your Meadow::$meadow_type driver now has to support get_report_entries_for_process_ids() method instead of find_out_causes(). Please update it.\n";
my $lost_with_known_cod = scalar(keys %$wpid_to_cod);
} elsif($this_meadow->can('get_report_entries_for_process_ids')) {
$report_entries = $this_meadow->get_report_entries_for_process_ids( keys %$pid_to_lost_worker );
my $lost_with_known_cod = scalar( grep { $_->{'cause_of_death'} } values %$report_entries);
warn "GarbageCollector:\tFound why $lost_with_known_cod of $meadow_type Workers died\n"; warn "GarbageCollector:\tFound why $lost_with_known_cod of $meadow_type Workers died\n";
} else { } else {
warn "GarbageCollector:\t$meadow_type meadow does not support post-mortem examination\n"; warn "GarbageCollector:\t$meadow_type meadow does not support post-mortem examination\n";
...@@ -407,8 +412,9 @@ sub check_for_dead_workers { # scans the whole Valley for lost Workers (but i ...@@ -407,8 +412,9 @@ sub check_for_dead_workers { # scans the whole Valley for lost Workers (but i
warn "GarbageCollector:\tReleasing the jobs\n"; warn "GarbageCollector:\tReleasing the jobs\n";
while(my ($process_id, $worker) = each %$pid_to_lost_worker) { while(my ($process_id, $worker) = each %$pid_to_lost_worker) {
$worker->cause_of_death( $wpid_to_cod->{$process_id} || 'UNKNOWN'); $worker->died( $report_entries->{$process_id}{'died'} );
$self->register_worker_death($worker); $worker->cause_of_death( $report_entries->{$process_id}{'cause_of_death'} );
$self->register_worker_death( $worker );
} }
} }
} }
...@@ -741,8 +747,7 @@ sub register_all_workers_dead { ...@@ -741,8 +747,7 @@ sub register_all_workers_dead {
my $all_workers_considered_alive = $self->fetch_all( "status!='DEAD'" ); my $all_workers_considered_alive = $self->fetch_all( "status!='DEAD'" );
foreach my $worker (@{$all_workers_considered_alive}) { foreach my $worker (@{$all_workers_considered_alive}) {
$worker->cause_of_death( 'UNKNOWN' ); # well, maybe we could have investigated further... $self->register_worker_death( $worker );
$self->register_worker_death($worker);
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment