Skip to content
Snippets Groups Projects
Commit f731a413 authored by Matthieu Muffato's avatar Matthieu Muffato
Browse files

Gets the birth/death events instead of sampling the database. The "NOTHING"...

Gets the birth/death events instead of sampling the database. The "NOTHING" curve is not needed any more
parent 819d3850
No related branches found
No related tags found
No related merge requests found
......@@ -29,7 +29,7 @@ exit(0);
sub main {
my ($url, $reg_conf, $reg_type, $reg_alias, $nosqlvc, $help, $start_date, $end_date, $granularity, $skip, $output, $top);
my ($url, $reg_conf, $reg_type, $reg_alias, $nosqlvc, $help, $start_date, $end_date, $output, $top, $logscale);
GetOptions(
# connect to the database:
......@@ -41,9 +41,8 @@ sub main {
'start_date=s' => \$start_date,
'end_date=s' => \$end_date,
'granularity=i' => \$granularity,
'skip_no_activity=i' => \$skip,
'top=f' => \$top,
'log=i' => \$logscale,
'output=s' => \$output,
'h|help' => \$help,
);
......@@ -65,12 +64,7 @@ sub main {
}
# Palette generated with R: c(brewer.pal(9, "Set1"), brewer.pal(12, "Set3")). #FFFFB3 is removed because it is too close to white
my @palette = qw(#E41A1C #377EB8 #4DAF4A #984EA3 #FF7F00 #FFFF33 #A65628 #F781BF #999999 #8DD3C7 #BEBADA #FB8072 #80B1D3 #FDB462 #B3DE69 #FCCDE5 #D9D9D9 #BC80BD #CCEBC5 #FFED6F);
# Default options
$granularity = 5 unless $granularity;
$skip = int(($skip || 2*60) / $granularity);
$top = scalar(@palette)-1 unless $top;
my @palette = qw(#E41A1C #377EB8 #4DAF4A #984EA3 #FF7F00 #FFFF33 #A65628 #F781BF #999999 #8DD3C7 #BEBADA #FB8072 #80B1D3 #FDB462 #B3DE69 #FCCDE5 #D9D9D9 #BC80BD #CCEBC5 #FFED6F #2F4F4F);
my %terminal_mapping = (
'emf' => 'emf',
......@@ -89,148 +83,136 @@ sub main {
}
my $activity_title = 'ACTIVITY';
my $dbh = $hive_dba->dbc->db_handle();
my @tmp_dates = @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(born, "%Y-%m-%dT%T"), analysis_id, 1 FROM worker WHERE analysis_id IS NOT NULL')};
push @tmp_dates, @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(died, "%Y-%m-%dT%T"), analysis_id, -1 FROM worker WHERE analysis_id IS NOT NULL')};
my @birth_death_dates = sort {$a->[0] cmp $b->[0]} @tmp_dates;
# Get the events from the database
my %events = ();
{
my @tmp_dates = @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(born, "%Y-%m-%dT%T"), analysis_id, 1 FROM worker WHERE analysis_id IS NOT NULL')};
push @tmp_dates, @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(died, "%Y-%m-%dT%T"), analysis_id, -1 FROM worker WHERE analysis_id IS NOT NULL')};
warn scalar(@tmp_dates), " events\n";
warn scalar(@birth_death_dates), " events\n";
#die Dumper $birth_death_dates[0];
foreach my $db_entry (@tmp_dates) {
my ($event_date, $analysis_id, $offset) = @$db_entry;
$events{$event_date}{$analysis_id} += $offset;
}
}
my @event_dates = sort {$a cmp $b} (keys %events);
warn scalar(@event_dates), " dates\n";
my $sql_analysis_names = 'SELECT analysis_id, logic_name FROM analysis_base';
my $data = $dbh->selectall_arrayref($sql_analysis_names);
my %name = (map {$_->[0] => $_->[1] } @$data);
#die Dumper \%name;
my @analysis_data = @{$dbh->selectall_arrayref($sql_analysis_names)};
my %name = (map {$_->[0] => $_->[1] } @analysis_data);
warn scalar(@analysis_data), " analysis\n";
my $max_workers = 0;
my @data_timings = ();
my %tot_analysis = ();
my $sum_a = 0;
my %tmp_interval;
my @activity;
while (scalar(@birth_death_dates)) {
my $num_curr_workers = 0;
my %hash_curr_workers = (map {$_->[0] => 0 } @analysis_data);
foreach my $event_date (@event_dates) {
my ($event_date, $analysis_id, $offset) = @{shift @birth_death_dates};
last if $event_date gt $end_date;
last if $end_date and ($event_date gt $end_date);
$tmp_interval{$analysis_id} += $offset;
$sum_a += $offset;
my %hash_interval = %tmp_interval;
my $topup_hash = $events{$event_date};
foreach my $analysis_id (keys %$topup_hash) {
$hash_curr_workers{$analysis_id} += $topup_hash->{$analysis_id};
$num_curr_workers += $topup_hash->{$analysis_id};
}
die if sum(values %hash_curr_workers) != $num_curr_workers;
next if $start_date and ($event_date lt $start_date);
my %hash_interval = %hash_curr_workers;
#FIXME It should be normalised by the length of the time interval
map {$tot_analysis{$_} += $hash_interval{$_}} keys %hash_interval;
$max_workers = $sum_a if ($sum_a > $max_workers);
next if $event_date lt $start_date;
$max_workers = $num_curr_workers if ($num_curr_workers > $max_workers);
# We can store the data
# We need to repeat the previous value to have an histogram shape
push @data_timings, [$event_date, $data_timings[-1]->[1]] if @data_timings;
push @data_timings, [$event_date, \%hash_interval];
unless ($sum_a) {
push @activity, [$event_date, 1];
push @activity, [$event_date, 0];
push @activity, [$birth_death_dates[0]->[0], 0] if scalar(@birth_death_dates);
push @activity, [$birth_death_dates[0]->[0], 1] if scalar(@birth_death_dates);
}
}
warn $max_workers;
warn Dumper \%tot_analysis;
my $total_total = sum(values %tot_analysis);
my @sorted_analysis_ids = sort {($tot_analysis{$b} <=> $tot_analysis{$a}) || (lc $name{$a} cmp lc $name{$b})} keys %tot_analysis;
my @sorted_analysis_ids = sort {($tot_analysis{$b} <=> $tot_analysis{$a}) || (lc $name{$a} cmp lc $name{$b})} (grep {$tot_analysis{$_}} keys %tot_analysis);
warn Dumper \@sorted_analysis_ids;
warn Dumper([map {$name{$_}} @sorted_analysis_ids]);
if (not $gnuplot_terminal) {
print join("\t", 'analysis', $activity_title, map {$name{$_}} @sorted_analysis_ids), "\n";
print join("\t", 'date', 'OVERALL', map {$name{$_}} @sorted_analysis_ids), "\n";
print join("\t", 'total', $total_total, map {$tot_analysis{$_}} @sorted_analysis_ids), "\n";
print join("\t", 'proportion', '0', map {$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
print join("\t", 'proportion', 'NA', map {$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
my $s = 0;
print join("\t", 'cum_proportion', '0', map {$s+=$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
print join("\t", 'cum_proportion', 'NA', map {$s+=$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
my @buffer = ();
foreach my $row (@data_timings) {
my $str = join("\t", $row->[0], sum(values %{$row->[1]}), map {$row->[1]->{$_} || 0} @sorted_analysis_ids)."\n";
if ($row->[1]) {
if (@buffer) {
my $n = scalar(@buffer);
if ($n > $skip) {
splice(@buffer, int($skip / 2), $n-$skip);
}
foreach my $old_str (@buffer) {
print $old_str;
}
@buffer = ();
}
print $str;
} else {
push @buffer, $str;
}
print join("\t", $row->[0], sum(values %{$row->[1]}), map {$row->[1]->{$_}} @sorted_analysis_ids)."\n";
}
return;
}
# Get the number of analysis we want to display
my $n_relevant_analysis = 0;
if ($top and $top > 0) {
my $n_relevant_analysis = scalar(@sorted_analysis_ids);
if ($top and ($top > 0)) {
if ($top < 1) {
my $s = 0;
map {my $pre_s = $s; $s += $tot_analysis{$_}/$total_total; $pre_s < .995 && $n_relevant_analysis++} @sorted_analysis_ids;
} else {
$n_relevant_analysis = $top
$n_relevant_analysis = 0;
map {my $pre_s = $s; $s += $tot_analysis{$_}/$total_total; $pre_s < $top && $n_relevant_analysis++} @sorted_analysis_ids;
} elsif ($top < scalar(@sorted_analysis_ids)) {
$n_relevant_analysis = $top;
}
} else {
$n_relevant_analysis = scalar(@sorted_analysis_ids);
}
# cap based on the length of the palette
my $need_other_analysis = $n_relevant_analysis < scalar(@sorted_analysis_ids) ? 1 : 0;
if (($n_relevant_analysis+$need_other_analysis) > scalar(@palette)) {
$n_relevant_analysis = scalar(@palette) - 1;
$need_other_analysis = 1;
}
$top = $n_relevant_analysis unless $top;
warn $n_relevant_analysis;
my @xdata = map {$_->[0]} @data_timings;
my @datasets = ();
{
push @datasets, Chart::Gnuplot::DataSet->new(
xdata => [map {$_->[0]} @activity],
ydata => [map {$max_workers*(1-0.03*$_->[1])} @activity],
timefmt => '%Y-%m-%dT%H:%M:%S',
title => $activity_title,
style => sprintf('filledcurves below y1=%d', int($max_workers)),
linetype => '2',
linewidth => '0',
color => '#2F4F4F',
);
}
{
my $pseudo_zero_value = $logscale ? .8 : -$max_workers / 50;
# The background plot: the sum of all the analysis
if ($need_other_analysis) {
my @ydata = ();
foreach my $row (@data_timings) {
push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids );
push @ydata, sum(map {$row->[1]->{$_}} @sorted_analysis_ids ) || $pseudo_zero_value;
}
push @datasets, Chart::Gnuplot::DataSet->new(
xdata => \@xdata,
ydata => \@ydata,
timefmt => '%Y-%m-%dT%H:%M:%S',
title => 'OTHER',
style => 'filledcurves',
style => 'filledcurves x1',
linewidth => '0',
color => $palette[$n_relevant_analysis],
);
}
# Each analysis is plotted as the sum of itself and the top ones
foreach my $i (reverse 1..$n_relevant_analysis) {
my @ydata;
foreach my $row (@data_timings) {
push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids[0..($i-1)] );
push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids[0..($i-1)] ) || $pseudo_zero_value;
}
my $dataset = Chart::Gnuplot::DataSet->new(
xdata => \@xdata,
ydata => \@ydata,
timefmt => '%Y-%m-%dT%H:%M:%S',
title => $name{$sorted_analysis_ids[$i-1]},
style => 'filledcurves',
style => 'filledcurves x1',
linewidth => '0',
color => $palette[$i-1],
);
......@@ -238,14 +220,14 @@ sub main {
}
my $chart = Chart::Gnuplot->new(
title => sprintf('Profile of %s', $n_relevant_analysis < scalar(@sorted_analysis_ids) ? ($top < 1 ? sprintf('%.1f%% of %s', 100*$top, $url) : "the $top top-analysis of $url") : $url).($start_date ? " from $start_date" : "").($end_date ? " until $end_date" : ""),
title => sprintf('Profile of %s', $n_relevant_analysis < scalar(@sorted_analysis_ids) ? ($top < 1 ? sprintf('%.1f%% of %s', 100*$top, $url) : "the $top top-analysis of $url") : $url).($start_date ? " from $start_date" : "").($end_date ? " to $end_date" : ""),
timeaxis => 'x',
legend => {
position => 'outside right',
align => 'left',
},
xtics => {
labelfmt => '%b %d',
labelfmt => '%b %d\n %H:00',
},
bg => {
color => 'white',
......@@ -254,6 +236,8 @@ sub main {
output => $output,
terminal => $terminal_mapping{$gnuplot_terminal},
ylabel => 'Number of workers',
yrange => [$pseudo_zero_value, undef],
$logscale ? (logscale => 'y') : (),
);
$chart->plot2d(@datasets);
......@@ -280,22 +264,17 @@ __DATA__
You can optionally ask the script to generate an image with Gnuplot.
Please note the script runs a query for each interval (default: 5 minutes), which can take some time
for long-running pipelines.
=head1 USAGE EXAMPLES
# Just run it the usual way: only the top 19 analysis will be reported in CSV format
# Just run it the usual way: only the top 20 analyses will be reported in CSV format
generate_profile.pl -url mysql://username:secret@hostname:port/database > profile.csv
# The same, but reporting the analyses that account for 99.5% of the global activity, as a PNG file
generate_profile.pl -url mysql://username:secret@hostname:port/database -top .995 -output profile.png
# Assuming you are only interested in a precise interval (in a PNG file)
generate_profile.pl -url mysql://username:secret@hostname:port/database -start_date 2013-06-15T10:34 -end_date 2013-06-15T16:58 -granularity 1 -output profile.png
# Assuming that the pipeline has large periods of inactivity
generate_profile.pl -url mysql://username:secret@hostname:port/database -granularity 10 -skip_no_activity 1 > profile.csv
generate_profile.pl -url mysql://username:secret@hostname:port/database -start_date 2013-06-15T10:34 -end_date 2013-06-15T16:58 -output profile.png
=head1 OPTIONS
......@@ -303,9 +282,7 @@ __DATA__
-url <url string> : url defining where hive database is located
-start_date <date> : minimal start date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
-end_date <date> : maximal end date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
-granularity <int> : size of the intervals on which the activity is computed (minutes) (default: 5)
-skip_no_activity <int> : only for CSV output: shrink the periods of inactivity which are longer than "skip_no_activity" hours (default: 2)
-top <float> : maximum number (> 1) or fraction (< 1) of analysis to report (default: 19)
-top <float> : maximum number (>= 1) or fraction (< 1) of analyses to report (default: 20)
-output <string> : output file: its extension must match one of the Gnuplot terminals. Otherwise, the CSV output is produced on stdout
=head1 CONTACT
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment