Commit f731a413 authored by Matthieu Muffato

Gets the birth/death events instead of sampling the database. The "NOTHING" curve is not needed any more
parent 819d3850
@@ -29,7 +29,7 @@ exit(0);
sub main {
my ($url, $reg_conf, $reg_type, $reg_alias, $nosqlvc, $help, $start_date, $end_date, $granularity, $skip, $output, $top);
my ($url, $reg_conf, $reg_type, $reg_alias, $nosqlvc, $help, $start_date, $end_date, $output, $top, $logscale);
GetOptions(
# connect to the database:
@@ -41,9 +41,8 @@ sub main {
'start_date=s' => \$start_date,
'end_date=s' => \$end_date,
'granularity=i' => \$granularity,
'skip_no_activity=i' => \$skip,
'top=f' => \$top,
'log=i' => \$logscale,
'output=s' => \$output,
'h|help' => \$help,
);
@@ -65,12 +64,7 @@ sub main {
}
# Palette generated with R: c(brewer.pal(9, "Set1"), brewer.pal(12, "Set3")). #FFFFB3 is removed because it is too close to white
my @palette = qw(#E41A1C #377EB8 #4DAF4A #984EA3 #FF7F00 #FFFF33 #A65628 #F781BF #999999 #8DD3C7 #BEBADA #FB8072 #80B1D3 #FDB462 #B3DE69 #FCCDE5 #D9D9D9 #BC80BD #CCEBC5 #FFED6F);
# Default options
$granularity = 5 unless $granularity;
$skip = int(($skip || 2*60) / $granularity);
$top = scalar(@palette)-1 unless $top;
my @palette = qw(#E41A1C #377EB8 #4DAF4A #984EA3 #FF7F00 #FFFF33 #A65628 #F781BF #999999 #8DD3C7 #BEBADA #FB8072 #80B1D3 #FDB462 #B3DE69 #FCCDE5 #D9D9D9 #BC80BD #CCEBC5 #FFED6F #2F4F4F);
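# (this revision appends a 21st colour, #2F4F4F / dark slate gray, to the 20 above)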
my %terminal_mapping = (
'emf' => 'emf',
@@ -89,148 +83,136 @@ sub main {
}
my $activity_title = 'ACTIVITY';
my $dbh = $hive_dba->dbc->db_handle();
my @tmp_dates = @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(born, "%Y-%m-%dT%T"), analysis_id, 1 FROM worker WHERE analysis_id IS NOT NULL')};
push @tmp_dates, @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(died, "%Y-%m-%dT%T"), analysis_id, -1 FROM worker WHERE analysis_id IS NOT NULL')};
my @birth_death_dates = sort {$a->[0] cmp $b->[0]} @tmp_dates;
# Get the events from the database
my %events = ();
{
my @tmp_dates = @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(born, "%Y-%m-%dT%T"), analysis_id, 1 FROM worker WHERE analysis_id IS NOT NULL')};
push @tmp_dates, @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(died, "%Y-%m-%dT%T"), analysis_id, -1 FROM worker WHERE analysis_id IS NOT NULL')};
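# Each worker contributes a +1 event on its birth date and a -1 event on its
# death date; accumulated in date order, these offsets give the exact number
# of live workers per analysis at any moment, replacing the old sampling.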
warn scalar(@tmp_dates), " events\n";
warn scalar(@birth_death_dates), " events\n";
#die Dumper $birth_death_dates[0];
foreach my $db_entry (@tmp_dates) {
my ($event_date, $analysis_id, $offset) = @$db_entry;
$events{$event_date}{$analysis_id} += $offset;
}
}
my @event_dates = sort {$a cmp $b} (keys %events);
warn scalar(@event_dates), " dates\n";
my $sql_analysis_names = 'SELECT analysis_id, logic_name FROM analysis_base';
my $data = $dbh->selectall_arrayref($sql_analysis_names);
my %name = (map {$_->[0] => $_->[1] } @$data);
#die Dumper \%name;
my @analysis_data = @{$dbh->selectall_arrayref($sql_analysis_names)};
my %name = (map {$_->[0] => $_->[1] } @analysis_data);
warn scalar(@analysis_data), " analysis\n";
my $max_workers = 0;
my @data_timings = ();
my %tot_analysis = ();
my $sum_a = 0;
my %tmp_interval;
my @activity;
while (scalar(@birth_death_dates)) {
my $num_curr_workers = 0;
my %hash_curr_workers = (map {$_->[0] => 0 } @analysis_data);
foreach my $event_date (@event_dates) {
my ($event_date, $analysis_id, $offset) = @{shift @birth_death_dates};
last if $event_date gt $end_date;
last if $end_date and ($event_date gt $end_date);
$tmp_interval{$analysis_id} += $offset;
$sum_a += $offset;
my %hash_interval = %tmp_interval;
my $topup_hash = $events{$event_date};
foreach my $analysis_id (keys %$topup_hash) {
$hash_curr_workers{$analysis_id} += $topup_hash->{$analysis_id};
$num_curr_workers += $topup_hash->{$analysis_id};
}
die if sum(values %hash_curr_workers) != $num_curr_workers;
next if $start_date and ($event_date lt $start_date);
my %hash_interval = %hash_curr_workers;
#FIXME It should be normalised by the length of the time interval
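# (a possible fix, sketched only: weight each count by the duration of the
#  interval it covers, e.g. $tot_analysis{$_} += $hash_interval{$_} * $interval_seconds;
#  $interval_seconds is hypothetical -- it would be the time from this event to the next)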
map {$tot_analysis{$_} += $hash_interval{$_}} keys %hash_interval;
$max_workers = $sum_a if ($sum_a > $max_workers);
next if $event_date lt $start_date;
$max_workers = $num_curr_workers if ($num_curr_workers > $max_workers);
# We can store the data
# We need to repeat the previous value to give the plot a histogram (step) shape
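# (e.g. values 3 at t1 then 5 at t2 are emitted as the points (t1,3), (t2,3),
#  (t2,5): a flat segment followed by a vertical step)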
push @data_timings, [$event_date, $data_timings[-1]->[1]] if @data_timings;
push @data_timings, [$event_date, \%hash_interval];
unless ($sum_a) {
push @activity, [$event_date, 1];
push @activity, [$event_date, 0];
push @activity, [$birth_death_dates[0]->[0], 0] if scalar(@birth_death_dates);
push @activity, [$birth_death_dates[0]->[0], 1] if scalar(@birth_death_dates);
}
}
warn $max_workers;
warn Dumper \%tot_analysis;
my $total_total = sum(values %tot_analysis);
my @sorted_analysis_ids = sort {($tot_analysis{$b} <=> $tot_analysis{$a}) || (lc $name{$a} cmp lc $name{$b})} keys %tot_analysis;
my @sorted_analysis_ids = sort {($tot_analysis{$b} <=> $tot_analysis{$a}) || (lc $name{$a} cmp lc $name{$b})} (grep {$tot_analysis{$_}} keys %tot_analysis);
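# (the grep drops analyses that never had a live worker in the plotted window,
#  so they do not consume one of the limited palette colours)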
warn Dumper \@sorted_analysis_ids;
warn Dumper([map {$name{$_}} @sorted_analysis_ids]);
if (not $gnuplot_terminal) {
print join("\t", 'analysis', $activity_title, map {$name{$_}} @sorted_analysis_ids), "\n";
print join("\t", 'date', 'OVERALL', map {$name{$_}} @sorted_analysis_ids), "\n";
print join("\t", 'total', $total_total, map {$tot_analysis{$_}} @sorted_analysis_ids), "\n";
print join("\t", 'proportion', '0', map {$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
print join("\t", 'proportion', 'NA', map {$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
my $s = 0;
print join("\t", 'cum_proportion', '0', map {$s+=$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
print join("\t", 'cum_proportion', 'NA', map {$s+=$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
my @buffer = ();
foreach my $row (@data_timings) {
my $str = join("\t", $row->[0], sum(values %{$row->[1]}), map {$row->[1]->{$_} || 0} @sorted_analysis_ids)."\n";
if ($row->[1]) {
if (@buffer) {
my $n = scalar(@buffer);
if ($n > $skip) {
splice(@buffer, int($skip / 2), $n-$skip);
}
foreach my $old_str (@buffer) {
print $old_str;
}
@buffer = ();
}
print $str;
} else {
push @buffer, $str;
}
print join("\t", $row->[0], sum(values %{$row->[1]}), map {$row->[1]->{$_}} @sorted_analysis_ids)."\n";
}
return;
}
# Get the number of analyses we want to display
my $n_relevant_analysis = 0;
if ($top and $top > 0) {
my $n_relevant_analysis = scalar(@sorted_analysis_ids);
if ($top and ($top > 0)) {
if ($top < 1) {
my $s = 0;
map {my $pre_s = $s; $s += $tot_analysis{$_}/$total_total; $pre_s < .995 && $n_relevant_analysis++} @sorted_analysis_ids;
} else {
$n_relevant_analysis = $top
$n_relevant_analysis = 0;
map {my $pre_s = $s; $s += $tot_analysis{$_}/$total_total; $pre_s < $top && $n_relevant_analysis++} @sorted_analysis_ids;
} elsif ($top < scalar(@sorted_analysis_ids)) {
$n_relevant_analysis = $top;
}
} else {
$n_relevant_analysis = scalar(@sorted_analysis_ids);
}
# cap based on the length of the palette
my $need_other_analysis = $n_relevant_analysis < scalar(@sorted_analysis_ids) ? 1 : 0;
if (($n_relevant_analysis+$need_other_analysis) > scalar(@palette)) {
$n_relevant_analysis = scalar(@palette) - 1;
$need_other_analysis = 1;
}
$top = $n_relevant_analysis unless $top;
warn $n_relevant_analysis;
my @xdata = map {$_->[0]} @data_timings;
my @datasets = ();
{
push @datasets, Chart::Gnuplot::DataSet->new(
xdata => [map {$_->[0]} @activity],
ydata => [map {$max_workers*(1-0.03*$_->[1])} @activity],
timefmt => '%Y-%m-%dT%H:%M:%S',
title => $activity_title,
style => sprintf('filledcurves below y1=%d', int($max_workers)),
linetype => '2',
linewidth => '0',
color => '#2F4F4F',
);
}
{
my $pseudo_zero_value = $logscale ? .8 : -$max_workers / 50;
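# (a real zero cannot be shown on a logarithmic axis, hence a "pseudo zero"
#  just below 1 when -log is set; on a linear axis, a slightly negative value
#  lets the filled curves visibly drop to the baseline between runs)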
# The background plot: the sum of all the analysis
if ($need_other_analysis) {
my @ydata = ();
foreach my $row (@data_timings) {
push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids );
push @ydata, sum(map {$row->[1]->{$_}} @sorted_analysis_ids ) || $pseudo_zero_value;
}
push @datasets, Chart::Gnuplot::DataSet->new(
xdata => \@xdata,
ydata => \@ydata,
timefmt => '%Y-%m-%dT%H:%M:%S',
title => 'OTHER',
style => 'filledcurves',
style => 'filledcurves x1',
linewidth => '0',
color => $palette[$n_relevant_analysis],
);
}
# Each analysis is plotted as the sum of itself and the top ones
foreach my $i (reverse 1..$n_relevant_analysis) {
my @ydata;
foreach my $row (@data_timings) {
push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids[0..($i-1)] );
push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids[0..($i-1)] ) || $pseudo_zero_value;
}
my $dataset = Chart::Gnuplot::DataSet->new(
xdata => \@xdata,
ydata => \@ydata,
timefmt => '%Y-%m-%dT%H:%M:%S',
title => $name{$sorted_analysis_ids[$i-1]},
style => 'filledcurves',
style => 'filledcurves x1',
linewidth => '0',
color => $palette[$i-1],
);
@@ -238,14 +220,14 @@ sub main {
}
my $chart = Chart::Gnuplot->new(
title => sprintf('Profile of %s', $n_relevant_analysis < scalar(@sorted_analysis_ids) ? ($top < 1 ? sprintf('%.1f%% of %s', 100*$top, $url) : "the $top top-analysis of $url") : $url).($start_date ? " from $start_date" : "").($end_date ? " until $end_date" : ""),
title => sprintf('Profile of %s', $n_relevant_analysis < scalar(@sorted_analysis_ids) ? ($top < 1 ? sprintf('%.1f%% of %s', 100*$top, $url) : "the $top top-analysis of $url") : $url).($start_date ? " from $start_date" : "").($end_date ? " to $end_date" : ""),
timeaxis => 'x',
legend => {
position => 'outside right',
align => 'left',
},
xtics => {
labelfmt => '%b %d',
labelfmt => '%b %d\n %H:00',
},
bg => {
color => 'white',
@@ -254,6 +236,8 @@ sub main {
output => $output,
terminal => $terminal_mapping{$gnuplot_terminal},
ylabel => 'Number of workers',
yrange => [$pseudo_zero_value, undef],
$logscale ? (logscale => 'y') : (),
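# (when -log is not set, the empty list () expands to nothing, so no
#  logscale argument is passed to Chart::Gnuplot at all)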
);
$chart->plot2d(@datasets);
@@ -280,22 +264,17 @@ __DATA__
You can optionally ask the script to generate an image with Gnuplot.
Please note that the script runs a query for each interval (default: 5 minutes), which can take some time
for long-running pipelines.
=head1 USAGE EXAMPLES
# Just run it the usual way: only the top 19 analyses will be reported in CSV format
# Just run it the usual way: only the top 20 analyses will be reported in CSV format
generate_profile.pl -url mysql://username:secret@hostname:port/database > profile.csv
# The same, but getting the analysis that fill 99.5% of the global activity in a PNG file
generate_profile.pl -url mysql://username:secret@hostname:port/database -top .995 -output profile.png
# Assuming you are only interested in a precise interval (in a PNG file)
generate_profile.pl -url mysql://username:secret@hostname:port/database -start_date 2013-06-15T10:34 -end_date 2013-06-15T16:58 -granularity 1 -output profile.png
# Assuming that the pipeline has large periods of inactivity
generate_profile.pl -url mysql://username:secret@hostname:port/database -granularity 10 -skip_no_activity 1 > profile.csv
generate_profile.pl -url mysql://username:secret@hostname:port/database -start_date 2013-06-15T10:34 -end_date 2013-06-15T16:58 -output profile.png
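# The same with a logarithmic y-axis (a sketch using the new -log option
# introduced in this commit; its integer value is treated as a boolean)
generate_profile.pl -url mysql://username:secret@hostname:port/database -log 1 -output profile.png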
=head1 OPTIONS
@@ -303,9 +282,7 @@ __DATA__
-url <url string> : URL defining where the hive database is located
-start_date <date> : minimal start date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
-end_date <date> : maximal end date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
-granularity <int> : size of the intervals on which the activity is computed (minutes) (default: 5)
-skip_no_activity <int> : only for CSV output: shrink the periods of inactivity which are longer than "skip_no_activity" hours (default: 2)
-top <float> : maximum number (> 1) or fraction (< 1) of analyses to report (default: 19)
-top <float> : maximum number (> 1) or fraction (< 1) of analyses to report (default: 20)
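-log <int> : when non-zero, plot the number of workers on a logarithmic y-axis (new option added by this commit)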
-output <string> : output file: its extension must match one of the Gnuplot terminals. Otherwise, the CSV output is produced on stdout
=head1 CONTACT