Gets the birth/death events instead of sampling the database. The "NOTHING" curve is not needed any more

Gets the birth/death events instead of sampling the database. The "NOTHING" curve is not needed any more

Gets the birth/death events instead of sampling the database. The "NOTHING" curve is not needed any more
Gets the birth/death events instead of sampling the database. The "NOTHING" curve is not needed any more
f731a413 · Matthieu Muffato · 819d3850 · f731a413
Commit f731a413 authored 11 years ago by Matthieu Muffato
--- a/scripts/generate_timeline.pl
+++ b/scripts/generate_timeline.pl
@@ -29,7 +29,7 @@ exit(0);

 sub main {

-    my ($url, $reg_conf, $reg_type, $reg_alias, $nosqlvc, $help, $start_date, $end_date, $granularity, $skip, $output, $top);
+    my ($url, $reg_conf, $reg_type, $reg_alias, $nosqlvc, $help, $start_date, $end_date, $output, $top, $logscale);

    GetOptions(
                # connect to the database:
@@ -41,9 +41,8 @@ sub main {

            'start_date=s'               => \$start_date,
            'end_date=s'                 => \$end_date,
-            'granularity=i'              => \$granularity,
-            'skip_no_activity=i'         => \$skip,
            'top=f'                      => \$top,
+            'log=i'                      => \$logscale,
            'output=s'                   => \$output,
            'h|help'                     => \$help,
    );
@@ -65,12 +64,7 @@ sub main {
    }

    # Palette generated with R: c(brewer.pal(9, "Set1"), brewer.pal(12, "Set3")). #FFFFB3 is removed because it is too close to white
-    my @palette = qw(#E41A1C #377EB8 #4DAF4A #984EA3 #FF7F00 #FFFF33 #A65628 #F781BF #999999     #8DD3C7 #BEBADA #FB8072 #80B1D3 #FDB462 #B3DE69 #FCCDE5 #D9D9D9 #BC80BD #CCEBC5 #FFED6F);
-
-    # Default options
-    $granularity = 5 unless $granularity;
-    $skip = int(($skip || 2*60) / $granularity);
-    $top = scalar(@palette)-1 unless $top;
+    my @palette = qw(#E41A1C #377EB8 #4DAF4A #984EA3 #FF7F00 #FFFF33 #A65628 #F781BF #999999     #8DD3C7 #BEBADA #FB8072 #80B1D3 #FDB462 #B3DE69 #FCCDE5 #D9D9D9 #BC80BD #CCEBC5 #FFED6F    #2F4F4F);

    my %terminal_mapping = (
        'emf' => 'emf',
@@ -89,148 +83,136 @@ sub main {

    }

-    my $activity_title = 'ACTIVITY';

    my $dbh = $hive_dba->dbc->db_handle();

-    my @tmp_dates = @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(born, "%Y-%m-%dT%T"), analysis_id, 1 FROM worker WHERE analysis_id IS NOT NULL')};
-    push @tmp_dates, @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(died, "%Y-%m-%dT%T"), analysis_id, -1 FROM worker WHERE analysis_id IS NOT NULL')};
-    my @birth_death_dates = sort {$a->[0] cmp $b->[0]} @tmp_dates;
+    # Get the events from the database
+    my %events = ();
+    {
+        my @tmp_dates = @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(born, "%Y-%m-%dT%T"), analysis_id, 1 FROM worker WHERE analysis_id IS NOT NULL')};
+        push @tmp_dates, @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(died, "%Y-%m-%dT%T"), analysis_id, -1 FROM worker WHERE analysis_id IS NOT NULL')};
+        warn scalar(@tmp_dates), " events\n";

-    warn scalar(@birth_death_dates), " events\n";
-    #die Dumper $birth_death_dates[0];
+        foreach my $db_entry (@tmp_dates) {
+            my ($event_date, $analysis_id, $offset) = @$db_entry;
+            $events{$event_date}{$analysis_id} += $offset;
+        }
+    }
+    my @event_dates = sort {$a cmp $b} (keys %events);
+    warn scalar(@event_dates), " dates\n";

    my $sql_analysis_names = 'SELECT analysis_id, logic_name FROM analysis_base';
-    my $data = $dbh->selectall_arrayref($sql_analysis_names);
-    my %name = (map {$_->[0] => $_->[1] } @$data);
-
-    #die Dumper \%name;
+    my @analysis_data = @{$dbh->selectall_arrayref($sql_analysis_names)};
+    my %name = (map {$_->[0] => $_->[1] } @analysis_data);
+    warn scalar(@analysis_data), " analysis\n";

    my $max_workers = 0;
    my @data_timings = ();
    my %tot_analysis = ();

-    my $sum_a = 0;
-    my %tmp_interval;
-    my @activity;
-    while (scalar(@birth_death_dates)) {
+    my $num_curr_workers = 0;
+    my %hash_curr_workers = (map {$_->[0] => 0 } @analysis_data);
+
+    foreach my $event_date (@event_dates) {

-        my ($event_date, $analysis_id, $offset) = @{shift @birth_death_dates};
-        last if $event_date gt $end_date;
+        last if $end_date and ($event_date gt $end_date);

-        $tmp_interval{$analysis_id} += $offset;
-        $sum_a += $offset;
-        my %hash_interval = %tmp_interval;
+        my $topup_hash = $events{$event_date};
+        foreach my $analysis_id (keys %$topup_hash) {
+            $hash_curr_workers{$analysis_id} += $topup_hash->{$analysis_id};
+            $num_curr_workers += $topup_hash->{$analysis_id};
+        }
+        die if sum(values %hash_curr_workers) != $num_curr_workers;
+
+        next if $start_date and ($event_date lt $start_date);
+
+        my %hash_interval = %hash_curr_workers;
+        #FIXME It should be normalised by the length of the time interval
        map {$tot_analysis{$_} += $hash_interval{$_}} keys %hash_interval;

-        $max_workers = $sum_a if ($sum_a > $max_workers);
-        next if $event_date lt $start_date;
+        $max_workers = $num_curr_workers if ($num_curr_workers > $max_workers);

-        # We can store the data
+        # We need to repeat the previous value to have an histogram shape
+        push @data_timings, [$event_date, $data_timings[-1]->[1]] if @data_timings;
        push @data_timings, [$event_date, \%hash_interval];
-        unless ($sum_a) {
-            push @activity, [$event_date, 1];
-            push @activity, [$event_date, 0];
-            push @activity, [$birth_death_dates[0]->[0], 0] if scalar(@birth_death_dates);
-            push @activity, [$birth_death_dates[0]->[0], 1] if scalar(@birth_death_dates);
-        }
    }
    warn $max_workers;
    warn Dumper \%tot_analysis;

    my $total_total = sum(values %tot_analysis);

-    my @sorted_analysis_ids = sort {($tot_analysis{$b} <=> $tot_analysis{$a}) || (lc $name{$a} cmp lc $name{$b})} keys %tot_analysis;
+    my @sorted_analysis_ids = sort {($tot_analysis{$b} <=> $tot_analysis{$a}) || (lc $name{$a} cmp lc $name{$b})} (grep {$tot_analysis{$_}} keys %tot_analysis);
    warn Dumper \@sorted_analysis_ids;
    warn Dumper([map {$name{$_}} @sorted_analysis_ids]);

    if (not $gnuplot_terminal) {
-        print join("\t", 'analysis', $activity_title, map {$name{$_}} @sorted_analysis_ids), "\n";
+        print join("\t", 'date', 'OVERALL', map {$name{$_}} @sorted_analysis_ids), "\n";
        print join("\t", 'total', $total_total, map {$tot_analysis{$_}} @sorted_analysis_ids), "\n";
-        print join("\t", 'proportion', '0', map {$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
+        print join("\t", 'proportion', 'NA', map {$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
        my $s = 0;
-        print join("\t", 'cum_proportion', '0', map {$s+=$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
+        print join("\t", 'cum_proportion', 'NA', map {$s+=$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";

-        my @buffer = ();
        foreach my $row (@data_timings) {
-            my $str = join("\t", $row->[0], sum(values %{$row->[1]}), map {$row->[1]->{$_} || 0} @sorted_analysis_ids)."\n";
-            if ($row->[1]) {
-                if (@buffer) {
-                    my $n = scalar(@buffer);
-                    if ($n > $skip) {
-                        splice(@buffer, int($skip / 2), $n-$skip);
-                    }
-                    foreach my $old_str (@buffer) {
-                        print $old_str;
-                    }
-                    @buffer = ();
-                }
-                print $str;
-            } else {
-                push @buffer, $str;
-            }
+            print join("\t", $row->[0], sum(values %{$row->[1]}), map {$row->[1]->{$_}} @sorted_analysis_ids)."\n";
        }
        return;
    }

    # Get the number of analysis we want to display
-    my $n_relevant_analysis = 0;
-    if ($top and $top > 0) {
+    my $n_relevant_analysis = scalar(@sorted_analysis_ids);
+    if ($top and ($top > 0)) {
        if ($top < 1) {
            my $s = 0;
-            map {my $pre_s = $s; $s += $tot_analysis{$_}/$total_total; $pre_s < .995 && $n_relevant_analysis++} @sorted_analysis_ids;
-        } else {
-            $n_relevant_analysis = $top
+            $n_relevant_analysis = 0;
+            map {my $pre_s = $s; $s += $tot_analysis{$_}/$total_total; $pre_s < $top && $n_relevant_analysis++} @sorted_analysis_ids;
+        } elsif ($top < scalar(@sorted_analysis_ids)) {
+            $n_relevant_analysis = $top;
        }
-    } else {
-        $n_relevant_analysis = scalar(@sorted_analysis_ids);
    }
-
+    # cap based on the length of the palette
+    my $need_other_analysis = $n_relevant_analysis < scalar(@sorted_analysis_ids) ? 1 : 0;
+    if (($n_relevant_analysis+$need_other_analysis) > scalar(@palette)) {
+        $n_relevant_analysis = scalar(@palette) - 1;
+        $need_other_analysis = 1;
+    }
+    $top = $n_relevant_analysis unless $top;
    warn $n_relevant_analysis;

    my @xdata = map {$_->[0]} @data_timings;

    my @datasets = ();

-    {
-        push @datasets, Chart::Gnuplot::DataSet->new(
-            xdata => [map {$_->[0]} @activity],
-            ydata => [map {$max_workers*(1-0.03*$_->[1])} @activity],
-            timefmt => '%Y-%m-%dT%H:%M:%S',
-            title => $activity_title,
-            style => sprintf('filledcurves below y1=%d', int($max_workers)),
-            linetype => '2',
-            linewidth => '0',
-            color => '#2F4F4F',
-        );
-    }
-    {
+    my $pseudo_zero_value = $logscale ? .8 : -$max_workers / 50;
+
+    # The background plot: the sum of all the analysis
+    if ($need_other_analysis) {
        my @ydata = ();
        foreach my $row (@data_timings) {
-            push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids );
+            push @ydata, sum(map {$row->[1]->{$_}} @sorted_analysis_ids ) || $pseudo_zero_value;
        }
        push @datasets, Chart::Gnuplot::DataSet->new(
            xdata => \@xdata,
            ydata => \@ydata,
            timefmt => '%Y-%m-%dT%H:%M:%S',
            title => 'OTHER',
-            style => 'filledcurves',
+            style => 'filledcurves x1',
            linewidth => '0',
            color => $palette[$n_relevant_analysis],
        );
    }

+    # Each analysis is plotted as the sum of itself and the top ones
    foreach my $i (reverse 1..$n_relevant_analysis) {
        my @ydata;
        foreach my $row (@data_timings) {
-            push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids[0..($i-1)] );
+            push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids[0..($i-1)] ) || $pseudo_zero_value;
        }
        my $dataset = Chart::Gnuplot::DataSet->new(
            xdata => \@xdata,
            ydata => \@ydata,
            timefmt => '%Y-%m-%dT%H:%M:%S',
            title => $name{$sorted_analysis_ids[$i-1]},
-            style => 'filledcurves',
+            style => 'filledcurves x1',
            linewidth => '0',
            color => $palette[$i-1],
        );
@@ -238,14 +220,14 @@ sub main {
    }

    my $chart = Chart::Gnuplot->new(
-        title => sprintf('Profile of %s', $n_relevant_analysis < scalar(@sorted_analysis_ids) ? ($top < 1 ? sprintf('%.1f%% of %s', 100*$top, $url) : "the $top top-analysis of $url") : $url).($start_date ? " from $start_date" : "").($end_date ? " until $end_date" : ""),
+        title => sprintf('Profile of %s', $n_relevant_analysis < scalar(@sorted_analysis_ids) ? ($top < 1 ? sprintf('%.1f%% of %s', 100*$top, $url) : "the $top top-analysis of $url") : $url).($start_date ? " from $start_date" : "").($end_date ? " to $end_date" : ""),
        timeaxis => 'x',
        legend => {
            position => 'outside right',
            align => 'left',
        },
        xtics => {
-            labelfmt => '%b %d',
+            labelfmt => '%b %d\n %H:00',
        },
        bg => {
            color => 'white',
@@ -254,6 +236,8 @@ sub main {
        output => $output,
        terminal => $terminal_mapping{$gnuplot_terminal},
        ylabel => 'Number of workers',
+        yrange => [$pseudo_zero_value, undef],
+        $logscale ? (logscale => 'y') : (),
    );
    $chart->plot2d(@datasets);

@@ -280,22 +264,17 @@ __DATA__

    You can optionally ask the script to generate an image with Gnuplot.

-    Please note the script runs a query for each interval (default: 5 minutes), which can take some time
-    for long-running pipelines.

 =head1 USAGE EXAMPLES

-        # Just run it the usual way: only the top 19 analysis will be reported in CSV format
+        # Just run it the usual way: only the top 20 analysis will be reported in CSV format
    generate_profile.pl -url mysql://username:secret@hostname:port/database > profile.csv

        # The same, but getting the analysis that fill 99.5% of the global activity in a PNG file
    generate_profile.pl -url mysql://username:secret@hostname:port/database -top .995 -output profile.png

        # Assuming you are only interested in a precise interval (in a PNG file)
-    generate_profile.pl -url mysql://username:secret@hostname:port/database -start_date 2013-06-15T10:34 -end_date 2013-06-15T16:58 -granularity 1 -output profile.png
-
-        # Assuming that the pipeline has large periods of inactivity
-    generate_profile.pl -url mysql://username:secret@hostname:port/database -granularity 10 -skip_no_activity 1 > profile.csv
+    generate_profile.pl -url mysql://username:secret@hostname:port/database -start_date 2013-06-15T10:34 -end_date 2013-06-15T16:58 -output profile.png

 =head1 OPTIONS

@@ -303,9 +282,7 @@ __DATA__
    -url <url string>       : url defining where hive database is located
    -start_date <date>      : minimal start date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
    -end_date <date>        : maximal end date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
-    -granularity <int>      : size of the intervals on which the activity is computed (minutes) (default: 5)
-    -skip_no_activity <int> : only for CSV output: shrink the periods of inactivity which are longer than "skip_no_activity" hours (default: 2)
-    -top <float>            : maximum number (> 1) or fraction (< 1) of analysis to report (default: 19)
+    -top <float>            : maximum number (> 1) or fraction (< 1) of analysis to report (default: 20)
    -output <string>        : output file: its extension must match one of the Gnuplot terminals. Otherwise, the CSV output is produced on stdout

 =head1 CONTACT