#!/usr/bin/env perl

# Gets the activity of each analysis over time, as a CSV file or as an image (see the list of formats supported by Gnuplot)

use strict;
use warnings;

    # Finding out own path in order to reference own components (including own modules):
use Cwd            ();
use File::Basename ();
BEGIN {
    $ENV{'EHIVE_ROOT_DIR'} = File::Basename::dirname( File::Basename::dirname( Cwd::realpath($0) ) );
    unshift @INC, $ENV{'EHIVE_ROOT_DIR'}.'/modules';
}


use Getopt::Long;
use List::Util qw(sum);
use POSIX;
use Data::Dumper;

use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive::Utils ('script_usage');

no warnings qw{qw};     # silence the qw() warning about possible comments, triggered by the '#RRGGBB' colours in @palette

main();
exit(0);

sub main {

    my ($url, $reg_conf, $reg_type, $reg_alias, $nosqlvc, $help, $mode, $start_date, $end_date, $output, $top, $logscale, $default_memory, $default_cores);

    GetOptions(
                # connect to the database:
            'url=s'                      => \$url,
            'reg_conf|regfile=s'         => \$reg_conf,
            'reg_type=s'                 => \$reg_type,
            'reg_alias|regname=s'        => \$reg_alias,
            'nosqlvc=i'                  => \$nosqlvc,      # using "=i" instead of "!" for consistency with scripts where it is a propagated option

            'start_date=s'               => \$start_date,
            'end_date=s'                 => \$end_date,
            'mode=s'                     => \$mode,
            'top=f'                      => \$top,
            'log=i'                      => \$logscale,
            'mem=i'                      => \$default_memory,
            'n_core=i'                   => \$default_cores,
            'output=s'                   => \$output,
            'h|help'                     => \$help,
    );

    if ($help) { script_usage(0); }

    my $hive_dba;
    if($url or $reg_alias) {
        $hive_dba = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new(
                -url                            => $url,
                -reg_conf                       => $reg_conf,
                -reg_type                       => $reg_type,
                -reg_alias                      => $reg_alias,
                -no_sql_schema_version_check    => $nosqlvc,
        );
    } else {
        warn "\nERROR: Connection parameters (url or reg_conf+reg_alias) need to be specified\n";
        script_usage(1);
    }

    # Check whether $mode is valid
    my %allowed_modes = (
        workers => 'Number of workers',
        memory => 'Memory requested (GB)',
        cores => 'Number of CPU cores',
    );
    if ($mode) {
        die "Unknown mode '$mode'. Allowed modes are: ".join(", ", keys %allowed_modes) unless exists $allowed_modes{$mode};
        $default_memory = 100 unless $default_memory;
        $default_cores = 1 unless $default_cores;
    } else {
        $mode = 'workers';
    }

    # Palette generated with R: c(brewer.pal(9, "Set1"), brewer.pal(12, "Set3")). #FFFFB3 is removed because it is too close to white
    my @palette = qw(#E41A1C #377EB8 #4DAF4A #984EA3 #FF7F00 #FFFF33 #A65628 #F781BF #999999     #8DD3C7 #BEBADA #FB8072 #80B1D3 #FDB462 #B3DE69 #FCCDE5 #D9D9D9 #BC80BD #CCEBC5 #FFED6F    #2F4F4F);

    my %terminal_mapping = (
        'emf' => 'emf',
        'png' => 'png',
        'svg' => 'svg',
        'jpg' => 'jpeg',
        'gif' => 'gif',
        'ps'  => 'postscript eps enhanced color',
        'pdf' => 'pdf color enhanced',
    );
    my $gnuplot_terminal = undef;
    if ($output and $output =~ /\.(\w+)$/) {
        $gnuplot_terminal = $1;
        die "The format '$gnuplot_terminal' is not currently supported." if not exists $terminal_mapping{$gnuplot_terminal};
        require Chart::Gnuplot;

    }


    my $dbh = $hive_dba->dbc->db_handle();

    # Get the memory and CPU requirements of each resource class (currently only parsed from LSF submission arguments)
    my %mem_resources = ();
    my %cpu_resources = ();
    {
        my $sql_resource_descriptions = 'SELECT resource_class_id, meadow_type, submission_cmd_args FROM resource_description';
        foreach my $db_entry (@{$dbh->selectall_arrayref($sql_resource_descriptions)}) {
            my ($resource_class_id, $meadow_type, $submission_cmd_args) = @$db_entry;
            if ($meadow_type eq 'LSF') {
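                # Illustrative example (not taken from the database): for submission arguments such as
                #   -q production -M 4000 -R"rusage[mem=4000]" -n 4
                # the patterns below would record 4000 (MB) as the memory and 4 as the number of cores of this resource class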
                $mem_resources{$resource_class_id} = $1 if $submission_cmd_args =~ m/mem=(\d+)/;
                $cpu_resources{$resource_class_id} = $1 if $submission_cmd_args =~ m/-n\s*(\d+)/;
            }
        }
    }

    # Get the name and default resource class of each analysis
    my %default_resource_class = ();
    my %analysis_name = ();
    {
        my $sql_analysis_info = 'SELECT analysis_id, logic_name, resource_class_id FROM analysis_base';
        foreach my $db_entry (@{$dbh->selectall_arrayref($sql_analysis_info)}) {
            my ($analysis_id, $logic_name, $resource_class_id) = @$db_entry;
            $analysis_name{$analysis_id} = $logic_name;
            $default_resource_class{$analysis_id} = $resource_class_id;
        }
    }
    warn "default_resource_class: ", Dumper \%default_resource_class;
    warn "analysis_name: ", Dumper \%analysis_name;
    warn scalar(keys %analysis_name), " analyses\n";

    # Get the events from the database
    my %events = ();
    {
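        # Each worker contributes two events: +1 at its birth time and -1 at its death time
        # (below, the +/-1 offset is scaled to memory or cores depending on the requested mode)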
        my @tmp_dates = @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(born, "%Y-%m-%dT%T"), analysis_id, resource_class_id, 1 FROM worker WHERE analysis_id IS NOT NULL')};
        push @tmp_dates, @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(died, "%Y-%m-%dT%T"), analysis_id, resource_class_id, -1 FROM worker WHERE analysis_id IS NOT NULL')};
        warn scalar(@tmp_dates), " events\n";

        foreach my $db_entry (@tmp_dates) {
            my ($event_date, $analysis_id, $resource_class_id, $offset) = @$db_entry;
            $resource_class_id = $default_resource_class{$analysis_id} unless $resource_class_id;
            if ($mode eq 'workers') {
                $events{$event_date}{$analysis_id} += $offset;
            } elsif ($mode eq 'memory') {
                $events{$event_date}{$analysis_id} += $offset * ($mem_resources{$resource_class_id} || $default_memory) / 1024.;
            } else {
                $events{$event_date}{$analysis_id} += $offset * ($cpu_resources{$resource_class_id} || $default_cores);
            }
        }
    }
    my @event_dates = sort {$a cmp $b} (keys %events);
    warn scalar(@event_dates), " dates\n";

    my $max_workers = 0;
    my @data_timings = ();
    my %tot_analysis = ();

    my $num_curr_workers = 0;
    my %hash_curr_workers = (map {$_ => 0 } (keys %analysis_name));
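    # Running totals: %hash_curr_workers tracks the current level of each analysis and $num_curr_workers the
    # overall level; the loop below updates them at every event and snapshots them into @data_timings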

    foreach my $event_date (@event_dates) {

        last if $end_date and ($event_date gt $end_date);

        my $topup_hash = $events{$event_date};
        foreach my $analysis_id (keys %$topup_hash) {
            $hash_curr_workers{$analysis_id} += $topup_hash->{$analysis_id};
            $num_curr_workers += $topup_hash->{$analysis_id};
        }
        die "Internal inconsistency between the running totals\n" if sum(values %hash_curr_workers) != $num_curr_workers;

        next if $start_date and ($event_date lt $start_date);

        my %hash_interval = %hash_curr_workers;
        #FIXME It should be normalised by the length of the time interval
        map {$tot_analysis{$_} += $hash_interval{$_}} keys %hash_interval;

        $max_workers = $num_curr_workers if ($num_curr_workers > $max_workers);

        # We need to repeat the previous value to give the curve a histogram (step) shape
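        # (illustration with made-up numbers: a level of 2 at t1 followed by 5 at t2 is stored as
        #  (t1,2), (t2,2), (t2,5), so the curve stays flat until t2 and then jumps vertically)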
        push @data_timings, [$event_date, $data_timings[-1]->[1]] if @data_timings;
        push @data_timings, [$event_date, \%hash_interval];
    }
    warn "max_workers: $max_workers\n";
    warn Dumper \%tot_analysis;

    my $total_total = sum(values %tot_analysis);

    my @sorted_analysis_ids = sort {($tot_analysis{$b} <=> $tot_analysis{$a}) || (lc $analysis_name{$a} cmp lc $analysis_name{$b})} (grep {$tot_analysis{$_}} keys %tot_analysis);
    warn Dumper \@sorted_analysis_ids;
    warn Dumper([map {$analysis_name{$_}} @sorted_analysis_ids]);

    if (not $gnuplot_terminal) {
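        # Plain tab-separated output: a header row, three summary rows ('total', 'proportion' and
        # 'cum_proportion'), then one row per event date with the per-analysis values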
        print join("\t", 'date', "OVERALL_$mode", map {$analysis_name{$_}} @sorted_analysis_ids), "\n";
        print join("\t", 'total', $total_total, map {$tot_analysis{$_}} @sorted_analysis_ids), "\n";
        print join("\t", 'proportion', 'NA', map {$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
        my $s = 0;
        print join("\t", 'cum_proportion', 'NA', map {$s+=$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";

        foreach my $row (@data_timings) {
            print join("\t", $row->[0], sum(values %{$row->[1]}), map {$row->[1]->{$_}} @sorted_analysis_ids)."\n";
        }
        return;
    }

    # Get the number of analyses we want to display
    my $n_relevant_analysis = scalar(@sorted_analysis_ids);
    if ($top and ($top > 0)) {
        if ($top < 1) {
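            # A fractional -top value selects the smallest set of top-contributing analyses whose cumulative
            # share of the total activity reaches that fraction (e.g. -top .995 keeps the analyses that
            # together account for at least 99.5% of the activity)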
            my $s = 0;
            $n_relevant_analysis = 0;
            map {my $pre_s = $s; $s += $tot_analysis{$_}/$total_total; $pre_s < $top && $n_relevant_analysis++} @sorted_analysis_ids;
        } elsif ($top < scalar(@sorted_analysis_ids)) {
            $n_relevant_analysis = $top;
        }
    }
    # Cap the number of displayed analyses to the size of the palette (one colour is kept for the 'OTHER' group)
    my $need_other_analysis = $n_relevant_analysis < scalar(@sorted_analysis_ids) ? 1 : 0;
    if (($n_relevant_analysis+$need_other_analysis) > scalar(@palette)) {
        $n_relevant_analysis = scalar(@palette) - 1;
        $need_other_analysis = 1;
    }
    $top = $n_relevant_analysis unless $top;
    warn "n_relevant_analysis: $n_relevant_analysis\n";

    my @xdata = map {$_->[0]} @data_timings;

    my @datasets = ();

    my $pseudo_zero_value = $logscale ? .8 : -$max_workers / 50;
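    # Zero cannot be drawn on a logarithmic scale, so intervals with no activity are plotted at a small
    # "pseudo-zero" value instead (just below 1 on a log scale, slightly below 0 on a linear scale)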

    # The background plot: the sum of all the analyses
    if ($need_other_analysis) {
        my @ydata = ();
        foreach my $row (@data_timings) {
            push @ydata, sum(map {$row->[1]->{$_}} @sorted_analysis_ids ) || $pseudo_zero_value;
        }
        push @datasets, Chart::Gnuplot::DataSet->new(
            xdata => \@xdata,
            ydata => \@ydata,
            timefmt => '%Y-%m-%dT%H:%M:%S',
            title => 'OTHER',
            style => 'filledcurves x1',
            linewidth => '0',
            color => $palette[$n_relevant_analysis],
        );
    }

    # Each analysis is plotted as the cumulative sum of itself and all the analyses ranked above it
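    # The datasets are built from the largest cumulative sum down to the top analysis alone, so that
    # gnuplot draws each curve on top of the previous one and the result looks like a stacked-area chart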
    foreach my $i (reverse 1..$n_relevant_analysis) {
        my @ydata;
        foreach my $row (@data_timings) {
            push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids[0..($i-1)] ) || $pseudo_zero_value;
        }
        my $dataset = Chart::Gnuplot::DataSet->new(
            xdata => \@xdata,
            ydata => \@ydata,
            timefmt => '%Y-%m-%dT%H:%M:%S',
            title => $analysis_name{$sorted_analysis_ids[$i-1]},
            style => 'filledcurves x1',
            linewidth => '0',
            color => $palette[$i-1],
        );
        push @datasets, $dataset;
    }

    my $chart = Chart::Gnuplot->new(
        title => sprintf('Profile of %s', $n_relevant_analysis < scalar(@sorted_analysis_ids) ? ($top < 1 ? sprintf('%.1f%% of %s', 100*$top, $url) : "the top $top analyses of $url") : $url).($start_date ? " from $start_date" : "").($end_date ? " to $end_date" : ""),
        timeaxis => 'x',
        legend => {
            position => 'outside right',
            align => 'left',
        },
        xtics => {
            labelfmt => '%b %d\n %H:00',
        },
        bg => {
            color => 'white',
        },
        imagesize => '1400, 800',
        output => $output,
        terminal => $terminal_mapping{$gnuplot_terminal},
        ylabel => $allowed_modes{$mode},
        yrange => [$pseudo_zero_value, undef],
        $logscale ? (logscale => 'y') : (),
    );
    $chart->plot2d(@datasets);

}



__DATA__

=pod

=head1 NAME

    generate_timeline.pl

=head1 DESCRIPTION

    This script is used for offline examination of the allocation of workers.

    Based on the command-line parameters 'start_date' and 'end_date', or on the start time of the first
    worker and the end time of the last worker (as recorded in the pipeline database), it pulls the relevant data out
    of the 'worker' table for accurate timing.
    By default, the output is in CSV format, to allow extra analysis to be carried out.

    You can optionally ask the script to generate an image with Gnuplot.


=head1 USAGE EXAMPLES

        # Just run it the usual way: the activity of each analysis is reported in CSV format on stdout
    generate_timeline.pl -url mysql://username:secret@hostname:port/database > timeline.csv

        # The same, but plotting the analyses that fill 99.5% of the global activity to a PNG file
    generate_timeline.pl -url mysql://username:secret@hostname:port/database -top .995 -output timeline.png

        # Assuming you are only interested in a specific time interval (in a PNG file)
    generate_timeline.pl -url mysql://username:secret@hostname:port/database -start_date 2013-06-15T10:34 -end_date 2013-06-15T16:58 -output timeline.png
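
        # A further example (hypothetical output file name): plot the memory requested instead of the number of workers, on a log scale
    generate_timeline.pl -url mysql://username:secret@hostname:port/database -mode memory -log 1 -output timeline_memory.png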

=head1 OPTIONS

    -help                   : print this help
    -url <url string>       : url defining where hive database is located
    -start_date <date>      : minimal start date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
    -end_date <date>        : maximal end date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
    -top <float>            : maximum number (> 1) or fraction (< 1) of analyses to report (default: 20)
    -output <string>        : output file: its extension must match one of the Gnuplot terminals. Otherwise, the CSV output is produced on stdout
    -mode <string>          : what should be measured: 'workers' (default), 'memory' or 'cores'
    -log <0|1>              : when plotting, use a logarithmic scale for the y-axis
    -mem <int>              : default memory requirement (in MB) for resource classes that do not define one (default: 100)
    -n_core <int>           : default number of cores for resource classes that do not define one (default: 1)
    -reg_conf, -reg_type, -reg_alias : alternative way of defining the database connection, via a registry file
    -nosqlvc <0|1>          : skip the check of the SQL schema version if set to 1

=head1 CONTACT

    Please contact the ehive-users@ebi.ac.uk mailing list with questions/suggestions.

=cut