#!/usr/bin/env perl

# Gets the activity of each analysis over time, in CSV format or in an image (see the list of formats supported by Gnuplot)

use strict;
use warnings;

    # Finding out own path in order to reference own components (including own modules):
use Cwd            ();
use File::Basename ();
BEGIN {
        # The eHive root is taken to be the parent of the directory that contains this script
    $ENV{'EHIVE_ROOT_DIR'} = File::Basename::dirname( File::Basename::dirname( Cwd::realpath($0) ) );
    unshift @INC, $ENV{'EHIVE_ROOT_DIR'}.'/modules';
}


use Getopt::Long;
use List::Util qw(sum);
use POSIX;
use Data::Dumper;

use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive::Utils ('script_usage');

    # The colour palette used by main() is a qw() list of '#RRGGBB' strings, which would otherwise
    # trigger the "Possible attempt to put comments in qw() list" warning
no warnings qw{qw};

main();
exit(0);

# Entry point of the script.
#
# Parses the command line, connects to the eHive database, loads the birth/death events of the
# workers and accumulates them into a per-analysis timeline. The quantity that is accumulated
# depends on -mode (number of workers, requested memory, CPU cores or wasted memory).
# The timeline is printed as tab-separated text on stdout, unless -output has an extension known
# to %terminal_mapping, in which case an image is produced with Chart::Gnuplot.
#
# Takes no argument (reads @ARGV through GetOptions) and returns nothing.
# Dies on an unknown mode or an unsupported output format.
sub main {

    my ($url, $reg_conf, $reg_type, $reg_alias, $nosqlvc, $help, $verbose, $mode, $start_date, $end_date, $output, $top, $logscale, $default_memory, $default_cores);

    GetOptions(
                # connect to the database:
            'url=s'                      => \$url,
            'reg_conf|regfile=s'         => \$reg_conf,
            'reg_type=s'                 => \$reg_type,
            'reg_alias|regname=s'        => \$reg_alias,
            'nosqlvc=i'                  => \$nosqlvc,      # using "=i" instead of "!" for consistency with scripts where it is a propagated option

            'verbose!'                   => \$verbose,
            'h|help'                     => \$help,

            'start_date=s'               => \$start_date,
            'end_date=s'                 => \$end_date,
            'mode=s'                     => \$mode,
            'top=f'                      => \$top,
            'log=i'                      => \$logscale,
            'mem=i'                      => \$default_memory,
            'n_core=i'                   => \$default_cores,
            'output=s'                   => \$output,
    ) or script_usage(1);   # GetOptions has already warned about the offending option

    if ($help) { script_usage(0); }

    my $hive_dba;
    if ($url or $reg_alias) {
        $hive_dba = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new(
                -url                            => $url,
                -reg_conf                       => $reg_conf,
                -reg_type                       => $reg_type,
                -reg_alias                      => $reg_alias,
                -no_sql_schema_version_check    => $nosqlvc,
        );
    } else {
        warn "\nERROR: Connection parameters (url or reg_conf+reg_alias) need to be specified\n";
        script_usage(1);
    }

    # Name of the pipeline in the chart title: $url is not defined when connecting through the registry
    my $pipeline_name = $url || $reg_alias;

    # Check whether $mode is valid (the values are the labels of the y-axis)
    my %allowed_modes = (
        workers => 'Number of workers',
        memory => 'Memory asked (Gb)',
        cores => 'Number of CPU cores',
        wasted_memory => 'Wasted memory (Gb)',
    );
    if ($mode) {
        die "Unknown mode '$mode'. Allowed modes are: ".join(", ", sort keys %allowed_modes) unless exists $allowed_modes{$mode};
        $default_memory = 100 unless $default_memory;
        $default_cores = 1 unless $default_cores;
    } else {
        $mode = 'workers';
    }

    # Palette generated with R: c(brewer.pal(9, "Set1"), brewer.pal(12, "Set3")). #FFFFB3 is removed because it is too close to white
    my @palette = qw(#E41A1C #377EB8 #4DAF4A #984EA3 #FF7F00 #FFFF33 #A65628 #F781BF #999999     #8DD3C7 #BEBADA #FB8072 #80B1D3 #FDB462 #B3DE69 #FCCDE5 #D9D9D9 #BC80BD #CCEBC5 #FFED6F    #2F4F4F);

    # Maps the extension of the output file to the Gnuplot terminal that produces it
    my %terminal_mapping = (
        'emf' => 'emf',
        'png' => 'png',
        'svg' => 'svg',
        'jpg' => 'jpeg',
        'gif' => 'gif',
        'ps'  => 'postscript eps enhanced color',
        'pdf' => 'pdf color enhanced',
    );
    # Stays undefined when there is no usable -output: the tab-separated output is then produced
    my $gnuplot_terminal = undef;
    if ($output and $output =~ /\.(\w+)$/) {
        $gnuplot_terminal = $1;
        die "The format '$gnuplot_terminal' is not currently supported." if not exists $terminal_mapping{$gnuplot_terminal};
        # Chart::Gnuplot is only loaded when an image is requested
        require Chart::Gnuplot;
    }

    my $dbh = $hive_dba->dbc->db_handle();

    # Get the memory usage and the number of cores from each resource_class
    # (they are parsed out of the submission arguments of the 'LSF' meadow only)
    my %mem_resources = ();
    my %cpu_resources = ();
    {
        my $sql_resource_descriptions = 'SELECT resource_class_id, meadow_type, submission_cmd_args FROM resource_description';
        foreach my $db_entry (@{$dbh->selectall_arrayref($sql_resource_descriptions)}) {
            my ($resource_class_id, $meadow_type, $submission_cmd_args) = @$db_entry;
            if ($meadow_type eq 'LSF') {
                $mem_resources{$resource_class_id} = $1 if $submission_cmd_args =~ m/mem=(\d+)/;
                $cpu_resources{$resource_class_id} = $1 if $submission_cmd_args =~ m/-n\s*(\d+)/;
            }
        }
    }
    warn "mem_resources: ", Dumper \%mem_resources if $verbose;
    warn "cpu_resources: ", Dumper \%cpu_resources if $verbose;

    # Get the memory used by each worker (only needed to compute the wasted memory)
    # The keys are "<meadow_name>_____<process_id>"
    my %used_mem = ();
    if ($mode eq 'wasted_memory') {
        my $sql_used_mem = 'SELECT meadow_name, process_id, mem_megs FROM lsf_report';
        foreach my $db_entry (@{$dbh->selectall_arrayref($sql_used_mem)}) {
            my ($meadow_name, $process_id, $mem_megs) = @$db_entry;
            $used_mem{$meadow_name."_____".$process_id} = $mem_megs;
        }
        warn scalar(keys %used_mem), " process info loaded from lsf_report\n" if $verbose;
    }

    # Get the info about the analysis
    my %default_resource_class = ();
    my %analysis_name = ();
    {
        my $sql_analysis_info = 'SELECT analysis_id, logic_name, resource_class_id FROM analysis_base';
        foreach my $db_entry (@{$dbh->selectall_arrayref($sql_analysis_info)}) {
            my ($analysis_id, $logic_name, $resource_class_id) = @$db_entry;
            $analysis_name{$analysis_id} = $logic_name;
            $default_resource_class{$analysis_id} = $resource_class_id;
        }
    }
    warn "default_resource_class: ", Dumper \%default_resource_class if $verbose;
    warn "analysis_name: ", Dumper \%analysis_name if $verbose;
    warn scalar(keys %analysis_name), " analysis\n" if $verbose;

    # Get the events from the database: +1 when a worker is born, -1 when it dies
    # %events maps a date to { analysis_id => variation of the measured quantity at that date }
    my %events = ();
    {
        my @tmp_dates = @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(born, "%Y-%m-%dT%T"), analysis_id, meadow_name, process_id, resource_class_id, 1 FROM worker WHERE analysis_id IS NOT NULL')};
        push @tmp_dates, @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(died, "%Y-%m-%dT%T"), analysis_id, meadow_name, process_id, resource_class_id, -1 FROM worker WHERE analysis_id IS NOT NULL')};
        warn scalar(@tmp_dates), " events\n" if $verbose;

        foreach my $db_entry (@tmp_dates) {
            my ($event_date, $analysis_id, $meadow_name, $process_id, $resource_class_id, $offset) = @$db_entry;

            # A NULL date (e.g. 'died' of a worker with no recorded death) comes back as undef.
            # Such an event cannot be placed on the timeline: skipping it keeps the worker counted
            # from its birth onwards instead of filing the event under an empty date.
            next unless defined $event_date;

            $resource_class_id = $default_resource_class{$analysis_id} unless $resource_class_id;
            if ($mode eq 'workers') {
                $events{$event_date}{$analysis_id} += $offset;
            } elsif ($mode eq 'memory') {
                # Memory amounts are divided by 1024 to match the "Gb" of the axis label
                $events{$event_date}{$analysis_id} += $offset * ($mem_resources{$resource_class_id} || $default_memory) / 1024.;
            } elsif ($mode eq 'cores') {
                $events{$event_date}{$analysis_id} += $offset * ($cpu_resources{$resource_class_id} || $default_cores);
            } else {
                # wasted_memory: difference between the requested and the used memory,
                # only for the processes found in lsf_report and only when it is positive
                my $process_signature = $meadow_name."_____".$process_id;
                if (exists $used_mem{$process_signature}) {
                    my $wasted_memory = ($mem_resources{$resource_class_id} || $default_memory) - $used_mem{$process_signature};
                    $events{$event_date}{$analysis_id} += $offset * $wasted_memory / 1024. if $wasted_memory > 0;
                }
            }
        }
    }
    # The dates are formatted as "YYYY-MM-DDTHH:MM:SS", so the string order is the chronological order
    my @event_dates = sort {$a cmp $b} (keys %events);
    warn scalar(@event_dates), " dates\n" if $verbose;

    my $max_workers = 0;
    my @data_timings = ();      # list of [date, {analysis_id => current value}]
    my %tot_analysis = ();      # analysis_id => sum of its values over all the reported dates

    my $num_curr_workers = 0;
    my %hash_curr_workers = (map {$_ => 0 } (keys %analysis_name));

    foreach my $event_date (@event_dates) {

        last if $end_date and ($event_date gt $end_date);

        # The events that happen before $start_date are still accumulated,
        # so that the current state is correct when the interval starts
        my $topup_hash = $events{$event_date};
        foreach my $analysis_id (keys %$topup_hash) {
            $hash_curr_workers{$analysis_id} += $topup_hash->{$analysis_id};
            $num_curr_workers += $topup_hash->{$analysis_id};
        }
        #die sum(values %hash_curr_workers)."!=$num_curr_workers" if sum(values %hash_curr_workers) != $num_curr_workers;

        next if $start_date and ($event_date lt $start_date);

        # Snapshot of the current state
        my %hash_interval = %hash_curr_workers;
        #FIXME It should be normalised by the length of the time interval
        foreach my $analysis_id (keys %hash_interval) {
            $tot_analysis{$analysis_id} += $hash_interval{$analysis_id};
        }

        $max_workers = $num_curr_workers if ($num_curr_workers > $max_workers);

        # We need to repeat the previous value to have an histogram shape
        push @data_timings, [$event_date, $data_timings[-1]->[1]] if @data_timings;
        push @data_timings, [$event_date, \%hash_interval];
    }
    warn $max_workers if $verbose;
    warn Dumper \%tot_analysis if $verbose;

    # sum() returns undef on an empty list, hence the explicit 0
    my $total_total = sum(0, values %tot_analysis);

    # Most active analyses first (ties are broken on the name); the analyses without any activity are discarded
    my @sorted_analysis_ids = sort {($tot_analysis{$b} <=> $tot_analysis{$a}) || (lc $analysis_name{$a} cmp lc $analysis_name{$b})} (grep {$tot_analysis{$_}} keys %tot_analysis);
    warn Dumper \@sorted_analysis_ids if $verbose;
    warn Dumper([map {$analysis_name{$_}} @sorted_analysis_ids]) if $verbose;

    # Tab-separated output: all the analyses of @sorted_analysis_ids are reported (-top is not used here)
    if (not $gnuplot_terminal) {
        print join("\t", 'date', "OVERALL_$mode", map {$analysis_name{$_}} @sorted_analysis_ids), "\n";
        print join("\t", 'total', $total_total, map {$tot_analysis{$_}} @sorted_analysis_ids), "\n";
        print join("\t", 'proportion', 'NA', map {$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
        my $s = 0;
        print join("\t", 'cum_proportion', 'NA', map {$s+=$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";

        foreach my $row (@data_timings) {
            # "|| 0" because an older snapshot may not have an entry for every analysis
            print join("\t", $row->[0], sum(0, values %{$row->[1]}), map {$row->[1]->{$_} || 0} @sorted_analysis_ids)."\n";
        }
        return;
    }

    # Get the number of analysis we want to display
    my $n_relevant_analysis = scalar(@sorted_analysis_ids);
    if ($top and ($top > 0)) {
        if ($top < 1) {
            # $top is a fraction: keep the analyses until their cumulated proportion reaches $top
            my $s = 0;
            $n_relevant_analysis = 0;
            foreach my $analysis_id (@sorted_analysis_ids) {
                my $pre_s = $s;
                $s += $tot_analysis{$analysis_id}/$total_total;
                $n_relevant_analysis++ if $pre_s < $top;
            }
        } elsif ($top < scalar(@sorted_analysis_ids)) {
            # $top is parsed as a float but is used as a count from here on
            $n_relevant_analysis = int($top);
        }
    }
    # cap based on the length of the palette
    my $need_other_analysis = $n_relevant_analysis < scalar(@sorted_analysis_ids) ? 1 : 0;
    if (($n_relevant_analysis+$need_other_analysis) > scalar(@palette)) {
        $n_relevant_analysis = scalar(@palette) - 1;
        $need_other_analysis = 1;
    }
    $top = $n_relevant_analysis unless $top;
    warn "$n_relevant_analysis relevant analysis\n" if $verbose;

    my @xdata = map {$_->[0]} @data_timings;

    my @datasets = ();

    # Value plotted instead of 0: strictly positive in log-scale, slightly below the axis otherwise
    my $pseudo_zero_value = $logscale ? .8 : -$max_workers / 50;

    # The background plot: the sum of all the analysis
    if ($need_other_analysis) {
        my @ydata = ();
        foreach my $row (@data_timings) {
            push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids ) || $pseudo_zero_value;
        }
        push @datasets, Chart::Gnuplot::DataSet->new(
            xdata => \@xdata,
            ydata => \@ydata,
            timefmt => '%Y-%m-%dT%H:%M:%S',
            title => 'OTHER',
            style => 'filledcurves x1',
            linewidth => '0',
            color => $palette[$n_relevant_analysis],
        );
    }

    # Each analysis is plotted as the sum of itself and the top ones
    # (in reverse order, so that the smaller areas are drawn over the larger ones)
    foreach my $i (reverse 1..$n_relevant_analysis) {
        my @ydata;
        foreach my $row (@data_timings) {
            push @ydata, sum(map {$row->[1]->{$_} || 0} @sorted_analysis_ids[0..($i-1)] ) || $pseudo_zero_value;
        }
        my $dataset = Chart::Gnuplot::DataSet->new(
            xdata => \@xdata,
            ydata => \@ydata,
            timefmt => '%Y-%m-%dT%H:%M:%S',
            title => $analysis_name{$sorted_analysis_ids[$i-1]},
            style => 'filledcurves x1',
            linewidth => '0',
            color => $palette[$i-1],
        );
        push @datasets, $dataset;
    }

    # What the chart shows: the whole pipeline, a fraction of its activity, or its N most active analyses.
    # The number of analyses that are really plotted is reported, since it may have been capped by the palette.
    my $title_scope = $pipeline_name;
    if ($n_relevant_analysis < scalar(@sorted_analysis_ids)) {
        $title_scope = (($top > 0) and ($top < 1))
            ? sprintf('%.1f%% of %s', 100*$top, $pipeline_name)
            : "the $n_relevant_analysis top-analysis of $pipeline_name";
    }

    my $chart = Chart::Gnuplot->new(
        title => sprintf('Profile of %s', $title_scope).($start_date ? " from $start_date" : "").($end_date ? " to $end_date" : ""),
        timeaxis => 'x',
        legend => {
            position => 'outside right',
            align => 'left',
        },
        xtics => {
            labelfmt => '%b %d\n %H:00',
        },
        bg => {
            color => 'white',
        },
        imagesize => '1400, 800',
        output => $output,
        terminal => $terminal_mapping{$gnuplot_terminal},
        ylabel => $allowed_modes{$mode},
        yrange => [$pseudo_zero_value, undef],
        $logscale ? (logscale => 'y') : (),
    );
    $chart->plot2d(@datasets);

}



__DATA__

=pod

=head1 NAME

    generate_timeline.pl

=head1 SYNOPSIS

    generate_timeline.pl {-url <url> | [-reg_conf <reg_conf>] -reg_alias <reg_alias> [-reg_type <reg_type>] }
                         [-start_date <start_date>] [-end_date <end_date>]
                         [-top <float>]
                         [-mode [workers | memory | cores | wasted_memory]]
                         [-n_core <int>] [-mem <int>]
                         [-output <string>] [-log <int>] [-verbose]

=head1 DESCRIPTION

    This script is used for offline examination of the allocation of workers.

    Based on the command-line parameters 'start_date' and 'end_date', or on the start time of the first
    worker and end time of the last worker (as recorded in the pipeline DB), it pulls the relevant data out
    of the 'worker' table for accurate timing.
    By default, the output is in CSV format (tab-separated, on stdout), to allow extra analysis to be carried out.

    You can optionally ask the script to generate an image with Gnuplot.


=head1 USAGE EXAMPLES

        # Just run it the usual way: every analysis that has some activity is reported in CSV format on stdout
    generate_timeline.pl -url mysql://username:secret@hostname:port/database > timeline.csv

        # Getting only the analyses that account for 99.5% of the global activity, in a PNG file
    generate_timeline.pl -url mysql://username:secret@hostname:port/database -top .995 -output timeline_top995.png

        # Assuming you are only interested in a precise interval (in a PNG file)
    generate_timeline.pl -url mysql://username:secret@hostname:port/database -start_date 2013-06-15T10:34 -end_date 2013-06-15T16:58 -output timeline_June15.png

        # Get the required memory instead of the number of workers
    generate_timeline.pl -url mysql://username:secret@hostname:port/database -mode memory -output timeline_memory.png

=head1 OPTIONS

    -help                   : print this help
    -url <url string>       : url defining where hive database is located
    -reg_conf, -reg_type, -reg_alias    : alternative connection details
    -nosqlvc                : Do not restrict the usage of this script to the current version of eHive
                              Be aware that generate_timeline.pl uses raw SQL queries that may break on different schema versions
    -verbose                : Print some info about the data loaded from the database

    -start_date <date>      : minimal start date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
    -end_date <date>        : maximal end date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
    -top <float>            : maximum number (> 1) or fraction (< 1) of analyses to report in the image (default: 20)
    -output <string>        : output file: its extension must match one of the Gnuplot terminals. Otherwise, the CSV output is produced on stdout
    -mode <string>          : what should be displayed on the y-axis. Allowed values are 'workers' (default), 'memory', 'cores', 'wasted_memory'
    -log <int>              : if non-zero, the y-axis of the image uses a logarithmic scale

    -n_core <int>           : the default number of cores allocated to a worker (default: 1)
    -mem <int>              : the default memory allocated to a worker, in Mb (default: 100)

=head1 CONTACT

    Please contact ehive-users@ebi.ac.uk mailing list with questions/suggestions.

=cut