#!/usr/bin/env perl

# Gets the activity of each analysis over time, as a CSV file or as an image (see the list of formats supported by Gnuplot)

use strict;
use warnings;

    # Finding out own path in order to reference own components (including own modules):
use Cwd            ();
use File::Basename ();
BEGIN {
        # Unless EHIVE_ROOT_DIR is already set in the environment, derive it from the
        # location of this script: it is the parent of the directory holding the script.
    $ENV{'EHIVE_ROOT_DIR'} ||= File::Basename::dirname( File::Basename::dirname( Cwd::realpath($0) ) );
        # Done in a BEGIN block so that the eHive modules below can be found at compile time.
    unshift @INC, $ENV{'EHIVE_ROOT_DIR'}.'/modules';
}


use Getopt::Long;
use List::Util qw(sum);
use POSIX;
use Data::Dumper;

use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive::Utils ('script_usage');

    # The colour palette in main() is a qw() list of '#RRGGBB' strings, which would otherwise
    # trigger the "Possible attempt to put comments in qw() list" warning.
no warnings qw{qw};

main();
exit(0);

# Entry point of the script.
#
# Parses the command-line options, connects to the eHive database, loads the
# activity of the workers (or their pending time), accumulates it per analysis
# along the sorted list of event dates, and finally either prints the timeline
# as tab-separated values on stdout, or plots it with Chart::Gnuplot when
# -output has an extension listed in %terminal_mapping.
#
# Takes no arguments (options are read from @ARGV by GetOptions) and returns
# nothing. Dies on an unknown mode, an unsupported image format, an internal
# inconsistency of the counters, or when there is nothing to plot.
sub main {

    my ($url, $reg_conf, $reg_type, $reg_alias, $nosqlvc, $help, $verbose, $mode, $start_date, $end_date, $output, $top, $default_memory, $default_cores);

    my $options_ok = GetOptions(
                # connect to the database:
            'url=s'                      => \$url,
            'reg_conf|regfile=s'         => \$reg_conf,
            'reg_type=s'                 => \$reg_type,
            'reg_alias|regname=s'        => \$reg_alias,
            'nosqlvc=i'                  => \$nosqlvc,      # using "=i" instead of "!" for consistency with scripts where it is a propagated option

            'verbose!'                   => \$verbose,
            'h|help'                     => \$help,

            'start_date=s'               => \$start_date,
            'end_date=s'                 => \$end_date,
            'mode=s'                     => \$mode,
            'top=f'                      => \$top,
            'mem=i'                      => \$default_memory,
            'n_core=i'                   => \$default_cores,
            'output=s'                   => \$output,
    );

    if ($help) { script_usage(0); }

    # GetOptions() has already printed what is wrong: do not carry on with options silently ignored
    unless ($options_ok) {
        warn "\nERROR: Invalid command-line options\n";
        script_usage(1);
    }

    my $hive_dba;
    if($url or $reg_alias) {
        $hive_dba = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new(
                -url                            => $url,
                -reg_conf                       => $reg_conf,
                -reg_type                       => $reg_type,
                -reg_alias                      => $reg_alias,
                -no_sql_schema_version_check    => $nosqlvc,
        );
    } else {
        warn "\nERROR: Connection parameters (url or reg_conf+reg_alias) need to be specified\n";
        script_usage(1);
    }

    # Check whether $mode is valid. The values are the labels of the y-axis
    my %allowed_modes = (
        workers => 'Number of workers',
        memory => 'Memory asked (Gb)',
        cores => 'Number of CPU cores',
        unused_memory => 'Unused memory (Gb)',
        unused_cores => 'Number of unused CPU cores',
        pending_workers => 'Number of pending workers',
    );
    $mode ||= 'workers';
    # The keys are sorted to get a reproducible error message
    die "Unknown mode '$mode'. Allowed modes are: ".join(", ", sort keys %allowed_modes) unless exists $allowed_modes{$mode};
    # Fallback values for the resource classes that do not define their memory / number of cores
    $default_memory = 100 unless $default_memory;
    $default_cores = 1 unless $default_cores;

    # Palette generated with R: c(brewer.pal(9, "Set1"), brewer.pal(12, "Set3")). #FFFFB3 is removed because it is too close to white
    my @palette = qw(#E41A1C #377EB8 #4DAF4A #984EA3 #FF7F00 #FFFF33 #A65628 #F781BF #999999     #8DD3C7 #BEBADA #FB8072 #80B1D3 #FDB462 #B3DE69 #FCCDE5 #D9D9D9 #BC80BD #CCEBC5 #FFED6F    #2F4F4F);

    # File extension of -output => Gnuplot terminal
    my %terminal_mapping = (
        'emf' => 'emf',
        'png' => 'png',
        'svg' => 'svg',
        'jpg' => 'jpeg',
        'gif' => 'gif',
        'ps'  => 'postscript eps enhanced color',
        'pdf' => 'pdf color enhanced',
    );
    # Stays undefined when the CSV output is wanted (no -output, or -output without an extension)
    my $gnuplot_terminal = undef;
    if ($output and $output =~ /\.(\w+)$/) {
        $gnuplot_terminal = $1;
        die "The format '$gnuplot_terminal' is not currently supported." if not exists $terminal_mapping{$gnuplot_terminal};
        # Only loaded on demand, so that the CSV output does not depend on Chart::Gnuplot
        require Chart::Gnuplot;
    }

    my $dbh = $hive_dba->dbc->db_handle();

    # Get the memory usage and the number of cores from each resource_class (LSF only)
    my %mem_resources = ();
    my %cpu_resources = ();
    {
        my $sql_resource_descriptions = 'SELECT resource_class_id, meadow_type, submission_cmd_args FROM resource_description';
        foreach my $db_entry (@{$dbh->selectall_arrayref($sql_resource_descriptions)}) {
            my ($resource_class_id, $meadow_type, $submission_cmd_args) = @$db_entry;
            if ($meadow_type eq 'LSF') {
                $mem_resources{$resource_class_id} = $1 if $submission_cmd_args =~ m/mem=(\d+)/;
                $cpu_resources{$resource_class_id} = $1 if $submission_cmd_args =~ m/-n\s*(\d+)/;
            }
        }
    }
    warn "mem_resources: ", Dumper \%mem_resources if $verbose;
    warn "cpu_resources: ", Dumper \%cpu_resources if $verbose;

    # Get the memory and the CPU used by each worker. The key is "<meadow_name>_____<process_id>"
    my %used_res = ();
    if (($mode eq 'unused_memory') or ($mode eq 'unused_cores')) {
        my $sql_used_res = 'SELECT meadow_name, process_id, mem_megs, cpu_sec/lifespan_sec FROM lsf_report';
        foreach my $db_entry (@{$dbh->selectall_arrayref($sql_used_res)}) {
            my ($meadow_name, $process_id, $mem_megs, $cpu_usage) = @$db_entry;
            $used_res{$meadow_name."_____".$process_id} = [$mem_megs, $cpu_usage];
        }
        warn scalar(keys %used_res), " process info loaded from lsf_report\n" if $verbose;
    }

    # Get the info about the analysis
    my %default_resource_class = ();
    my %analysis_name = ();
    {
        my $sql_analysis_info = 'SELECT analysis_id, logic_name, resource_class_id FROM analysis_base';
        foreach my $db_entry (@{$dbh->selectall_arrayref($sql_analysis_info)}) {
            my ($analysis_id, $logic_name, $resource_class_id) = @$db_entry;
            $analysis_name{$analysis_id} = $logic_name;
            $default_resource_class{$analysis_id} = $resource_class_id;
        }
    }
    warn "default_resource_class: ", Dumper \%default_resource_class if $verbose;
    warn "analysis_name: ", Dumper \%analysis_name if $verbose;
    warn scalar(keys %analysis_name), " analysis\n" if $verbose;

    # Get the events from the database: $events{$date}{$analysis_id} is the variation
    # of the measured quantity for that analysis at that date
    my %events = ();
    if ($mode ne 'pending_workers') {
        my @tmp_dates = @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(born, "%Y-%m-%dT%T"), DATE_FORMAT(died, "%Y-%m-%dT%T"), analysis_id, meadow_name, process_id, resource_class_id FROM worker WHERE analysis_id IS NOT NULL')};
        warn scalar(@tmp_dates), " events\n" if $verbose;

        foreach my $db_entry (@tmp_dates) {
            my ($birth_date, $death_date, $analysis_id, $meadow_name, $process_id, $resource_class_id) = @$db_entry;
            $resource_class_id = $default_resource_class{$analysis_id} unless $resource_class_id;
            my $offset = 0;

            if ($mode eq 'workers') {
                $offset = 1;
            } elsif ($mode eq 'memory') {
                $offset = ($mem_resources{$resource_class_id} || $default_memory) / 1024.;
            } elsif ($mode eq 'cores') {
                $offset = ($cpu_resources{$resource_class_id} || $default_cores);
            } elsif ($mode eq 'unused_memory') {
                my $process_signature = $meadow_name."_____".$process_id;
                if (exists $used_res{$process_signature}) {
                    # mem_megs may be NULL in the database: count it as 0
                    my $used_memory = $used_res{$process_signature}->[0];
                    $used_memory = 0 unless defined $used_memory;
                    $offset = (($mem_resources{$resource_class_id} || $default_memory) - $used_memory) / 1024.;
                }
            } else {
                # $mode is 'unused_cores'
                my $process_signature = $meadow_name."_____".$process_id;
                if (exists $used_res{$process_signature}) {
                    # cpu_sec/lifespan_sec may be NULL (e.g. division by zero in SQL): count it as 0
                    my $used_cores = $used_res{$process_signature}->[1];
                    $used_cores = 0 unless defined $used_cores;
                    $offset = ($cpu_resources{$resource_class_id} || $default_cores) - $used_cores;
                }
            }
            # Workers that are still alive have no death date and are never subtracted
            $events{$birth_date}{$analysis_id} += $offset if $offset > 0;
            $events{$death_date}{$analysis_id} -= $offset if ($offset > 0) and $death_date;
        }
    } else {
        # A worker is pending from (born - pending_sec) until born
        my @tmp_dates = @{$dbh->selectall_arrayref('SELECT DATE_FORMAT(DATE_SUB(born, INTERVAL pending_sec SECOND), "%Y-%m-%dT%T"), DATE_FORMAT(born, "%Y-%m-%dT%T"), analysis_id FROM worker JOIN lsf_report USING (meadow_name, process_id) WHERE analysis_id IS NOT NULL AND meadow_type = "LSF" AND pending_sec > 0')};
        warn scalar(@tmp_dates), " events\n" if $verbose;

        foreach my $db_entry (@tmp_dates) {
            my ($start_pending, $start_running, $analysis_id) = @$db_entry;
            $events{$start_pending}{$analysis_id} += 1;
            $events{$start_running}{$analysis_id} -= 1;
        }
    }
    # The dates are formatted as ISO8601 strings, so the string order is the chronological order
    my @event_dates = sort {$a cmp $b} (keys %events);
    warn scalar(@event_dates), " dates\n" if $verbose;

    my $max_workers = 0;
    my @data_timings = ();      # list of [date, {analysis_id => value}]
    my %tot_analysis = ();      # analysis_id => sum of its values over all the reported dates

    my $num_curr_workers = 0;
    my %hash_curr_workers = (map {$_ => 0 } (keys %analysis_name));

    foreach my $event_date (@event_dates) {

        last if $end_date and ($event_date gt $end_date);

        my $topup_hash = $events{$event_date};
        foreach my $analysis_id (keys %$topup_hash) {
            $hash_curr_workers{$analysis_id} += $topup_hash->{$analysis_id};
            $num_curr_workers += $topup_hash->{$analysis_id};
        }
        # Due to rounding errors, the sums may be slightly different
        my $sum_curr_workers = sum(0, values %hash_curr_workers);
        die $sum_curr_workers."!=$num_curr_workers" if abs($sum_curr_workers - $num_curr_workers) > 0.05;

        # The events before $start_date still have to be accumulated, but they are not reported
        next if $start_date and ($event_date lt $start_date);

        my %hash_interval = %hash_curr_workers;
        #FIXME It should be normalised by the length of the time interval
        foreach my $analysis_id (keys %hash_interval) {
            $tot_analysis{$analysis_id} += $hash_interval{$analysis_id};
        }

        $max_workers = $num_curr_workers if ($num_curr_workers > $max_workers);

        # We need to repeat the previous value to have an histogram shape
        push @data_timings, [$event_date, $data_timings[-1]->[1]] if @data_timings;
        push @data_timings, [$event_date, \%hash_interval];
    }
    warn $max_workers if $verbose;
    warn Dumper \%tot_analysis if $verbose;

    # sum(0, ...) so that the total is 0 (and not undef) when there is no event at all
    my $total_total = sum(0, values %tot_analysis);

    # Most active analyses first; the analyses without any activity are discarded
    my @sorted_analysis_ids = sort {($tot_analysis{$b} <=> $tot_analysis{$a}) || (lc $analysis_name{$a} cmp lc $analysis_name{$b})} (grep {$tot_analysis{$_}} keys %tot_analysis);
    warn Dumper \@sorted_analysis_ids if $verbose;
    warn Dumper([map {$analysis_name{$_}} @sorted_analysis_ids]) if $verbose;

    if (not $gnuplot_terminal) {
        # CSV output: all the active analyses are reported, -top is not used here
        print join("\t", 'date', "OVERALL_$mode", map {$analysis_name{$_}} @sorted_analysis_ids), "\n";
        print join("\t", 'total', $total_total, map {$tot_analysis{$_}} @sorted_analysis_ids), "\n";
        print join("\t", 'proportion', 'NA', map {$tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";
        my $cum_proportion = 0;
        print join("\t", 'cum_proportion', 'NA', map {$cum_proportion += $tot_analysis{$_}/$total_total} @sorted_analysis_ids), "\n";

        foreach my $row (@data_timings) {
            # "|| 0" because an analysis may be missing from the rows that precede its first event
            print join("\t", $row->[0], sum(values %{$row->[1]}), map {$row->[1]->{$_} || 0} @sorted_analysis_ids)."\n";
        }
        return;
    }

    # Gnuplot would fail on an empty list of datasets: give a meaningful message instead
    die "No activity found in the requested time interval: there is nothing to plot\n" unless @data_timings and @sorted_analysis_ids;

    # Get the number of analysis we want to display
    my $n_relevant_analysis = scalar(@sorted_analysis_ids);
    my $top_is_fraction = 0;
    if ($top and ($top > 0)) {
        if ($top < 1) {
            # Count the analyses until their cumulated proportion reaches $top
            $top_is_fraction = 1;
            my $cum_proportion = 0;
            $n_relevant_analysis = 0;
            foreach my $analysis_id (@sorted_analysis_ids) {
                $n_relevant_analysis++ if $cum_proportion < $top;
                $cum_proportion += $tot_analysis{$analysis_id}/$total_total;
            }
        } elsif ($top < scalar(@sorted_analysis_ids)) {
            # -top accepts floats: only its integer part is meaningful as a count
            $n_relevant_analysis = int($top);
        }
    }
    # cap based on the length of the palette
    my $need_other_analysis = $n_relevant_analysis < scalar(@sorted_analysis_ids) ? 1 : 0;
    if (($n_relevant_analysis+$need_other_analysis) > scalar(@palette)) {
        $n_relevant_analysis = scalar(@palette) - 1;
        $need_other_analysis = 1;
        # The requested fraction is not what is plotted any more: the title must report a count
        $top_is_fraction = 0;
    }
    warn "$n_relevant_analysis relevant analysis\n" if $verbose;

    my @xdata = map {$_->[0]} @data_timings;

    my @datasets = ();

    # Slightly negative value used instead of 0, and as the lower bound of the y-axis
    my $pseudo_zero_value = -$max_workers / 50;

    # Height of the stack made of the given analyses in the given row of @data_timings.
    # Due to rounding errors, values are not always decreased to 0: everything
    # below 0.05 is replaced with $pseudo_zero_value
    my $stack_height = sub {
        my ($row, @analysis_ids) = @_;
        my $height = sum(0, map {$row->[1]->{$_} || 0} @analysis_ids);
        return ($height < 0.05) ? $pseudo_zero_value : $height;
    };

    # The background plot: the sum of all the analysis
    if ($need_other_analysis) {
        my @ydata = map { $stack_height->($_, @sorted_analysis_ids) } @data_timings;
        push @datasets, Chart::Gnuplot::DataSet->new(
            xdata => \@xdata,
            ydata => \@ydata,
            timefmt => '%Y-%m-%dT%H:%M:%S',
            title => 'OTHER',
            style => 'filledcurves x1',
            linewidth => '0',
            color => $palette[$n_relevant_analysis],
        );
    }

    # Each analysis is plotted as the sum of itself and the top ones.
    # The tallest stacks are plotted first, so that the smaller ones are drawn on top of them
    foreach my $i (reverse 1..$n_relevant_analysis) {
        my @stacked_ids = @sorted_analysis_ids[0..($i-1)];
        my @ydata = map { $stack_height->($_, @stacked_ids) } @data_timings;
        my $dataset = Chart::Gnuplot::DataSet->new(
            xdata => \@xdata,
            ydata => \@ydata,
            timefmt => '%Y-%m-%dT%H:%M:%S',
            title => $analysis_name{$sorted_analysis_ids[$i-1]},
            style => 'filledcurves x1',
            linewidth => '0',
            color => $palette[$i-1],
        );
        push @datasets, $dataset;
    }

    # The title does not include the credentials of the connection
    my $safe_database_location = sprintf('%s@%s', $hive_dba->dbc->dbname, $hive_dba->dbc->host || '-');
    my $plotted_analyses_desc = '';
    if ($n_relevant_analysis < scalar(@sorted_analysis_ids)) {
        if ($top_is_fraction) {
            $plotted_analyses_desc = sprintf('the top %.1f%% of ', 100*$top);
        } else {
            # Report the number of analyses that are actually plotted
            $plotted_analyses_desc = "the top $n_relevant_analysis analyses of ";
        }
    }
    my $title = "Profile of ${plotted_analyses_desc}${safe_database_location}";
    $title .= " from $start_date" if $start_date;
    $title .= " to $end_date" if $end_date;

    my $chart = Chart::Gnuplot->new(
        title => $title,
        timeaxis => 'x',
        legend => {
            position => 'outside right',
            align => 'left',
        },
        xtics => {
            labelfmt => '%b %d\n %H:%M',
            along => 'out nomirror',
        },
        bg => {
            color => 'white',
        },
        grid => 'on',
        imagesize => '1400, 800',
        output => $output,
        terminal => $terminal_mapping{$gnuplot_terminal},
        ylabel => $allowed_modes{$mode},
        yrange => [$pseudo_zero_value, undef],
    );
    $chart->plot2d(@datasets);

}



__DATA__

=pod

=head1 NAME

    generate_timeline.pl

=head1 SYNOPSIS

    generate_timeline.pl {-url <url> | [-reg_conf <reg_conf>] -reg_alias <reg_alias> [-reg_type <reg_type>] }
                         [-start_date <start_date>] [-end_date <end_date>]
                         [-top <float>]
                         [-mode [workers | memory | cores | unused_memory | unused_cores | pending_workers]]
                         [-n_core <int>] [-mem <int>]
                         [-output <string>]
                         [-nosqlvc <0|1>] [-verbose]

=head1 DESCRIPTION

    This script is used for offline examination of the allocation of workers.

    Based on the command-line parameters 'start_date' and 'end_date', or on the start time of the first
    worker and end time of the last worker (as recorded in pipeline DB), it pulls the relevant data out
    of the 'worker' table for accurate timing.
    By default, the output is in CSV format, to allow extra analysis to be carried out.

    You can optionally ask the script to generate an image with Gnuplot.


=head1 USAGE EXAMPLES

        # Just run it the usual way: all the analyses that have some activity are reported in CSV format
    generate_timeline.pl -url mysql://username:secret@hostname:port/database > timeline.csv

        # The same, but getting the analyses that fill 99.5% of the global activity in a PNG file
    generate_timeline.pl -url mysql://username:secret@hostname:port/database -top .995 -output timeline_top995.png

        # Assuming you are only interested in a precise interval (in a PNG file)
    generate_timeline.pl -url mysql://username:secret@hostname:port/database -start_date 2013-06-15T10:34 -end_date 2013-06-15T16:58 -output timeline_June15.png

        # Get the required memory instead of the number of workers
    generate_timeline.pl -url mysql://username:secret@hostname:port/database -mode memory -output timeline_memory.png


=head1 OPTIONS

    -help                   : print this help
    -url <url string>       : url defining where hive database is located
    -reg_conf, -reg_type, -reg_alias    : alternative connection details
    -nosqlvc <0|1>          : Do not restrict the usage of this script to the current version of eHive
                              Be aware that generate_timeline.pl uses raw SQL queries that may break on different schema versions
    -verbose                : Print some info about the data loaded from the database

    -start_date <date>      : minimal start date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
    -end_date <date>        : maximal end date of a worker (the format is ISO8601, e.g. '2012-01-25T13:46')
    -top <float>            : maximum number (> 1) or fraction (< 1) of analysis to report (default: 20). It only applies to images: the CSV output always lists all the analyses
    -output <string>        : output file: its extension must match one of the Gnuplot terminals. Otherwise, the CSV output is produced on stdout
    -mode <string>          : what should be displayed on the y-axis. Allowed values are 'workers' (default), 'memory', 'cores', 'unused_memory', 'unused_cores', 'pending_workers'

    -n_core <int>           : the default number of cores allocated to a worker (default: 1)
    -mem <int>              : the default memory allocated to a worker, in Mb (default: 100)

=head1 EXTERNAL DEPENDENCIES

    Chart::Gnuplot

=head1 LICENSE

    Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
    Copyright [2016-2018] EMBL-European Bioinformatics Institute

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License
    is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and limitations under the License.


=head1 CONTACT

    Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates

=cut