=pod

=head1 NAME

    Bio::EnsEMBL::Hive::Utils::Graph

=head1 SYNOPSIS

    my $dba = get_hive_dba();
    my $g = Bio::EnsEMBL::Hive::Utils::Graph->new($dba);
    my $graphviz = $g->build();
    $graphviz->as_png('location.png');

=head1 DESCRIPTION

    This is a module for converting a hive database's flow of analyses, control
    rules and dataflows into the GraphViz model language. This information can
    then be converted to an image or to the dot language for further manipulation
    in GraphViz.

=head1 LICENSE

    Copyright [1999-2016] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License
    is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and limitations under the License.

=head1 CONTACT

  Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates

=head1 APPENDIX

    The rest of the documentation details each of the object methods.
    Internal methods are usually preceded with a _

=cut

package Bio::EnsEMBL::Hive::Utils::Graph;

48
49
50
use strict;
use warnings;

51
use Bio::EnsEMBL::Hive::Utils::GraphViz;
52
use Bio::EnsEMBL::Hive::Utils::Config;
53

54
55
use base ('Bio::EnsEMBL::Hive::Configurable');

56
57
58

=head2 new()

  Arg [1] : Bio::EnsEMBL::Hive::DBSQL::DBAdaptor $dba;
              The adaptor to get information from
  Arg [2] : (optional) string $config_file_name;
                  A JSON file name to initialize the Config object with.
                  If one is not given then we don't pass anything into Config's constructor,
                  which results in loading configuration from Config's standard locations.
  Returntype : Graph object
  Exceptions : If the parameters are not as required
  Status     : Beta

=cut

sub new {
    my ($class, $dba, $config_file_name) = @_;

    # Accept either a class name or an existing instance as the invocant.
    my $self = bless {}, ref($class) || $class;

    $self->dba($dba);

    # Forward the file name only when one was supplied; otherwise the Config
    # constructor is called without arguments.
    my @config_args = $config_file_name ? ( $config_file_name ) : ();
    $self->config( Bio::EnsEMBL::Hive::Utils::Config->new( @config_args ) );

    # Configuration lookups made through this object are rooted at 'Graph'.
    $self->context( [ 'Graph' ] );

    return $self;
}


=head2 graph()

  Arg [1] : None; the GraphViz instance is created on first call and cached
  Returntype : GraphViz
  Exceptions : None
  Status     : Beta

=cut

sub graph {
    my ($self) = @_;

    # Build the GraphViz wrapper on first access only and cache it on the instance.
    unless( exists $self->{graph} ) {
        my $padding = $self->config_get('Pad') || 0;

        # Injection hack kept from the original: the 'pad' setting is carried
        # inside the quoted value of the 'ratio' attribute.
        my $ratio_with_pad = qq{compress"; pad = "$padding};

        $self->{graph} = Bio::EnsEMBL::Hive::Utils::GraphViz->new(
            name  => 'AnalysisWorkflow',
            ratio => $ratio_with_pad,
        );
    }

    return $self->{graph};
}


=head2 dba()

  Arg [1] : The DBAdaptor instance
  Returntype : DBAdaptor
  Exceptions : None (the given value is stored as is, without type checking)
  Status     : Beta

=cut

sub dba {
    my ($self, @new_value) = @_;

    # Acts as a setter only when an argument was actually passed
    # (an explicit undef therefore clears the stored value).
    $self->{dba} = $new_value[0] if @new_value;

    return $self->{dba};
}


125
126
127
128
129
130
# Plain function (not a method): maps an analysis dbID onto its graph node name.
sub _analysis_node_name {
    my ($analysis_id) = @_;

    return "analysis_$analysis_id";
}

131
132
133
134
135
136
# Plain function (not a method): maps a table name onto the prefix of its graph node name.
sub _table_node_name {
    my ($table_name) = @_;

    return "table_$table_name";
}

137

138
139
140
141
142
143
# Plain function (not a method): maps a dataflow rule dbID onto the name of its midpoint node.
sub _midpoint_name {
    my ($rule_id) = @_;

    return join('', 'dfr_', $rule_id, '_mp');
}

144
145
146
147
148
149
150
151
152
153
154

=head2 build()

  Returntype : The GraphViz object built & populated
  Exceptions : Raised if there are issues with accessing the database
  Description : Builds the graph object and returns it.
  Status     : Beta

=cut

sub build {
    my ($self) = @_;

    # Fetch everything the diagram is built from.
    my $dba                = $self->dba();
    my $all_analyses       = $dba->get_AnalysisAdaptor()->fetch_all();
    my $all_ctrl_rules     = $dba->get_AnalysisCtrlRuleAdaptor()->fetch_all();
    my $all_dataflow_rules = $dba->get_DataflowRuleAdaptor()->fetch_all();

    my %inflow_count        = ();   # used to detect sources (nodes with zero inflow)
    my %outflow_rules       = ();   # maps from analysis_node_name to a list of all dataflow rules that flow out of it
    my %dfr_flows_into_node = ();   # maps from dfr_id to target analysis_node_name

    foreach my $rule ( @$all_dataflow_rules ) {
        my $target_object = $rule->to_analysis;

        # Only targets that expose a (true) dbID are counted as inflow into an analysis node.
        if(my $to_id = $target_object->can('dbID') && $target_object->dbID()) {
            my $to_node_name = _analysis_node_name( $to_id );
            $inflow_count{$to_node_name}++;
            $dfr_flows_into_node{$rule->dbID()} = $to_node_name;
        }
        push @{$outflow_rules{ _analysis_node_name($rule->from_analysis_id()) }}, $rule;
    }

    my %subgraph_allocation = ();   # node name => midpoint name of the box it belongs to (or undef for "outside")

        # NB: this is a very approximate algorithm with rough edges!
        # It will not find all start nodes in cyclic components!
    foreach my $source_analysis_node_name ( map { _analysis_node_name( $_->dbID ) } @$all_analyses ) {
        unless($inflow_count{$source_analysis_node_name}) {    # if there is no dataflow into this analysis
                # run the recursion in each component that has a non-cyclic start
            $self->_allocate_to_subgraph(\%outflow_rules, \%dfr_flows_into_node, $source_analysis_node_name, \%subgraph_allocation );
        }
    }

    # Populate the graph: details node, analysis nodes, then both kinds of edges.
    $self->_add_hive_details();
    foreach my $analysis (@$all_analyses) {     # not named $a, which is reserved for sort blocks
        $self->_add_analysis_node($analysis);
    }
    $self->_control_rules( $all_ctrl_rules );
    $self->_dataflow_rules( $all_dataflow_rules, \%subgraph_allocation );

    if($self->config_get('DisplayStretched') ) {
            # Add an invisible edge from every allocated analysis node to the node named by its allocation.
            # (A midpoint-to-analysis lookup used to be computed here but was never consulted;
            #  it has been dropped, so the edges drawn are exactly the same as before.)
        while( my($from, $to) = each %subgraph_allocation) {
            if($to && $from=~/^analysis/) {
                $self->graph->add_edge( $from => $to,
                    color     => 'black',
                    style     => 'invis',   # toggle visibility by changing 'invis' to 'dashed'
                );
            }
        }
    }

    if($self->config_get('DisplaySemaphoreBoxes') ) {
        $self->graph->subgraphs( \%subgraph_allocation );
        $self->graph->colour_scheme( $self->config_get('Box', 'ColourScheme') );
        $self->graph->colour_offset( $self->config_get('Box', 'ColourOffset') );
    }

    return $self->graph();
}


# Recursively walks the dataflow rules leaving $source_analysis_node_name and records in
# %$subgraph_allocation which box (identified by a funnel midpoint name, or undef) each
# reached node and fan/funnel midpoint belongs to.
# $dfr_flows_into_node is not consulted here; it is only handed on to the recursive call.
sub _allocate_to_subgraph {
    my ($self, $outflow_rules, $dfr_flows_into_node, $source_analysis_node_name, $subgraph_allocation ) = @_;

    my $source_allocation = $subgraph_allocation->{ $source_analysis_node_name };  # for some analyses it will be undef

    RULE: foreach my $rule ( @{ $outflow_rules->{$source_analysis_node_name} } ) {
        my $target_object = $rule->to_analysis();
        my $target_node_name;

        if(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Analysis')) {
            $target_node_name = _analysis_node_name( $rule->to_analysis->dbID() );
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::NakedTable')) {
            my $table_prefix = _table_node_name($target_object->table_name());
            my $table_suffix = $self->config_get('DuplicateTables')
                                    ? $rule->from_analysis_id()
                                    : ($source_allocation||'');
            $target_node_name = $table_prefix . '_' . $table_suffix;
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Accumulator')) {
            next RULE;      # accumulators are not allocated
        } else {
            warn('Do not know how to handle the type '.ref($target_object));
            next RULE;
        }

            # By default the target inherits the allocation of the source ...
        my $funnel_dataflow_rule_id = $rule->funnel_dataflow_rule_id();
        my $proposed_allocation     = $source_allocation;

        if( $funnel_dataflow_rule_id ) {
                # ... but a rule with a funnel starts a new semaphore, identified by the funnel rule's midpoint:
            my $funnel_midpoint_name = _midpoint_name( $funnel_dataflow_rule_id );
            $proposed_allocation     = $funnel_midpoint_name;

            $subgraph_allocation->{ _midpoint_name( $rule->dbID() ) } = $proposed_allocation;
            $subgraph_allocation->{ $funnel_midpoint_name }           = $source_allocation;    # draw the funnel's midpoint outside of the box
        }

            # we allocate on first-come basis at the moment:
        if( exists $subgraph_allocation->{ $target_node_name } ) {  # already allocated?
                # The existing allocation wins whether or not it agrees with the proposed one.
            if($funnel_dataflow_rule_id) {  # correction for multiple entries into the same box (probably needs re-thinking)
                $subgraph_allocation->{ _midpoint_name( $rule->dbID() ) } = $subgraph_allocation->{ $target_node_name };
            }
        } else {
            $subgraph_allocation->{ $target_node_name } = $proposed_allocation;

            $self->_allocate_to_subgraph( $outflow_rules, $dfr_flows_into_node, $target_node_name, $subgraph_allocation );
        }
    }
}

280

281
282
# Adds a plain-text 'Details' node labelled "dbname@host" when the 'DisplayDetails'
# configuration option is set; a false host is shown as '-'.
sub _add_hive_details {
    my ($self) = @_;

    my $details_font = $self->config_get('Node', 'Details', 'Font');

    if( $self->config_get('DisplayDetails') ) {
        my $dbc = $self->dba()->dbc();

        my %node_attributes = (
            label    => sprintf('%s@%s', $dbc->dbname, $dbc->host || '-'),
            fontname => $details_font,
            shape    => 'plaintext',
        );

        $self->graph()->add_node( 'Details', %node_attributes );
    }
}

297

298
# Adds one 'record'-shaped node for the given analysis.
#
# The node label is an HTML-like table made of:
#   * a header row with the analysis' logic_name and dbID;
#   * optionally (config 'DisplayStats' = 'barchart' or 'text') a row of job counts;
#   * optionally (config 'DisplayJobs' = N) up to N jobs with their status and input_id.
# Colours and fonts are looked up in the configuration by analysis/job status.
sub _add_analysis_node {
    my ($self, $analysis) = @_;

    my $analysis_stats = $analysis->stats();

    my ($breakout_label, $total_job_count, $count_hash)   = $analysis_stats->job_count_breakout();
    my $analysis_status                                   = $analysis_stats->status;
    my $analysis_status_colour                            = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Colour');
    my $style                                             = $analysis->can_be_empty() ? 'dashed, filled' : 'filled' ;
    my $node_fontname                                     = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Font');
        # An unset option becomes '' so that the string comparisons below do not warn about undef:
    my $display_stats                                     = $self->config_get('DisplayStats') || '';

    my $colspan = 0;
    my $bar_chart = '';

    if( $display_stats eq 'barchart' ) {
            # one cell per non-zero job count, its width proportional to the share of the total
        foreach my $count_method (qw(SEMAPHORED READY INPROGRESS DONE FAILED)) {
            if(my $count=$count_hash->{lc($count_method).'_job_count'}) {
                $bar_chart .= '<td bgcolor="'.$self->config_get('Node', 'JobStatus', $count_method, 'Colour').'" width="'.int(100*$count/$total_job_count).'%">'.$count.lc(substr($count_method,0,1)).'</td>';
                ++$colspan;
            }
        }
            # the total is redundant when there is exactly one category
        if($colspan != 1) {
            $bar_chart .= '<td>='.$total_job_count.'</td>';
            ++$colspan;
        }
    }

    $colspan ||= 1;
    my $analysis_label  = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1"><tr><td colspan="'.$colspan.'">'.$analysis->logic_name().' ('.$analysis->dbID().')</td></tr>';
    if( $display_stats ) {
        $analysis_label    .= qq{<tr><td colspan="$colspan"> </td></tr>};
        if( $display_stats eq 'barchart') {
            $analysis_label    .= qq{<tr>$bar_chart</tr>};
        } elsif( $display_stats eq 'text') {
            $analysis_label    .= qq{<tr><td colspan="$colspan">$breakout_label</td></tr>};
        }
    }

    if( my $job_limit = $self->config_get('DisplayJobs') ) {
        my $adaptor = $self->dba->get_AnalysisJobAdaptor();

            # one job more than the limit is requested in order to find out whether there are more
        my @jobs = sort {$a->dbID <=> $b->dbID} @{ $adaptor->fetch_some_by_analysis_id_limit( $analysis->dbID, $job_limit+1 )};

        my $hit_limit;
        if(scalar(@jobs)>$job_limit) {
            pop @jobs;
            $hit_limit = 1;
        }

        $analysis_label    .= '<tr><td colspan="'.$colspan.'"> </td></tr>';
        foreach my $job (@jobs) {
            my $input_id = $job->input_id;
            my $status   = $job->status;
            my $job_id   = $job->dbID;

                # The text ends up inside an HTML-like label, so markup characters have to be escaped.
                # '&' must go first, otherwise the entities produced by the next two lines would be re-escaped.
            $input_id=~s/&/&amp;/g;
            $input_id=~s/\>/&gt;/g;
            $input_id=~s/\</&lt;/g;
            $input_id=~s/\{|\}//g;      # curly braces are dropped altogether
            $analysis_label    .= qq{<tr><td colspan="$colspan" bgcolor="}.$self->config_get('Node', 'JobStatus', $status, 'Colour').qq{">$job_id [$status]: $input_id</td></tr>};
        }

        if($hit_limit) {
            $analysis_label    .= qq{<tr><td colspan="$colspan">[ and }.($total_job_count-$job_limit).qq{ more ]</td></tr>};
        }
    }
    $analysis_label    .= '</table>>';

    $self->graph->add_node( _analysis_node_name( $analysis->dbID() ),
        label       => $analysis_label,
        shape       => 'record',
        fontname    => $node_fontname,
        style       => $style,
        fillcolor   => $analysis_status_colour,
    );
}


# Draws one 'tee'-headed edge per control rule, from the condition analysis to the controlled analysis.
sub _control_rules {
    my ($self, $all_ctrl_rules) = @_;

    my $control_colour = $self->config_get('Edge', 'Control', 'Colour');
    my $graph          = $self->graph();

    # The control rules are always from and to an analysis so no need to search for odd cases here
    foreach my $ctrl_rule ( @{ $all_ctrl_rules } ) {
        my $condition_node = _analysis_node_name( $ctrl_rule->condition_analysis()->dbID() );
        my $ctrled_node    = _analysis_node_name( $ctrl_rule->ctrled_analysis()->dbID() );

        $graph->add_edge( $condition_node => $ctrled_node,
            color     => $control_colour,
            arrowhead => 'tee',
        );
    }
}

390

391
# Draws all dataflow rules as edges.
#
#   $all_dataflow_rules  - arrayref of dataflow rule objects
#   $subgraph_allocation - hashref (node name => allocation) used to name table nodes and to
#                          pick the target node of accumulator edges
#
# Rules that take part in a semaphore (either as a fan rule that has a funnel, or as the funnel
# itself) are drawn in two halves joined by a tiny 'midpoint' node, so that the fan's midpoint can
# be linked to the funnel's midpoint. All other rules are drawn as a single edge.
sub _dataflow_rules {
    my ($self, $all_dataflow_rules, $subgraph_allocation) = @_;

    my $graph = $self->graph();
    my $dataflow_colour     = $self->config_get('Edge', 'Data', 'Colour');
    my $semablock_colour    = $self->config_get('Edge', 'Semablock', 'Colour');
    my $accu_colour         = $self->config_get('Edge', 'Accu', 'Colour');
    my $df_edge_fontname    = $self->config_get('Edge', 'Data', 'Font');

        # First pass: find out which rules need a midpoint (both ends of every fan->funnel link)
    my %needs_a_midpoint = ();
    foreach my $rule ( @$all_dataflow_rules ) {
        if(my $funnel_dataflow_rule_id = $rule->funnel_dataflow_rule_id()) {
            $needs_a_midpoint{$rule->dbID()}++;
            $needs_a_midpoint{$funnel_dataflow_rule_id}++;
        }
    }

        # Second pass: draw the edges
    foreach my $rule ( @$all_dataflow_rules ) {

        my ($rule_id, $from_analysis_id, $branch_code, $funnel_dataflow_rule_id, $to) =
            ($rule->dbID(), $rule->from_analysis_id(), $rule->branch_code(), $rule->funnel_dataflow_rule_id(), $rule->to_analysis());
        my $from_node = _analysis_node_name($from_analysis_id);
        my $to_node;

            # Different treatment for analyses and tables:
        if(UNIVERSAL::isa($to, 'Bio::EnsEMBL::Hive::Analysis')) {
            $to_node = _analysis_node_name( $to->dbID() );
        } elsif(UNIVERSAL::isa($to, 'Bio::EnsEMBL::Hive::NakedTable')) {

                # the suffix has to agree with the one computed in _allocate_to_subgraph()
            $to_node = _table_node_name($to->table_name) . '_' .
                ( $self->config_get('DuplicateTables') ? $rule->from_analysis_id() : ($subgraph_allocation->{$from_node}||''));

            $self->_add_table_node($to_node, $to->table_name);
        } elsif(UNIVERSAL::isa($to, 'Bio::EnsEMBL::Hive::Accumulator')) {

                # NOTE(review): this is undef when the source analysis has no allocation - confirm the wrapper copes with that
            $to_node = $subgraph_allocation->{$from_node};

        } else {
            warn('Do not know how to handle the type '.ref($to));
            next;
        }

        if($needs_a_midpoint{$rule_id}) {
            my $midpoint_name = _midpoint_name($rule_id);

            $graph->add_node( $midpoint_name,   # midpoint itself
                color       => $dataflow_colour,
                label       => '',
                shape       => 'point',
                fixedsize   => 1,
                width       => 0.01,
                height      => 0.01,
            );
            $graph->add_edge( $from_node => $midpoint_name, # first half of the two-part arrow
                color       => $dataflow_colour,
                arrowhead   => 'none',
                fontname    => $df_edge_fontname,
                fontcolor   => $dataflow_colour,
                label       => '#'.$branch_code,
            );
            $graph->add_edge( $midpoint_name => $to_node,   # second half of the two-part arrow
                color     => $dataflow_colour,
            );
            if($funnel_dataflow_rule_id) {
                $graph->add_edge( $midpoint_name => _midpoint_name($funnel_dataflow_rule_id),   # semaphore inter-rule link
                    color     => $semablock_colour,
                    style     => 'dashed',
                    arrowhead => 'tee',
                    dir       => 'both',
                    arrowtail => 'crow',
                );
            }
        } elsif(UNIVERSAL::isa($to, 'Bio::EnsEMBL::Hive::Accumulator')) {
                # one-part dashed arrow:
            $graph->add_edge( $from_node => $to_node,
                color       => $accu_colour,
                style       => 'dashed',
                label       => $to->struct_name().'#'.$branch_code,
                fontname    => $df_edge_fontname,
                fontcolor   => $accu_colour,
                dir         => 'both',
                arrowtail   => 'crow',
            );
        } else {
                # one-part solid arrow:
            $graph->add_edge( $from_node => $to_node,
                color       => $dataflow_colour,
                fontname    => $df_edge_fontname,
                fontcolor   => $dataflow_colour,
                label       => '#'.$branch_code,
            );
        } # /if($needs_a_midpoint{$rule_id})
    } # /foreach my $rule (@$all_dataflow_rules)

}

492

493
# Adds one 'record'-shaped node for a table.
# The label always carries the table name; when the 'DisplayData' option is set (to a row limit)
# it also carries the sorted column names and up to that many rows of data.
sub _add_table_node {
    my ($self, $table_node, $table_name) = @_;

    my $node_fontname    = $self->config_get('Node', 'Table', 'Font');
    my (@column_names, $columns, $table_data, $data_limit, $hit_limit);

    if( $data_limit = $self->config_get('DisplayData') ) {
        my $adaptor = $self->dba->get_NakedTableAdaptor();
        $adaptor->table_name( $table_name );

        @column_names = sort keys %{$adaptor->column_set};
        $columns      = scalar(@column_names);

            # one row more than the limit is requested in order to find out whether there is more data
        $table_data   = $adaptor->fetch_all( 'LIMIT '.($data_limit+1) );

        if(scalar(@$table_data)>$data_limit) {
            pop @$table_data;
            $hit_limit = 1;
        }
    }

        # The label is collected row by row and glued together at the end:
    my @label_rows = ( '<tr><td colspan="'.($columns||1).'">'.$table_name.'</td></tr>' );

    if( $self->config_get('DisplayData') ) {
        push @label_rows, '<tr><td colspan="'.$columns.'"> </td></tr>';

        my @header_cells = map { qq{<td bgcolor="lightblue" border="1">$_</td>} } @column_names;
        push @label_rows, '<tr>'.join('', @header_cells).'</tr>';

        foreach my $row (@$table_data) {
            my @data_cells = map { qq{<td>$_</td>} } @{$row}{@column_names};
            push @label_rows, '<tr>'.join('', @data_cells).'</tr>';
        }

        if($hit_limit) {
            push @label_rows, qq{<tr><td colspan="$columns">[ more data ]</td></tr>};
        }
    }

    my $table_label = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1">'
                    . join('', @label_rows)
                    . '</table>>';

    $self->graph()->add_node( $table_node,
        label => $table_label,
        shape => 'record',
        fontname => $node_fontname,
        color => $self->config_get('Node', 'Table', 'Colour'),
    );
}

# A module file must end with a true value:
1;