Graph.pm 17.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
package Bio::EnsEMBL::Hive::Utils::Graph;

=head1 NAME

Bio::EnsEMBL::Hive::Utils::Graph

=head1 SYNOPSIS

  my $dba = get_hive_dba();
  my $g = Bio::EnsEMBL::Hive::Utils::Graph->new($dba);   # an optional JSON config file name may follow
  my $graphviz = $g->build();
  $graphviz->as_png('location.png');

=head1 DESCRIPTION

This is a module for converting a hive database's flow of analyses, control 
rules and dataflows into the GraphViz model language. This information can
then be converted to an image or to the dot language for further manipulation
in GraphViz.

=head1 METHODS/SUBROUTINES

See inline

=cut

use strict;
use warnings;

use Bio::EnsEMBL::Utils::Scalar qw(check_ref assert_ref);

32
use Bio::EnsEMBL::Hive::Utils::GraphViz;
33
use Bio::EnsEMBL::Hive::Utils::Config;
34

35
36
use base ('Bio::EnsEMBL::Hive::Configurable');

37
38
39

=head2 new()

40
41
42
43
44
45
  Arg [1] : Bio::EnsEMBL::Hive::DBSQL::DBAdaptor $dba;
              The adaptor to get information from
  Arg [2] : (optional) string $config_file_name;
                  A JSON file name to initialize the Config object with.
                  If one is not given then we don't pass anything into Config's constructor,
                  which results in loading configuration from Config's standard locations.
46
47
48
49
50
51
52
  Returntype : Graph object
  Exceptions : If the parameters are not as required
  Status     : Beta
  
=cut

sub new {
  my ($class, $dba, $config_file_name) = @_;

  # Accept being called on either the class name or an existing instance.
  my $self = bless {}, (ref($class) || $class);

  $self->dba($dba);

  # Pass the file name on only when one was supplied; with an empty argument
  # list Config loads its configuration from its standard locations.
  my @config_args = $config_file_name ? ($config_file_name) : ();
  $self->config( Bio::EnsEMBL::Hive::Utils::Config->new(@config_args) );

  # All config_get() lookups made through this object are rooted at 'Graph'.
  $self->context( [ 'Graph' ] );

  return $self;
}


=head2 graph()

  Description : Returns the GraphViz instance used by this module, creating it on the first call
  Returntype : GraphViz
  Exceptions : None
  Status     : Beta

=cut

sub graph {
    my ($self) = @_;

    # Already built on an earlier call: hand back the cached instance.
    return $self->{graph} if exists $self->{graph};

    my $padding = $self->config_get('Pad') || 0;

    # The 'ratio' value deliberately contains a closing quote followed by a
    # 'pad' attribute (the original author's "injection hack") so that the
    # padding is carried along with the ratio setting.
    $self->{graph} = Bio::EnsEMBL::Hive::Utils::GraphViz->new(
        name  => 'AnalysisWorkflow',
        ratio => qq{compress"; pad = "$padding},
    );

    return $self->{graph};
}


=head2 dba()

  Arg [1] : The DBAdaptor instance
  Returntype : DBAdaptor
  Exceptions : If the given object is not a hive DBAdaptor
  Status     : Beta

=cut

sub dba {
  my ($self, $dba) = @_;

  # Getter mode: nothing (or undef) was passed in.
  return $self->{dba} unless defined $dba;

  # Setter mode: only a hive DBAdaptor is acceptable.
  assert_ref($dba, 'Bio::EnsEMBL::Hive::DBSQL::DBAdaptor');
  $self->{dba} = $dba;

  return $self->{dba};
}


105
106
107
108
109
110
# Returns the graph node name used for the analysis with the given id.
sub _analysis_node_name {
    my ($analysis_id) = @_;

    return "analysis_$analysis_id";
}

111
112
113
114
115
116
# Returns the graph node name prefix used for the table with the given name.
sub _table_node_name {
    my ($table_name) = @_;

    return "table_$table_name";
}

117

118
119
120
121
122
123
# Returns the graph node name of the midpoint drawn on the given dataflow rule.
sub _midpoint_name {
    my ($rule_id) = @_;

    return sprintf('dfr_%s_mp', $rule_id);
}

124
125
126
127
128
129
130
131
132
133
134

=head2 build()

  Returntype : The GraphViz object built & populated
  Exceptions : Raised if there are issues with accessing the database
  Description : Builds the graph object and returns it.
  Status     : Beta

=cut

sub build {
    my ($self) = @_;

    my $dba                   = $self->dba();
    my $all_analyses          = $dba->get_AnalysisAdaptor()->fetch_all();
    my $all_ctrl_rules        = $dba->get_AnalysisCtrlRuleAdaptor()->fetch_all();
    my $all_dataflow_rules    = $dba->get_DataflowRuleAdaptor()->fetch_all();

    my %inflow_count = ();          # used to detect sources (nodes with zero inflow)
    my %outflow_rules = ();         # maps from analysis_node_name to a list of all dataflow rules that flow out of it
    my %dfr_flows_into_node = ();   # maps from dfr_id to target analysis_node_name

    foreach my $rule ( @$all_dataflow_rules ) {
        my $target_object = $rule->to_analysis;

            # only analysis targets (objects with a dbID) count as inflow; table targets are skipped here
        if(my $to_id = $target_object->can('dbID') && $target_object->dbID()) {
            my $to_node_name = _analysis_node_name( $to_id );
            $inflow_count{$to_node_name}++;
            $dfr_flows_into_node{$rule->dbID()} = $to_node_name;
        }
        push @{$outflow_rules{ _analysis_node_name($rule->from_analysis_id()) }}, $rule;
    }

    my %subgraph_allocation = ();

        # NB: this is a very approximate algorithm with rough edges!
        # It will not find all start nodes in cyclic components!
    foreach my $source_analysis_node_name ( map { _analysis_node_name( $_->dbID ) } @$all_analyses ) {
        unless($inflow_count{$source_analysis_node_name}) {    # if there is no dataflow into this analysis
            $self->_allocate_to_subgraph(\%outflow_rules, \%dfr_flows_into_node, $source_analysis_node_name, \%subgraph_allocation ); # run the recursion in each component that has a non-cyclic start
        }
    }

    $self->_add_hive_details();
    foreach my $analysis (@$all_analyses) {     # not named $a, which would shadow sort's package variable
        $self->_add_analysis_node($analysis);
    }
    $self->_control_rules( $all_ctrl_rules );
    $self->_dataflow_rules( $all_dataflow_rules, \%subgraph_allocation );

    if($self->config_get('DisplayStretched') ) {

            # The invisible edges are linked to the destination analysis instead of the midpoint.
            # Build a midpoint-name => analysis-node-name map for every funnel rule that targets an analysis.
        my %rule_by_id = map { $_->dbID => $_ } @$all_dataflow_rules;
        my %midpoint_to_analysis = ();
        foreach my $funnel_rule_id ( grep { $_ } map { $_->funnel_dataflow_rule_id } @$all_dataflow_rules ) {
            my $funnel_rule   = $rule_by_id{$funnel_rule_id} or next;   # funnel rule was not among the fetched rules
            my $funnel_target = $funnel_rule->to_analysis;
            next unless $funnel_target && $funnel_target->can('dbID');  # funnel into a table: no analysis to link to

            $midpoint_to_analysis{ _midpoint_name( $funnel_rule_id ) } = _analysis_node_name( $funnel_target->dbID );
        }

        while( my($from, $to) = each %subgraph_allocation) {
            next unless $to;

                # An allocation value may be a funnel midpoint name (translated via the map)
                # or already an analysis node name (used as is). Looking it up unconditionally
                # would yield undef for the latter and produce an edge to an undefined node.
            my $edge_target = exists $midpoint_to_analysis{$to} ? $midpoint_to_analysis{$to} : $to;

            $self->graph->add_edge( $from => $edge_target,
                color     => 'black',
                style     => 'invis',   # toggle visibility by changing 'invis' to 'dashed'
            );
        }
    }

    if($self->config_get('DisplaySemaphoreBoxes') ) {
        $self->graph->subgraphs( \%subgraph_allocation );
        $self->graph->colour_scheme( $self->config_get('Box', 'ColourScheme') );
        $self->graph->colour_offset( $self->config_get('Box', 'ColourOffset') );
    }

    return $self->graph();
}


# Recursively walks the dataflow rules leaving $source_analysis_node_name and
# records in %$subgraph_allocation which "box" each reachable target node and
# each semaphore-related midpoint is allocated to. Allocation is first-come:
# a target that already has an entry keeps it and is not descended into again.
sub _allocate_to_subgraph {
    my ($self, $outflow_rules, $dfr_flows_into_node, $source_analysis_node_name, $subgraph_allocation ) = @_;

    my $source_box = $subgraph_allocation->{ $source_analysis_node_name };  # undef for some analyses

    foreach my $rule ( @{ $outflow_rules->{$source_analysis_node_name} } ) {
        my $target = $rule->to_analysis();

        my $target_node_name;
        if( $target->can('dbID') ) {    # the rule flows into an analysis
            $target_node_name = _analysis_node_name( $rule->to_analysis->dbID() );
        } else {                        # the rule flows into a table
            my $table_prefix = _table_node_name( $target->table_name() );
            my $table_suffix = $self->config_get('DuplicateTables')
                             ? $rule->from_analysis_id()
                             : ($source_box || '');
            $target_node_name = $table_prefix . '_' . $table_suffix;
        }

            # Without a funnel the target inherits the box of the source;
            # with one, a new semaphore starts and the target reports to the
            # node that the funnel rule flows into.
        my $funnel_dataflow_rule_id = $rule->funnel_dataflow_rule_id();
        my $proposed_box            = $source_box;
        if( $funnel_dataflow_rule_id ) {
            $proposed_box = $dfr_flows_into_node->{$funnel_dataflow_rule_id};

            $subgraph_allocation->{ _midpoint_name( $rule->dbID() ) }            = $proposed_box;
            $subgraph_allocation->{ _midpoint_name( $funnel_dataflow_rule_id ) } = $source_box;     # draw the funnel's midpoint outside of the box
        }

        unless( exists $subgraph_allocation->{ $target_node_name } ) {
                # first visit: allocate the target and carry on from there
            $subgraph_allocation->{ $target_node_name } = $proposed_box;
            $self->_allocate_to_subgraph( $outflow_rules, $dfr_flows_into_node, $target_node_name, $subgraph_allocation );
            next;
        }

            # The target was allocated earlier by another branch (possibly to a
            # different box than this branch proposes); the earlier allocation wins.
        if( $funnel_dataflow_rule_id ) {    # correction for multiple entries into the same box (probably needs re-thinking)
            $subgraph_allocation->{ _midpoint_name( $rule->dbID() ) } = $subgraph_allocation->{ $target_node_name };
        }
    }
}

255

256
257
# Adds a plain-text 'Details' node labelled "dbname@host" to the graph,
# but only when the 'DisplayDetails' config option is switched on.
sub _add_hive_details {
  my ($self) = @_;

  my $details_font = $self->config_get('Node', 'Details', 'Font');

  if( $self->config_get('DisplayDetails') ) {
    my $dbc     = $self->dba()->dbc();
    my $db_name = $dbc->dbname;
    my $db_host = $dbc->host || '-';    # '-' stands in for a missing host

    $self->graph()->add_node( 'Details',
      label     => join('@', $db_name, $db_host),
      fontname  => $details_font,
      shape     => 'plaintext',
    );
  }
}

272

273
# Adds one node for the given analysis to the graph.
#
# The node label is an HTML-like table holding the analysis name and dbID and,
# depending on configuration:
#   DisplayStats = 'barchart' : a coloured row of per-status job counts
#   DisplayStats = 'text'     : the textual job-count breakout
#   DisplayJobs               : one row per job (id, status and input_id)
# The node's fill colour and font are taken from the analysis status.
sub _add_analysis_node {
    my ($self, $analysis) = @_;

    my $analysis_stats = $analysis->stats();

    my ($breakout_label, $total_job_count, $count_hash)   = $analysis_stats->job_count_breakout();
    my $analysis_status                                   = $analysis_stats->status;
    my $analysis_status_colour                            = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Colour');
    my $style                                             = $analysis->can_be_empty() ? 'dashed, filled' : 'filled' ;
    my $node_fontname                                     = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Font');
    my $display_stats                                     = $self->config_get('DisplayStats') || '';    # '' avoids "uninitialized" warnings in the eq tests below

        # Text placed inside an HTML-like label must have '&', '<' and '>' escaped;
        # '&' goes first so that the entities produced for '<' and '>' are not escaped again.
    my $escape_html = sub {
        my ($text) = @_;
        $text = '' unless defined $text;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        return $text;
    };

    my $colspan = 0;
    my $bar_chart = '';

    if( $display_stats eq 'barchart' ) {
        foreach my $count_method (qw(SEMAPHORED READY INPROGRESS DONE FAILED)) {
            if(my $count=$count_hash->{lc($count_method).'_job_count'}) {
                my $percentage = $total_job_count ? int(100*$count/$total_job_count) : 0;   # never divide by a zero total
                $bar_chart .= '<td bgcolor="'.$self->config_get('Node', 'JobStatus', $count_method, 'Colour').'" width="'.$percentage.'%">'.$count.lc(substr($count_method,0,1)).'</td>';
                ++$colspan;
            }
        }
        if($colspan != 1) {     # a single bar already shows the total, so only add the '=total' cell otherwise
            $bar_chart .= '<td>='.$total_job_count.'</td>';
            ++$colspan;
        }
    }

    $colspan ||= 1;
    my $analysis_label  = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1"><tr><td colspan="'.$colspan.'">'.$analysis->logic_name().' ('.$analysis->dbID().')</td></tr>';
    if( $display_stats ) {
        $analysis_label    .= qq{<tr><td colspan="$colspan"> </td></tr>};
        if( $display_stats eq 'barchart') {
            $analysis_label    .= qq{<tr>$bar_chart</tr>};
        } elsif( $display_stats eq 'text') {
            $analysis_label    .= qq{<tr><td colspan="$colspan">$breakout_label</td></tr>};
        }
    }

    if( $self->config_get('DisplayJobs') ) {
        my $adaptor = $self->dba->get_AnalysisJobAdaptor();
        my @jobs = sort {$a->dbID <=> $b->dbID} @{ $adaptor->fetch_all_by_analysis_id_status( $analysis->dbID )};

        $analysis_label    .= '<tr><td colspan="'.$colspan.'"> </td></tr>';
        foreach my $job (@jobs) {
            my $status   = $job->status;
            my $job_id   = $job->dbID;
            my $input_id = $escape_html->( $job->input_id );
            $input_id =~ s/[{}]//g;     # curly braces are dropped from the displayed input_id
            $analysis_label    .= qq{<tr><td colspan="$colspan" bgcolor="}.$self->config_get('Node', 'JobStatus', $status, 'Colour').qq{">$job_id [$status]: $input_id</td></tr>};
        }
    }
    $analysis_label    .= '</table>>';

    $self->graph->add_node( _analysis_node_name( $analysis->dbID() ),
        label       => $analysis_label,
        shape       => 'record',
        fontname    => $node_fontname,
        style       => $style,
        fillcolor   => $analysis_status_colour,
    );
}


# Draws one 'tee'-headed edge per control rule, running from the condition
# analysis to the controlled analysis, in the configured control colour.
sub _control_rules {
  my ($self, $all_ctrl_rules) = @_;

  my $control_colour = $self->config_get('Edge', 'Control', 'Colour');
  my $graph          = $self->graph();

  # Control rules always link two analyses, so no other node types need handling here.
  for my $ctrl_rule ( @$all_ctrl_rules ) {
    my $condition_node  = _analysis_node_name( $ctrl_rule->condition_analysis()->dbID() );
    my $controlled_node = _analysis_node_name( $ctrl_rule->ctrled_analysis()->dbID() );

    $graph->add_edge( $condition_node => $controlled_node,
      color     => $control_colour,
      arrowhead => 'tee',
    );
  }
}

355

356
# Draws every dataflow rule as an edge from its source analysis to its target,
# which is either another analysis or a table (a table node is created on demand).
#
# Rules that take part in a semaphore - either as a fan rule that names a funnel,
# or as the funnel rule itself - are drawn as two half-edges joined at a tiny
# "midpoint" node, and each fan midpoint is linked to its funnel's midpoint by a
# dashed edge. All other rules are drawn as a single edge labelled with the
# branch code.
#
#   $all_dataflow_rules  : arrayref of dataflow rule objects
#   $subgraph_allocation : hashref node_name => box allocation; used here only to
#                          name table nodes when 'DuplicateTables' is off
sub _dataflow_rules {
    my ($self, $all_dataflow_rules, $subgraph_allocation) = @_;

    my $graph = $self->graph();
    my $dataflow_colour  = $self->config_get('Edge', 'Data', 'Colour');
    my $semablock_colour = $self->config_get('Edge', 'Semablock', 'Colour');
    my $df_edge_fontname = $self->config_get('Edge', 'Data', 'Font');

        # First pass: mark both ends of every fan->funnel relationship as needing a midpoint.
    my %needs_a_midpoint = ();
    foreach my $rule ( @$all_dataflow_rules ) {
        if(my $funnel_dataflow_rule_id = $rule->funnel_dataflow_rule_id()) {
            $needs_a_midpoint{$rule->dbID()}++;
            $needs_a_midpoint{$funnel_dataflow_rule_id}++;
        }
    }

        # Second pass: draw the rules.
    foreach my $rule ( @$all_dataflow_rules ) {

        my ($rule_id, $from_analysis_id, $branch_code, $funnel_dataflow_rule_id, $to) =
            ($rule->dbID(), $rule->from_analysis_id(), $rule->branch_code(), $rule->funnel_dataflow_rule_id(), $rule->to_analysis());
        my $from_node = _analysis_node_name($from_analysis_id);
        my $to_node;

            # Different treatment for analyses and tables:
        if(check_ref($to, 'Bio::EnsEMBL::Hive::Analysis')) {
            $to_node = _analysis_node_name( $to->dbID() );
        } elsif(check_ref($to, 'Bio::EnsEMBL::Hive::NakedTable')) {

                # The suffix must match the one computed in _allocate_to_subgraph()
                # so that both places refer to the same table node.
            $to_node = _table_node_name($to->table_name) . '_' .
                ( $self->config_get('DuplicateTables') ? $rule->from_analysis_id() : ($subgraph_allocation->{$from_node}||''));

            $self->_add_table_node($to_node, $to->table_name);
        } else {
            warn('Do not know how to handle the type '.ref($to));
            next;
        }

        if($needs_a_midpoint{$rule_id}) {
            my $midpoint_name = _midpoint_name($rule_id);

            $graph->add_node( $midpoint_name,   # midpoint itself
                color       => $dataflow_colour,
                label       => '',
                shape       => 'point',
                fixedsize   => 1,
                width       => 0.01,
                height      => 0.01,
            );
            $graph->add_edge( $from_node => $midpoint_name, # first half of the two-part arrow
                color       => $dataflow_colour,
                arrowhead   => 'none',
                label       => '#'.$branch_code,
                fontname    => $df_edge_fontname,
            );
            $graph->add_edge( $midpoint_name => $to_node,   # second half of the two-part arrow
                color     => $dataflow_colour,
            );
            if($funnel_dataflow_rule_id) {
                $graph->add_edge( $midpoint_name => _midpoint_name($funnel_dataflow_rule_id),   # semaphore inter-rule link
                    color     => $semablock_colour,
                    style     => 'dashed',
                    arrowhead => 'tee',
                    dir       => 'both',
                    arrowtail => 'crow',
                );
            }
        } else {
                # one-part arrow:
            $graph->add_edge( $from_node => $to_node,
                color       => $dataflow_colour,
                label       => '#'.$branch_code,
                fontname    => $df_edge_fontname,
            );
        } # /if($needs_a_midpoint{$rule_id})
    } # /foreach my $rule (@$all_dataflow_rules)
}

440

441
# Adds a node representing a table to the graph.
#
#   $table_node : name of the graph node to create
#   $table_name : name of the database table, shown as the node's title
#
# When the 'DisplayData' config option is on, the label also lists the table's
# column names (sorted) followed by one row per record fetched from the table.
sub _add_table_node {
    my ($self, $table_node, $table_name) = @_;

    my $node_fontname = $self->config_get('Node', 'Table', 'Font');
    my $display_data  = $self->config_get('DisplayData');

        # Text placed inside an HTML-like label must have '&', '<' and '>' escaped
        # ('&' first, so the generated entities are not escaped again).
        # An undefined value (NULL in the table) is shown as an empty cell.
    my $escape_html = sub {
        my ($text) = @_;
        $text = '' unless defined $text;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        return $text;
    };

    my @column_names = ();
    my $table_data   = [];

    if( $display_data ) {
        my $adaptor = $self->dba->get_NakedTableAdaptor();
        $adaptor->table_name( $table_name );

        @column_names = sort keys %{$adaptor->column_set};
        $table_data   = $adaptor->fetch_all( ) || [];
    }

        # Never emit colspan="0", even if no columns were found.
    my $colspan = scalar(@column_names) || 1;

    my $table_label = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1"><tr><td colspan="'.$colspan.'">'.$table_name.'</td></tr>';

    if( $display_data ) {
        $table_label .= '<tr><td colspan="'.$colspan.'"> </td></tr>';
        $table_label .= '<tr>'.join('', map { '<td bgcolor="lightblue" border="1">'.$escape_html->($_).'</td>' } @column_names).'</tr>';
        foreach my $row (@$table_data) {
            $table_label .= '<tr>'.join('', map { '<td>'.$escape_html->($_).'</td>' } @{$row}{@column_names}).'</tr>';
        }
    }
    $table_label .= '</table>>';

    $self->graph()->add_node( $table_node,
        label => $table_label,
        shape => 'record',
        fontname => $node_fontname,
        color => $self->config_get('Node', 'Table', 'Colour'),
    );
}

Leo Gordon's avatar
Leo Gordon committed
475
1;