package Bio::EnsEMBL::Hive::Utils::Graph;

=head1 NAME

Bio::EnsEMBL::Hive::Utils::Graph

=head1 SYNOPSIS

  my $dba = get_hive_dba();
  my $g = Bio::EnsEMBL::Hive::Utils::Graph->new($dba);
  my $graphviz = $g->build();
  $graphviz->as_png('location.png');

=head1 DESCRIPTION

This is a module for converting a hive database's flow of analyses, control 
rules and dataflows into the GraphViz model language. This information can
then be converted to an image or to the dot language for further manipulation
in GraphViz.

=head1 METHODS/SUBROUTINES

See inline

=cut

use strict;
use warnings;

use Bio::EnsEMBL::Utils::Scalar qw(check_ref assert_ref);

use Bio::EnsEMBL::Hive::Utils::GraphViz;
use Bio::EnsEMBL::Hive::Utils::Config;

=head2 new()

  Arg [1] : Bio::EnsEMBL::Hive::DBSQL::DBAdaptor $dba;
              The adaptor to get information from
  Arg [2] : (optional) string $config_file_name;
                  A JSON file name to initialize the Config object with.
                  If one is not given then we don't pass anything into Config's constructor,
                  which results in loading configuration from Config's standard locations.
  Description : Positional constructor: stores the adaptor via dba() and a freshly
                created Config object via config().
  Returntype : Graph object
  Exceptions : If a defined $dba is not a Bio::EnsEMBL::Hive::DBSQL::DBAdaptor
                (the check lives in dba(); an undefined $dba is accepted silently)
  Status     : Beta

=cut

sub new {
  my ($class, $dba, $config_file_name) = @_;

  # Allow both Class->new(...) and $object->new(...).
  my $self = bless({}, ref($class) || $class);

  $self->dba($dba);

  # Pass the file name only when one was given, so that Config falls back to its own defaults otherwise.
  my $config = Bio::EnsEMBL::Hive::Utils::Config->new( $config_file_name ? $config_file_name : () );
  $self->config($config);

  return $self;
}


=head2 graph()

  Arg [1] : None
  Description : Lazy getter for the GraphViz wrapper this module draws into.
                The object is created on the first call and cached in $self->{graph};
                later calls return the cached instance.
  Returntype : Bio::EnsEMBL::Hive::Utils::GraphViz
  Exceptions : None raised by this method itself
  Status     : Beta

=cut

sub graph {
  my ($self) = @_;

  # 'exists' (not 'defined') so that the constructor is attempted only once per object.
  if(! exists $self->{graph}) {
    $self->{graph} = Bio::EnsEMBL::Hive::Utils::GraphViz->new( name => 'AnalysisWorkflow', ratio => 'compress' );
  }
  return $self->{graph};
}


=head2 dba()

  Arg [1] : (optional) The DBAdaptor instance to store
  Description : Getter/setter. With a defined argument the adaptor is type-checked
                and stored; in every case the currently stored adaptor is returned.
  Returntype : DBAdaptor
  Exceptions : If the given object is not a hive DBAdaptor
  Status     : Beta

=cut

sub dba {
  my $self        = shift;
  my $new_adaptor = shift;

  # Pure getter call: nothing to validate or store.
  return $self->{dba} unless defined $new_adaptor;

  assert_ref($new_adaptor, 'Bio::EnsEMBL::Hive::DBSQL::DBAdaptor');
  $self->{dba} = $new_adaptor;

  return $self->{dba};
}


=head2 config()

  Arg [1] : (optional) The graph configuration object
  Description : Getter/setter. With a defined argument the object is type-checked
                and stored; in every case the currently stored object is returned.
  Returntype : Bio::EnsEMBL::Hive::Utils::Config.
  Exceptions : If the object given is not of the required type
  Status     : Beta

=cut

sub config {
  my ($self, $config) = @_;
  if(defined $config) {
    assert_ref($config, 'Bio::EnsEMBL::Hive::Utils::Config');
    $self->{config} = $config;
  }
  return $self->{config};
}

# Plain function (not a method): turns an analysis dbID into the name of its graph node.
sub _analysis_node_name {
    my ($id) = @_;

    return "analysis_${id}";
}

# Plain function (not a method): turns a dataflow rule dbID into the name of that rule's midpoint node.
sub _midpoint_name {
    my ($dfr_id) = @_;

    return "dfr_${dfr_id}_mp";
}

=head2 build()

  Returntype : The GraphViz object built & populated
  Exceptions : Raised if there are issues with accessing the database
  Description : Builds the graph object and returns it.
                Fetches all analyses, control rules and dataflow rules, allocates nodes
                to semaphore sub-graphs, adds the nodes and edges, and then applies the
                optional 'DisplayStretched' and 'DisplaySemaphoreBoxes' decorations.
  Status     : Beta

=cut

sub build {
    my ($self) = @_;

    my $dba                   = $self->dba();
    my $all_analyses          = $dba->get_AnalysisAdaptor()->fetch_all();
    my $all_ctrl_rules        = $dba->get_AnalysisCtrlRuleAdaptor()->fetch_all();
    my $all_dataflow_rules    = $dba->get_DataflowRuleAdaptor()->fetch_all();

    my %inflow_count = ();    # used to detect sources (nodes with zero inflow)
    my %outflow_rules = ();   # maps from analysis_node_name to a list of all dataflow rules that flow out of it
    my %dfr_flows_into= ();   # maps from dfr_id to target analysis_node_name

    foreach my $rule ( @$all_dataflow_rules ) {
            # only rules that target something with a dbID (an analysis) count as inflow:
        if(my $to_id = $rule->to_analysis->can('dbID') && $rule->to_analysis->dbID()) {
            my $to_node_name    = _analysis_node_name( $to_id );
            $inflow_count{$to_node_name}++;
            $dfr_flows_into{$rule->dbID()} = $to_node_name;
        }
        push @{$outflow_rules{ _analysis_node_name($rule->from_analysis_id()) }}, $rule;
    }

    my %subgraph_allocation = ();

        # NB: this is a very approximate algorithm with rough edges!
        # It will not find all start nodes in cyclic components!
    foreach my $analysis_id ( map { $_->dbID } @$all_analyses ) {
        my $analysis_node_name =  _analysis_node_name( $analysis_id );
        unless($inflow_count{$analysis_node_name}) {
            $self->_allocate_to_subgraph(\%outflow_rules, \%dfr_flows_into, $analysis_node_name, \%subgraph_allocation ); # run the recursion in each component that has a non-cyclic start
        }
    }

    $self->_add_hive_details();
    foreach my $analysis (@$all_analyses) {     # not '$a': that would mask sort's package variable
        $self->_add_analysis_node($analysis);
    }
    $self->_control_rules( $all_ctrl_rules );
    $self->_dataflow_rules( $all_dataflow_rules );

    if($self->config->get('Graph', 'DisplayStretched') ) {

        # The invisible edges will be linked to the destination analysis instead of the midpoint
        my %rule_by_id = map { $_->dbID => $_ } @$all_dataflow_rules;

        my %midpoint_to_analysis = ();
        foreach my $funnel_rule_id ( grep {$_} map {$_->funnel_dataflow_rule_id} @$all_dataflow_rules ) {
            my $funnel_rule = $rule_by_id{$funnel_rule_id} or next;     # funnel rule not among the fetched rules
            my $funnel_target = $funnel_rule->to_analysis;
            next unless $funnel_target->can('dbID');                    # same guard as used for inflow counting above
            $midpoint_to_analysis{ _midpoint_name( $funnel_rule_id ) } = _analysis_node_name( $funnel_target->dbID );
        }

            # sorted keys give the same edge order on every run (hash order is randomized)
        foreach my $from ( sort keys %subgraph_allocation ) {
            my $to = $subgraph_allocation{$from} or next;       # node lives outside of any box
            my $target = $midpoint_to_analysis{$to};
            next unless defined $target;                        # never draw an edge to an unknown node

            $self->graph->add_edge( $from => $target,
                color     => 'black',
                style     => 'invis',   # toggle visibility by changing 'invis' to 'dashed'
            );
        }
    }

    if($self->config->get('Graph', 'DisplaySemaphoreBoxes') ) {
        $self->graph->subgraphs( \%subgraph_allocation );
        $self->graph->colour_scheme( $self->config->get('Graph', 'Box', 'ColourScheme') );
        $self->graph->colour_offset( $self->config->get('Graph', 'Box', 'ColourOffset') );
    }

    return $self->graph();
}


# Recursively walks the dataflow rules that leave $parent_analysis_node_name and records,
# in %$subgraph_allocation, which semaphore box (named after a funnel rule's midpoint)
# each reachable node and midpoint belongs to. An undef allocation means "outside of any box".
#
#   $outflow_rules             : hashref, node name => arrayref of dataflow rules leaving that node
#   $dfr_flows_into            : hashref, dataflow rule id => target node name
#                                (not read here; kept so that the call signature stays unchanged)
#   $parent_analysis_node_name : name of the node whose outgoing rules are examined
#   $subgraph_allocation       : hashref that is filled in place
#
# Nodes are allocated on a first-come basis: a node that already has an entry keeps it
# and is not descended into again, which also ends the recursion on cycles.
sub _allocate_to_subgraph {
    my ($self, $outflow_rules, $dfr_flows_into, $parent_analysis_node_name, $subgraph_allocation ) = @_;

    my $parent_allocation = $subgraph_allocation->{ $parent_analysis_node_name };  # for some analyses it will be undef
    my $duplicate_tables  = $self->config()->get('Graph', 'DuplicateTables');      # constant for the whole walk

    foreach my $rule ( @{ $outflow_rules->{$parent_analysis_node_name} } ) {
        my $to_analysis = $rule->to_analysis();

        my $this_analysis_node_name;
        if ($to_analysis->can('dbID')) {
            $this_analysis_node_name = _analysis_node_name( $to_analysis->dbID() );
        } elsif ($duplicate_tables) {
                # every source analysis gets its own copy of the table node
            $this_analysis_node_name = $to_analysis->table_name() . '_' . $rule->from_analysis_id();
        } else {
            next;   # a table node shared by several sources is not allocated to any box
        }

        my $funnel_dataflow_rule_id     = $rule->funnel_dataflow_rule_id();

        my $proposed_allocation = $funnel_dataflow_rule_id  # depends on whether we start a new semaphore
            ? _midpoint_name( $funnel_dataflow_rule_id )        # if we do, report to the new funnel (based on common funnel rule's midpoint)
            : $parent_allocation;                               # if we don't, inherit the parent's funnel

        if($funnel_dataflow_rule_id) {
            my $fan_midpoint_name = _midpoint_name( $rule->dbID() );
            $subgraph_allocation->{ $fan_midpoint_name } = $proposed_allocation;

            my $funnel_midpoint_name = _midpoint_name( $funnel_dataflow_rule_id );
            $subgraph_allocation->{ $funnel_midpoint_name } = $parent_allocation;   # draw the funnel's midpoint outside of the box
        }

            # we allocate on first-come basis at the moment: an existing entry wins, whatever this branch proposes
        next if exists $subgraph_allocation->{ $this_analysis_node_name };

        $subgraph_allocation->{ $this_analysis_node_name } = $proposed_allocation;

        $self->_allocate_to_subgraph( $outflow_rules, $dfr_flows_into, $this_analysis_node_name, $subgraph_allocation );
    }
}

# Adds a plain-text 'Details' node labelled "<dbname>@<host>" to the graph,
# but only when the 'Graph' / 'DisplayDetails' configuration option is set.
sub _add_hive_details {
  my ($self) = @_;

  return unless $self->config->get('Graph', 'DisplayDetails');

  # The font is only needed once we know that the node will be drawn.
  my $node_fontname  = $self->config->get('Graph', 'Node', 'Details', 'Font');

  my $dbc   = $self->dba()->dbc();
  my $label = sprintf('%s@%s', $dbc->dbname, $dbc->host || '-');   # '-' stands in for an empty host

  $self->graph()->add_node( 'Details',
    label     => $label,
    fontname  => $node_fontname,
    shape     => 'plaintext',
  );

  return;
}

# Adds one filled node for the given analysis object.
# The label is "<logic_name> (<dbID>)" followed by the job count breakout from the
# analysis' stats; the fill colour and font are looked up in the configuration
# by the stats' status, and the shape depends on can_be_empty().
sub _add_analysis_node {
  my ($self, $analysis) = @_;    # not '$a': that would mask sort's package variable

  my $stats = $analysis->stats();

  # NB: '\n' is single-quoted on purpose, so the two characters are passed through to the graph as-is.
  my $analysis_label    = $analysis->logic_name().' ('.$analysis->dbID().')\n'.$stats->job_count_breakout();
  my $shape             = $analysis->can_be_empty() ? 'doubleoctagon' : 'ellipse' ;
  my $status_colour     = $self->config->get('Graph', 'Node', $stats->status, 'Colour');
  my $node_fontname     = $self->config->get('Graph', 'Node', $stats->status, 'Font');

  $self->graph->add_node( _analysis_node_name( $analysis->dbID() ),
    label       => $analysis_label,
    shape       => $shape,
    style       => 'filled',
    fontname    => $node_fontname,
    fillcolor   => $status_colour,
  );
}


# Draws one 'tee'-headed edge per control rule, from the condition analysis
# to the controlled analysis, in the configured control-edge colour.
#
#   $all_ctrl_rules : arrayref of control rule objects
sub _control_rules {
  my ($self, $all_ctrl_rules) = @_;

  my $control_colour = $self->config->get('Graph', 'Edge', 'Control', 'Colour');
  my $graph = $self->graph();

  #The control rules are always from and to an analysis so no need to search for odd cases here
  foreach my $rule ( @$all_ctrl_rules ) {
    my ($from, $to) = ( _analysis_node_name( $rule->condition_analysis()->dbID() ), _analysis_node_name( $rule->ctrled_analysis()->dbID() ) );
    $graph->add_edge( $from => $to,
      color => $control_colour,
      arrowhead => 'tee',
    );
  }
}

# Draws the dataflow rules as edges labelled with '#<branch_code>'.
#
#   $all_dataflow_rules : arrayref of dataflow rule objects
#
# A rule that takes part in a semaphore (it either names a funnel rule or is named
# as one) is drawn as a two-part arrow through a tiny midpoint node; the fan rule's
# midpoint is additionally linked to the funnel rule's midpoint by a dashed edge.
# All other rules are drawn as a single edge. Table targets get their node added here.
sub _dataflow_rules {
    my ($self, $all_dataflow_rules) = @_;

    my $graph = $self->graph();
    my $dataflow_colour  = $self->config->get('Graph', 'Edge', 'Data', 'Colour');
    my $semablock_colour = $self->config->get('Graph', 'Edge', 'Semablock', 'Colour');
    my $df_edge_fontname    = $self->config->get('Graph', 'Edge', 'Data', 'Font');

        # First pass: find every rule that has to be drawn through a midpoint
        # (both ends of each fan->funnel relationship).
    my %needs_a_midpoint = ();
    foreach my $rule ( @$all_dataflow_rules ) {
        if(my $funnel_dataflow_rule_id = $rule->funnel_dataflow_rule_id()) {
            $needs_a_midpoint{$rule->dbID()}++;
            $needs_a_midpoint{$funnel_dataflow_rule_id}++;
        }
    }

        # Second pass: draw the rules.
    foreach my $rule ( @$all_dataflow_rules ) {

        my ($rule_id, $from_analysis_id, $branch_code, $funnel_dataflow_rule_id, $to) =
            ($rule->dbID(), $rule->from_analysis_id(), $rule->branch_code(), $rule->funnel_dataflow_rule_id(), $rule->to_analysis());
        my $from_node = _analysis_node_name($from_analysis_id);
        my $to_node;

            # Different treatment for analyses and tables:
        if(check_ref($to, 'Bio::EnsEMBL::Hive::Analysis')) {
            $to_node = _analysis_node_name($to->dbID());
        } elsif(check_ref($to, 'Bio::EnsEMBL::Hive::NakedTable')) {
            $to_node = $to->table_name();
            $to_node .= '_'.$from_analysis_id if $self->config->get('Graph', 'DuplicateTables');
            $self->_add_table_node($to_node);
        } else {
            warn('Do not know how to handle the type '.ref($to));
            next;
        }

        if($needs_a_midpoint{$rule_id}) {
            my $midpoint_name = _midpoint_name($rule_id);

            $graph->add_node( $midpoint_name,   # midpoint itself
                color       => $dataflow_colour,
                label       => '',
                shape       => 'point',
                fixedsize   => 1,
                width       => 0.01,
                height      => 0.01,
            );
            $graph->add_edge( $from_node => $midpoint_name, # first half of the two-part arrow
                color       => $dataflow_colour,
                arrowhead   => 'none',
                label       => '#'.$branch_code,
                fontname    => $df_edge_fontname,
            );
            $graph->add_edge( $midpoint_name => $to_node,   # second half of the two-part arrow
                color     => $dataflow_colour,
            );
            if($funnel_dataflow_rule_id) {
                $graph->add_edge( $midpoint_name => _midpoint_name($funnel_dataflow_rule_id),   # semaphore inter-rule link
                    color     => $semablock_colour,
                    style     => 'dashed',
                    arrowhead => 'tee',
                    dir       => 'both',
                    arrowtail => 'crow',
                );
            }
        } else {
                # one-part arrow:
            $graph->add_edge( $from_node => $to_node,
                color       => $dataflow_colour,
                label       => '#'.$branch_code,
                fontname    => $df_edge_fontname,
            );
        } # /if($needs_a_midpoint{$rule_id})
    } # /foreach my $rule (@$all_dataflow_rules)

}

# Adds a 'tab'-shaped node for a table.
#
#   $table : the node name. With the 'Graph' / 'DuplicateTables' option set, the caller
#            in this file appends '_<from_analysis_id>' to the table name; that last
#            '_'-separated component is removed again to produce the visible label.
sub _add_table_node {
  my ($self, $table) = @_;

  # Fetched into scalars first, so that the getter can never inject extra elements into the argument list below.
  my $node_fontname    = $self->config->get('Graph', 'Node', 'Table', 'Font');
  my $node_colour      = $self->config->get('Graph', 'Node', 'Table', 'Colour');

  my $table_name = $table;
  # Use the capture only when the match succeeded: after a failed match $1 still holds
  # whatever an earlier successful match left behind.
  if ($self->config->get('Graph', 'DuplicateTables') and $table =~ /^(.*)_[^_]*$/) {
    $table_name = $1;
  }

  $self->graph()->add_node( $table,
    label => $table_name.'\n',
    shape => 'tab',
    fontname => $node_fontname,
    color => $node_colour,
  );
}

1;