Graph.pm 14 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
package Bio::EnsEMBL::Hive::Utils::Graph;

=head1 NAME

Bio::EnsEMBL::Hive::Utils::Graph

=head1 SYNOPSIS

  my $dba = get_hive_dba();
  my $g = Bio::EnsEMBL::Hive::Utils::Graph->new($dba);
  my $graphviz = $g->build();
  $graphviz->as_png('location.png');

=head1 DESCRIPTION

This is a module for converting a hive database's flow of analyses, control 
rules and dataflows into the GraphViz model language. This information can
then be converted to an image or to the dot language for further manipulation
in GraphViz.

=head1 METHODS/SUBROUTINES

See inline

=head1 AUTHOR

$Author: mm14 $

=head1 VERSION

$Revision: 1.19 $

=cut

use strict;
use warnings;

use Bio::EnsEMBL::Utils::Scalar qw(check_ref assert_ref);

40
use Bio::EnsEMBL::Hive::Utils::GraphViz;
41
use Bio::EnsEMBL::Hive::Utils::Config;
42

43
44
45

=head2 new()

  Arg [1] : Bio::EnsEMBL::Hive::DBSQL::DBAdaptor $dba;
              The adaptor to get information from
  Arg [2] : (optional) string $config_file_name;
              A JSON file name to initialize the Config object with.
              If one is not given then nothing is passed into Config's constructor,
              which results in loading configuration from Config's standard locations.
  Returntype : Graph object
  Exceptions : If the parameters are not as required
  Status     : Beta
  
=cut

sub new {
  my ($class, $dba, $config_file_name) = @_;

  # Works both as a class method and as an instance method (in which case
  # the new object is blessed into the invocant's class).
  my $package = ref($class) || $class;
  my $self    = bless {}, $package;

  $self->dba($dba);

  # Forward the file name only when one was supplied; otherwise the Config
  # constructor is called without arguments.
  my @config_args = $config_file_name ? ($config_file_name) : ();
  $self->config( Bio::EnsEMBL::Hive::Utils::Config->new(@config_args) );

  return $self;
}


=head2 graph()

  Args       : None
  Description: Returns the GraphViz instance used by this module, creating it on first access
  Returntype : GraphViz
  Exceptions : None
  Status     : Beta

=cut

sub graph {
  my ($self) = @_;

  # Already built (the key merely has to exist): hand back the cached object.
  return $self->{graph} if exists $self->{graph};

  # First access: create the GraphViz wrapper and remember it on the object.
  my $graphviz = Bio::EnsEMBL::Hive::Utils::GraphViz->new( name => 'AnalysisWorkflow', ratio => 'compress' );
  $self->{graph} = $graphviz;

  return $self->{graph};
}


=head2 dba()

  Arg [1] : The DBAdaptor instance
  Returntype : DBAdaptor
  Exceptions : If the given object is not a hive DBAdaptor
  Status     : Beta

=cut

sub dba {
  my ($self, $dba) = @_;

  # Getter mode: nothing (or undef) was passed, so just report what is stored.
  return $self->{dba} unless defined $dba;

  # Setter mode: type-check the adaptor via assert_ref before storing it.
  assert_ref($dba, 'Bio::EnsEMBL::Hive::DBSQL::DBAdaptor');
  $self->{dba} = $dba;

  return $self->{dba};
}


=head2 config()

  Arg [1] : (optional) The graph configuration object
  Returntype : Bio::EnsEMBL::Hive::Utils::Config
  Exceptions : If the object given is not of the required type
  Status     : Beta

=cut

sub config {
  my ($self, $config) = @_;

  # Getter mode: nothing (or undef) was passed, so just report what is stored.
  return $self->{config} unless defined $config;

  # Setter mode: type-check the configuration object via assert_ref before storing it.
  assert_ref($config, 'Bio::EnsEMBL::Hive::Utils::Config');
  $self->{config} = $config;

  return $self->{config};
}

126
127
128
129
130
131
132
133
134
135
136
137
138

# Builds the graph node name for an analysis from its id: 'analysis_<id>'.
sub _analysis_node_name {
    my ($analysis_id) = @_;

    return "analysis_${analysis_id}";
}

# Builds the graph node name for the midpoint of a dataflow rule: 'dfr_<rule_id>_mp'.
sub _midpoint_name {
    my ($rule_id) = @_;

    return "dfr_${rule_id}_mp";
}

139
140
141
142
143
144
145
146
147
148
149

=head2 build()

  Returntype : The GraphViz object built & populated
  Exceptions : Raised if there are issues with accessing the database
  Description : Builds the graph object and returns it.
  Status     : Beta

=cut

sub build {
    my ($self) = @_;

    # Fetch everything that will be drawn: analyses, control rules and dataflow rules.
    my $dba                 = $self->dba();
    my $all_analyses        = $dba->get_AnalysisAdaptor()->fetch_all();
    my $all_ctrl_rules      = $dba->get_AnalysisCtrlRuleAdaptor()->fetch_all();
    my $all_dataflow_rules  = $dba->get_DataflowRuleAdaptor()->fetch_all();

    my %inflow_count    = ();   # used to detect sources (nodes with zero inflow)
    my %outflow_rules   = ();   # maps from analysis_node_name to a list of all dataflow rules that flow out of it
    my %dfr_flows_into  = ();   # maps from dfr_id to target analysis_node_name

    foreach my $rule ( @$all_dataflow_rules ) {
        my $target = $rule->to_analysis;

        # Only targets that have a dbID method (and a true dbID) count as inflow.
        if( $target->can('dbID') ) {
            if( my $to_id = $target->dbID() ) {
                my $to_node_name = _analysis_node_name( $to_id );
                $inflow_count{$to_node_name}++;
                $dfr_flows_into{ $rule->dbID() } = $to_node_name;
            }
        }

        my $from_node_name = _analysis_node_name( $rule->from_analysis_id() );
        push @{ $outflow_rules{$from_node_name} }, $rule;
    }

    my %subgraph_allocation = ();

        # NB: this is a very approximate algorithm with rough edges!
        # It will not find all start nodes in cyclic components!
    foreach my $analysis ( @$all_analyses ) {
        my $start_node_name = _analysis_node_name( $analysis->dbID );
        next if $inflow_count{$start_node_name};

        # run the recursion in each component that has a non-cyclic start
        _allocate_to_subgraph(\%outflow_rules, \%dfr_flows_into, $start_node_name, \%subgraph_allocation, $self->config );
    }

    # Draw the nodes first, then both kinds of edges.
    $self->_add_hive_details();
    foreach my $analysis ( @$all_analyses ) {
        $self->_add_analysis_node($analysis);
    }
    $self->_control_rules( $all_ctrl_rules );
    $self->_dataflow_rules( $all_dataflow_rules );

    if( $self->config->get('Graph', 'DisplayStretched') ) {
        # Link every allocated node to the node it was allocated to with an invisible edge.
        foreach my $member ( keys %subgraph_allocation ) {
            my $allocated_to = $subgraph_allocation{$member};
            next unless $allocated_to;

            $self->graph->add_edge( $member => $allocated_to,
                color     => 'black',
                style     => 'invis',   # toggle visibility by changing 'invis' to 'dashed'
            );
        }
    }

    if( $self->config->get('Graph', 'DisplaySemaphoreBoxes') ) {
        my $graph = $self->graph;

        $graph->subgraphs( \%subgraph_allocation );
        $graph->colour_scheme( $self->config->get('Graph', 'Box', 'ColourScheme') );
        $graph->colour_offset( $self->config->get('Graph', 'Box', 'ColourOffset') );
    }

    return $self->graph();
}


# Recursively walks the dataflow rules leaving $parent_analysis_node_name and records,
# in %$subgraph_allocation, which node each reachable node and each semaphore midpoint
# is allocated to (undef when there is none).
#
#   $outflow_rules             : hashref, node name => arrayref of outgoing dataflow rules
#   $dfr_flows_into            : hashref, dataflow rule id => target node name (not read here)
#   $parent_analysis_node_name : name of the node whose outgoing rules are examined
#   $subgraph_allocation       : hashref filled in place, node name => allocation
#   $config                    : configuration object, queried for ('Graph', 'DuplicateTables')
sub _allocate_to_subgraph {
    my ( $outflow_rules, $dfr_flows_into, $parent_analysis_node_name, $subgraph_allocation, $config ) = @_;

    my $parent_box = $subgraph_allocation->{ $parent_analysis_node_name };  # for some analyses it will be undef

    RULE:
    foreach my $rule ( @{ $outflow_rules->{$parent_analysis_node_name} } ) {
        my $target = $rule->to_analysis();

        # Work out the node name of the rule's target.
        my $child_node_name;
        if( $target->can('dbID') ) {
            $child_node_name = _analysis_node_name( $target->dbID() );
        } else {
            # Targets without a dbID are named after their table, and are only
            # followed when every (table, source analysis) pair gets its own node.
            next RULE unless $config->get('Graph', 'DuplicateTables');
            $child_node_name = $target->table_name() . '_' . $rule->from_analysis_id();
        }

        my $funnel_rule_id = $rule->funnel_dataflow_rule_id();

        # Without a funnel the child inherits the parent's allocation;
        # with one, it reports to the funnel rule's midpoint instead.
        my $child_box = $parent_box;
        if( $funnel_rule_id ) {
            my $funnel_midpoint = _midpoint_name( $funnel_rule_id );
            $child_box = $funnel_midpoint;

            $subgraph_allocation->{ _midpoint_name( $rule->dbID() ) } = $child_box;
            $subgraph_allocation->{ $funnel_midpoint }                = $parent_box;   # draw the funnel's midpoint outside of the box
        }

        # We allocate on first-come basis at the moment: a node that already has an
        # entry keeps it, and is not descended into again.
        next RULE if exists $subgraph_allocation->{ $child_node_name };

        $subgraph_allocation->{ $child_node_name } = $child_box;
        _allocate_to_subgraph( $outflow_rules, $dfr_flows_into, $child_node_name, $subgraph_allocation, $config );
    }
}

257

258
259
# Adds a plain-text 'Details' node labelled '<dbname>@<host>' ('-' when the host is
# false) to the graph, but only when the ('Graph', 'DisplayDetails') setting is true.
sub _add_hive_details {
  my ($self) = @_;

  my $details_font = $self->config->get('Graph', 'Node', 'Details', 'Font');

  if( $self->config->get('Graph', 'DisplayDetails') ) {
    my $dbc     = $self->dba()->dbc();
    my $db_name = $dbc->dbname;
    my $db_host = $dbc->host || '-';

    $self->graph()->add_node( 'Details',
      label     => "$db_name\@$db_host",
      fontname  => $details_font,
      shape     => 'plaintext',
    );
  }
}

274

275
276
277
278
279
280
281
# Adds one node to the graph for the given analysis.
#
# The label shows the logic name, the dbID and the job counters as
# 'done+remaining=total'.  Shape is 'doubleoctagon' when the stats object
# supports can_be_empty() and it returns true, 'ellipse' otherwise.  Fill colour
# and font are looked up in the configuration under the analysis' current status.
#
#   Arg [1] : the analysis object to draw (must provide dbID, logic_name and stats)
sub _add_analysis_node {
  my ($self, $analysis) = @_;   # deliberately not called $a, which would mask sort's special variable

  my $graph = $self->graph();

  # Fetch the stats object once, so that every value shown on the node comes
  # from the same object instead of from seven separate stats() calls.
  my $stats  = $analysis->stats();
  my $status = $stats->status;

  #Check we can invoke it & then check if it was able to be empty
  my $can_be_empty = $stats->can('can_be_empty') && $stats->can_be_empty();
  my $shape = ($can_be_empty) ? 'doubleoctagon' : 'ellipse' ;

  my $status_colour = $self->config->get('Graph', 'Node', $status, 'Colour');
  my $node_fontname = $self->config->get('Graph', 'Node', $status, 'Font');

  my $analysis_id = $analysis->dbID();
  my $job_counts  = $stats->done_job_count().'+'.$stats->remaining_job_count().'='.$stats->total_job_count();

  $graph->add_node( _analysis_node_name( $analysis_id ),
    # '\n' is single-quoted on purpose: the literal backslash-n is passed through to the graph label
    label       => $analysis->logic_name().' ('.$analysis_id.')\n'.$job_counts,
    shape       => $shape,
    style       => 'filled',
    fontname    => $node_fontname,
    fillcolor   => $status_colour,
  );
}


# Draws one edge per control rule, from the condition analysis to the controlled
# analysis, using the configured control-edge colour and a 'tee' arrowhead.
#
#   Arg [1] : arrayref of control rule objects
sub _control_rules {
  my ($self, $all_ctrl_rules) = @_;

  my $graph       = $self->graph();
  my $edge_colour = $self->config->get('Graph', 'Edge', 'Control', 'Colour');

  #The control rules are always from and to an analysis so no need to search for odd cases here
  foreach my $ctrl_rule ( @$all_ctrl_rules ) {
    my $condition_node = _analysis_node_name( $ctrl_rule->condition_analysis()->dbID() );
    my $ctrled_node    = _analysis_node_name( $ctrl_rule->ctrled_analysis()->dbID() );

    $graph->add_edge( $condition_node => $ctrled_node,
      color     => $edge_colour,
      arrowhead => 'tee',
    );
  }
}

312

313
# Draws the edges (and, where needed, the midpoint and table nodes) for all
# dataflow rules.
#
# A rule that takes part in a semaphore - i.e. it either has a funnel rule or is
# itself the funnel of another rule - is drawn as a two-part arrow through a tiny
# 'midpoint' node, and fan rules get an extra dashed link from their midpoint to
# the funnel's midpoint.  Every other rule is drawn as a single labelled arrow.
#
#   Arg [1] : arrayref of dataflow rule objects
sub _dataflow_rules {
    my ($self, $all_dataflow_rules) = @_;

    my $graph = $self->graph();

    my $dataflow_colour     = $self->config->get('Graph', 'Edge', 'Data', 'Colour');
    my $semablock_colour    = $self->config->get('Graph', 'Edge', 'Semablock', 'Colour');
    my $df_edge_fontname    = $self->config->get('Graph', 'Edge', 'Data', 'Font');

    # First pass: mark both ends of every semaphore (the fan rule and its funnel rule).
    my %needs_a_midpoint = ();
    foreach my $rule ( @$all_dataflow_rules ) {
        if(my $funnel_dataflow_rule_id = $rule->funnel_dataflow_rule_id()) {
            $needs_a_midpoint{$rule->dbID()}++;
            $needs_a_midpoint{$funnel_dataflow_rule_id}++;
        }
    }

    # Second pass: draw each rule.
    foreach my $rule ( @$all_dataflow_rules ) {

        my ($rule_id, $from_analysis_id, $branch_code, $funnel_dataflow_rule_id, $to) =
            ($rule->dbID(), $rule->from_analysis_id(), $rule->branch_code(), $rule->funnel_dataflow_rule_id(), $rule->to_analysis());

        my $from_node = _analysis_node_name($from_analysis_id);
        my $to_node;

            # Different treatment for analyses and tables:
        if(check_ref($to, 'Bio::EnsEMBL::Analysis')) {
            $to_node = _analysis_node_name( $to->dbID() );
        } elsif(check_ref($to, 'Bio::EnsEMBL::Hive::NakedTable')) {
            $to_node = $to->table_name();
            # with DuplicateTables on, each source analysis gets its own copy of the table node
            $to_node .= '_'.$from_analysis_id if $self->config->get('Graph', 'DuplicateTables');

            $self->_add_table_node($to_node);
        } else {
            warn('Do not know how to handle the type '.ref($to));
            next;
        }

        if($needs_a_midpoint{$rule_id}) {
            my $midpoint_name = _midpoint_name($rule_id);

            $graph->add_node( $midpoint_name,   # midpoint itself
                color       => $dataflow_colour,
                label       => '',
                shape       => 'point',
                fixedsize   => 1,
                width       => 0.01,
                height      => 0.01,
            );
            $graph->add_edge( $from_node => $midpoint_name, # first half of the two-part arrow
                color       => $dataflow_colour,
                arrowhead   => 'none',
                label       => '#'.$branch_code,
                fontname    => $df_edge_fontname,
            );
            $graph->add_edge( $midpoint_name => $to_node,   # second half of the two-part arrow
                color     => $dataflow_colour,
            );
            if($funnel_dataflow_rule_id) {
                $graph->add_edge( $midpoint_name => _midpoint_name($funnel_dataflow_rule_id),   # semaphore inter-rule link
                    color     => $semablock_colour,
                    style     => 'dashed',
                    arrowhead => 'tee',
                    dir       => 'both',
                    arrowtail => 'crow',
                );
            }
        } else {
                # one-part arrow:
            $graph->add_edge( $from_node => $to_node,
                color       => $dataflow_colour,
                label       => '#'.$branch_code,
                fontname    => $df_edge_fontname,
            );
        } # /if($needs_a_midpoint{$rule_id})
    } # /foreach my $rule (@$all_dataflow_rules)

}

395

396
397
# Adds a 'tab'-shaped node for a table to the graph.
#
#   Arg [1] : the node name of the table.  When the ('Graph', 'DuplicateTables')
#             setting is true and the name contains an underscore, everything from
#             the last underscore onwards is dropped from the displayed label
#             (the node name itself is left untouched).
sub _add_table_node {
  my ($self, $table) = @_;

  my $node_fontname = $self->config->get('Graph', 'Node', 'Table', 'Font');

  # Only trust $1 when the match actually succeeded; a name without an
  # underscore keeps its full name as the label instead of picking up a stale
  # or undefined capture.
  my $table_name = $table;
  if( $self->config->get('Graph', 'DuplicateTables') && $table =~ /^(.*)_[^_]*$/ ) {
    $table_name = $1;
  }

  # Fetched into a scalar so that exactly one value ends up in the attribute list below.
  my $node_colour = $self->config->get('Graph', 'Node', 'Table', 'Colour');

  $self->graph()->add_node( $table,
    label    => $table_name.'\n',   # single-quoted: the literal backslash-n is passed through to the graph label
    shape    => 'tab',
    fontname => $node_fontname,
    color    => $node_colour,
  );
}

Leo Gordon's avatar
Leo Gordon committed
415
1;