Graph.pm 21.1 KB
Newer Older
1
=pod
2 3 4

=head1 NAME

5
    Bio::EnsEMBL::Hive::Utils::Graph
6 7 8

=head1 SYNOPSIS

9 10 11 12
    my $dba = get_hive_dba();
    my $g = Bio::EnsEMBL::Hive::Utils::Graph->new($dba);    # arguments are positional; an optional JSON config file name may follow
    my $graphviz = $g->build();
    $graphviz->as_png('location.png');
13 14 15

=head1 DESCRIPTION

16 17 18 19 20 21 22
    This is a module for converting a hive database's flow of analyses, control 
    rules and dataflows into the GraphViz model language. This information can
    then be converted to an image or to the dot language for further manipulation
    in GraphViz.

=head1 LICENSE

23
    Copyright [1999-2014] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
24 25 26 27 28 29 30 31 32

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License
    is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and limitations under the License.
33

34
=head1 CONTACT
35

36
  Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates
37 38 39 40 41

=head1 APPENDIX

    The rest of the documentation details each of the object methods.
    Internal methods are usually preceded with a _
42 43 44

=cut

45 46 47

package Bio::EnsEMBL::Hive::Utils::Graph;

48 49 50
use strict;
use warnings;

51
use Bio::EnsEMBL::Hive::Utils::GraphViz;
52
use Bio::EnsEMBL::Hive::Utils::Collection;
53
use Bio::EnsEMBL::Hive::Utils::Config;
54

55 56
use base ('Bio::EnsEMBL::Hive::Configurable');

57 58 59

=head2 new()

60 61 62 63 64 65
  Arg [1] : Bio::EnsEMBL::Hive::DBSQL::DBAdaptor $dba;
              The adaptor to get information from
  Arg [2] : (optional) string $config_file_name;
                  A JSON file name to initialize the Config object with.
                  If one is not given then we don't pass anything into Config's constructor,
                  which results in loading configuration from Config's standard locations.
66 67 68 69 70 71 72
  Returntype : Graph object
  Exceptions : If the parameters are not as required
  Status     : Beta
  
=cut

sub new {
  my $class            = shift @_;
  my $dba              = shift @_;
  my $config_file_name = shift @_;

      # works both as a class method and as an instance method (the class of the instance is re-used)
  my $self = bless {}, ( ref($class) || $class );

  $self->dba( $dba );

      # when no file name was given, Config's constructor is called with an empty argument list
      # (not with an undef), so that it loads the configuration from its standard locations
  my @config_args = $config_file_name ? ( $config_file_name ) : ();
  my $config      = Bio::EnsEMBL::Hive::Utils::Config->new( @config_args );
  $self->config( $config );

      # all subsequent config_get() calls of this object are made with the 'Graph' context
  $self->context( [ 'Graph' ] );

  return $self;
}


=head2 graph()

  Arg [1] : none (the GraphViz instance is created on the first call and cached in the object)
  Returntype : GraphViz
  Exceptions : None
  Status     : Beta

=cut

sub graph {
    my $self = shift @_;

        # the GraphViz object is built only once, on first access, and then kept in $self->{graph}
    unless( exists $self->{graph} ) {
        my $padding = $self->config_get('Pad') || 0;

            # "injection hack": the value closes its own quote and appends a 'pad = ...' assignment,
            # presumably to smuggle the padding into the generated header through the 'ratio' attribute
        my $ratio_with_pad = qq{compress"; pad = "$padding};

        $self->{graph} = Bio::EnsEMBL::Hive::Utils::GraphViz->new( name => 'AnalysisWorkflow', ratio => $ratio_with_pad );
    }

    return $self->{graph};
}


=head2 dba()

  Arg [1] : The DBAdaptor instance
  Returntype : DBAdaptor
  Exceptions : None (the given object is stored without any type checking)
  Status     : Beta

=cut

sub dba {
    my ($self, @new_value) = @_;

        # setter mode is triggered by the presence of an argument, so an explicit undef does overwrite the stored value
    $self->{dba} = $new_value[0] if @new_value;

    return $self->{dba};
}


126
sub _analysis_node_name {
127
    my ($analysis) = @_;
128

129
    return 'analysis_' . $analysis->logic_name;
130 131
}

132

133
sub _table_node_name {
134
    my ($self, $df_rule) = @_;
135

136 137
    return 'table_' . $df_rule->to_analysis->table_name .
                ($self->config_get('DuplicateTables') ?  '_'.$df_rule->from_analysis->logic_name : '');
138 139
}

140

141
sub _midpoint_name {
142
    my ($df_rule) = @_;
143

144 145 146 147 148
    if(scalar($df_rule)=~/\((\w+)\)/) {     # a unique id of a df_rule assuming dbIDs are not available
        return 'dfr_'.$1.'_mp';
    } else {
        die "Wrong argument to _midpoint_name";
    }
149 150
}

151 152 153 154 155 156 157 158 159 160 161

=head2 build()

  Returntype : The GraphViz object built & populated
  Exceptions : Raised if there are issues with accessing the database
  Description : Builds the graph object and returns it.
  Status     : Beta

=cut

sub build {
    my $self = shift @_;

    my $dba = $self->dba();

        # load everything up front, so that the objects can be linked to each other in memory:
    my $analyses        = Bio::EnsEMBL::Hive::Utils::Collection->new( $dba->get_AnalysisAdaptor()->fetch_all );
    my $control_rules   = Bio::EnsEMBL::Hive::Utils::Collection->new( $dba->get_AnalysisCtrlRuleAdaptor()->fetch_all );
    my $dataflow_rules  = Bio::EnsEMBL::Hive::Utils::Collection->new( $dba->get_DataflowRuleAdaptor()->fetch_all );

        # link each control rule to the analysis it controls (both ways):
    foreach my $c_rule ( $control_rules->list ) {
        my $ctrled_analysis = $analyses->find_one_by('dbID', $c_rule->ctrled_analysis_id );

        $c_rule->ctrled_analysis( $ctrled_analysis );
        push @{ $ctrled_analysis->control_rules_collection }, $c_rule;
    }

        # link each dataflow rule to its source, its target (if local) and its funnel rule (if any):
    foreach my $df_rule ( $dataflow_rules->list ) {
        my $from_analysis = $analyses->find_one_by('dbID', $df_rule->from_analysis_id );

        $df_rule->from_analysis( $from_analysis );
        push @{ $from_analysis->dataflow_rules_collection }, $df_rule;

        my $target_object = $analyses->find_one_by('logic_name', $df_rule->to_analysis_url );
        if( $target_object ) {
            $df_rule->to_analysis( $target_object );

                # count the incoming dataflow, to be able to tell the start nodes later on
            $target_object->{'_inflow_count'}++ if UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Analysis');

        } # otherwise it may be a link out (unsupported at the moment)

        my $funnel_dataflow_rule_id = $df_rule->funnel_dataflow_rule_id;
        if( $funnel_dataflow_rule_id ) {
            my $funnel_dataflow_rule = $dataflow_rules->find_one_by('dbID', $funnel_dataflow_rule_id );

            $funnel_dataflow_rule->{'_is_a_funnel'}++;
            $df_rule->funnel_dataflow_rule( $funnel_dataflow_rule );
        }
    }

        # NB: this is a very approximate algorithm with rough edges!
        # It will not find all start nodes in cyclic components!
    foreach my $source_analysis ( $analyses->list ) {
        next if $source_analysis->{'_inflow_count'};    # only analyses without any dataflow into them are start nodes

            # run the recursion in each component that has a non-cyclic start:
        $self->_propagate_allocation( $source_analysis );
    }

    $self->_add_hive_details() if $self->config_get('DisplayDetails');

        # all the analysis nodes are added first, the rules (edges) of each analysis afterwards:
    foreach my $analysis ( $analyses->list ) {
        $self->_add_analysis_node($analysis);
    }
    foreach my $analysis ( $analyses->list ) {
        $self->_add_control_rules( $analysis->control_rules_collection );
        $self->_add_dataflow_rules( $analysis->dataflow_rules_collection );
    }

    if( $self->config_get('DisplayStretched') ) {    # put each analysis before its funnel midpoint
        foreach my $analysis ( $analyses->list ) {
            my $funnel = $analysis->{'_funnel_dfr'}
                or next;    # this should only affect analyses that have a funnel

            my $analysis_node = _analysis_node_name( $analysis );
            my $midpoint_node = _midpoint_name( $funnel );

            $self->graph->add_edge( $analysis_node => $midpoint_node,
                color     => 'black',
                style     => 'invis',   # toggle visibility by changing 'invis' to 'dashed'
            );
        }
    }

    if( $self->config_get('DisplaySemaphoreBoxes') ) {
        my %cluster_2_nodes = ();   # maps the midpoint name of a funnel ('' for the top level) to the list of node names inside its "box"

        foreach my $analysis ( $analyses->list ) {

            if( my $analysis_funnel = $analysis->{'_funnel_dfr'} ) {
                my $box = _midpoint_name( $analysis_funnel );
                push @{ $cluster_2_nodes{ $box } }, _analysis_node_name( $analysis );
            }

            foreach my $df_rule ( @{ $analysis->dataflow_rules_collection } ) {
                my $rule_funnel = $df_rule->{'_funnel_dfr'};

                if( $df_rule->{'_is_a_funnel'} and ! $rule_funnel ) {

                        # top-level funnels define clusters (top-level "boxes")
                    push @{ $cluster_2_nodes{ '' } }, _midpoint_name( $df_rule );

                } elsif( UNIVERSAL::isa($df_rule->to_analysis, 'Bio::EnsEMBL::Hive::NakedTable') ) {

                        # table belongs to the same "box" as the dataflow source
                    if( my $table_funnel = $df_rule->to_analysis->{'_funnel_dfr'} ) {
                        my $box = _midpoint_name( $table_funnel );
                        push @{ $cluster_2_nodes{ $box } }, $self->_table_node_name( $df_rule );
                    }
                }

                    # midpoints of rules that have a funnel live inside "boxes"
                if( $rule_funnel ) {
                    my $box = _midpoint_name( $rule_funnel );
                    push @{ $cluster_2_nodes{ $box } }, _midpoint_name( $df_rule );
                }
            }
        }

        my $graph = $self->graph;
        $graph->cluster_2_nodes( \%cluster_2_nodes );
        $graph->colour_scheme( $self->config_get('Box', 'ColourScheme') );
        $graph->colour_offset( $self->config_get('Box', 'ColourOffset') );
    }

    return $self->graph();
}


261 262
sub _propagate_allocation {
    my ($self, $source_analysis ) = @_;
263

264
    foreach my $df_rule ( @{ $source_analysis->dataflow_rules_collection } ) {    # this will only work if the analyses objects are ALL cached before loading DFRs
265
        my $target_object       = $df_rule->to_analysis();
266
        my $target_node_name;
267

268
        if(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Analysis')) {
269
            $target_node_name = _analysis_node_name( $target_object );
270
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::NakedTable')) {
271
            $target_node_name = $self->_table_node_name( $df_rule );
272
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Accumulator')) {
273
            next;
274 275 276
        } else {
            warn('Do not know how to handle the type '.ref($target_object));
            next;
277
        }
278

279 280
        my $proposed_funnel_dfr;    # will depend on whether we start a new semaphore

281
        my $funnel_dataflow_rule  = $df_rule->funnel_dataflow_rule();
282 283
        if( $funnel_dataflow_rule ) {   # if there is a new semaphore, the dfrs involved (their midpoints) will also have to be allocated
            $proposed_funnel_dfr = $funnel_dataflow_rule;       # if we do start a new semaphore, report to the new funnel (based on common funnel rule's midpoint)
284

285
            $df_rule->{'_funnel_dfr'} = $proposed_funnel_dfr;
286

287
            $funnel_dataflow_rule->{'_funnel_dfr'} = $source_analysis->{'_funnel_dfr'}; # draw the funnel's midpoint outside of the box
288
        } else {
289
            $proposed_funnel_dfr = $source_analysis->{'_funnel_dfr'} || ''; # if we don't start a new semaphore, inherit the allocation of the source
290
        }
291

292
            # we allocate on first-come basis at the moment:
293
        if( exists $target_object->{'_funnel_dfr'} ) {  # node is already allocated?
294

295 296 297 298
            my $known_funnel_dfr = $target_object->{'_funnel_dfr'};

            if( $known_funnel_dfr eq $proposed_funnel_dfr) {
                # warn "analysis '$target_node_name' has already been allocated to the same '$known_funnel_dfr' by another branch";
299
            } else {
300
                # warn "analysis '$target_node_name' has already been allocated to '$known_funnel_dfr' however this branch would allocate it to '$proposed_funnel_dfr'";
301 302
            }

303
            if($funnel_dataflow_rule) {  # correction for multiple entries into the same box (probably needs re-thinking)
304
                $df_rule->{'_funnel_dfr'} = $target_object->{'_funnel_dfr'};
305 306 307
            }

        } else {
308 309
            # warn "allocating analysis '$target_node_name' to '$proposed_funnel_dfr'";
            $target_object->{'_funnel_dfr'} = $proposed_funnel_dfr;
310

311
            if(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Analysis')) {
312
                $self->_propagate_allocation( $target_object );
313
            }
314 315
        }
    }
316 317
}

318

319
sub _add_hive_details {
320
    my ($self) = @_;
321

322
    my $node_fontname  = $self->config_get('Node', 'Details', 'Font');
323
    my $dbc = $self->dba()->dbc();
324
    my $label = sprintf('%s@%s', $dbc->dbname, $dbc->host || '-');
325
    $self->graph()->add_node( 'Details',
326 327 328
        label     => $label,
        fontname  => $node_fontname,
        shape     => 'plaintext',
329 330 331
    );
}

332

333
sub _add_analysis_node {
334
    my ($self, $analysis) = @_;
335

336
    my $analysis_stats = $analysis->stats();
337

338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
    my ($breakout_label, $total_job_count, $count_hash)   = $analysis_stats->job_count_breakout();
    my $analysis_status                                   = $analysis_stats->status;
    my $analysis_status_colour                            = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Colour');
    my $style                                             = $analysis->can_be_empty() ? 'dashed, filled' : 'filled' ;
    my $node_fontname                                     = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Font');
    my $display_stats                                     = $self->config_get('DisplayStats');

    my $colspan = 0;
    my $bar_chart = '';

    if( $display_stats eq 'barchart' ) {
        foreach my $count_method (qw(SEMAPHORED READY INPROGRESS DONE FAILED)) {
            if(my $count=$count_hash->{lc($count_method).'_job_count'}) {
                $bar_chart .= '<td bgcolor="'.$self->config_get('Node', 'JobStatus', $count_method, 'Colour').'" width="'.int(100*$count/$total_job_count).'%">'.$count.lc(substr($count_method,0,1)).'</td>';
                ++$colspan;
            }
        }
        if($colspan != 1) {
            $bar_chart .= '<td>='.$total_job_count.'</td>';
            ++$colspan;
        }
    }

    $colspan ||= 1;
362
    my $analysis_label  = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1"><tr><td colspan="'.$colspan.'">'.$analysis->logic_name().' ('.$analysis->dbID.')</td></tr>';
363 364 365 366 367 368 369 370 371
    if( $display_stats ) {
        $analysis_label    .= qq{<tr><td colspan="$colspan"> </td></tr>};
        if( $display_stats eq 'barchart') {
            $analysis_label    .= qq{<tr>$bar_chart</tr>};
        } elsif( $display_stats eq 'text') {
            $analysis_label    .= qq{<tr><td colspan="$colspan">$breakout_label</td></tr>};
        }
    }

372
    if( my $job_limit = $self->config_get('DisplayJobs') ) {
373
        my $adaptor = $self->dba->get_AnalysisJobAdaptor();
374 375 376 377 378 379 380
        my @jobs = sort {$a->dbID <=> $b->dbID} @{ $adaptor->fetch_some_by_analysis_id_limit( $analysis->dbID, $job_limit+1 )};

        my $hit_limit;
        if(scalar(@jobs)>$job_limit) {
            pop @jobs;
            $hit_limit = 1;
        }
381 382 383 384 385 386 387 388 389 390 391

        $analysis_label    .= '<tr><td colspan="'.$colspan.'"> </td></tr>';
        foreach my $job (@jobs) {
            my $input_id = $job->input_id;
            my $status   = $job->status;
            my $job_id   = $job->dbID;
            $input_id=~s/\>/&gt;/g;
            $input_id=~s/\</&lt;/g;
            $input_id=~s/\{|\}//g;
            $analysis_label    .= qq{<tr><td colspan="$colspan" bgcolor="}.$self->config_get('Node', 'JobStatus', $status, 'Colour').qq{">$job_id [$status]: $input_id</td></tr>};
        }
392 393 394 395

        if($hit_limit) {
            $analysis_label    .= qq{<tr><td colspan="$colspan">[ and }.($total_job_count-$job_limit).qq{ more ]</td></tr>};
        }
396 397
    }
    $analysis_label    .= '</table>>';
398
  
399
    $self->graph->add_node( _analysis_node_name( $analysis ),
400 401 402 403 404 405
        label       => $analysis_label,
        shape       => 'record',
        fontname    => $node_fontname,
        style       => $style,
        fillcolor   => $analysis_status_colour,
    );
406 407 408
}


409
sub _add_control_rules {
410
  my ($self, $ctrl_rules) = @_;
411
  
412
  my $control_colour = $self->config_get('Edge', 'Control', 'Colour');
413 414
  my $graph = $self->graph();

415
      #The control rules are always from and to an analysis so no need to search for odd cases here
416 417 418
  foreach my $c_rule ( @$ctrl_rules ) {
    my $from_node_name = _analysis_node_name( $c_rule->condition_analysis );
    my $to_node_name   = _analysis_node_name( $c_rule->ctrled_analysis );
419 420

    $graph->add_edge( $from_node_name => $to_node_name,
421
      color => $control_colour,
422
      arrowhead => 'tee',
423 424
    );
  }
425 426
}

427

428
sub _add_dataflow_rules {
429
    my ($self, $dataflow_rules) = @_;
430

431
    my $graph = $self->graph();
432 433 434 435
    my $dataflow_colour     = $self->config_get('Edge', 'Data', 'Colour');
    my $semablock_colour    = $self->config_get('Edge', 'Semablock', 'Colour');
    my $accu_colour         = $self->config_get('Edge', 'Accu', 'Colour');
    my $df_edge_fontname    = $self->config_get('Edge', 'Data', 'Font');
436

437
    foreach my $df_rule ( @$dataflow_rules ) {
438
    
439 440
        my ($from_analysis, $branch_code, $funnel_dataflow_rule, $target_object) =
            ($df_rule->from_analysis, $df_rule->branch_code, $df_rule->funnel_dataflow_rule, $df_rule->to_analysis);
441
        my $from_node_name = _analysis_node_name( $from_analysis );
442
        my $target_node_name;
443
    
444
            # Different treatment for analyses and tables:
445
        if(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Analysis')) {
446

447
            $target_node_name = _analysis_node_name( $target_object );
448

449
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::NakedTable')) {
450

451 452
            $target_node_name = $self->_table_node_name( $df_rule );
            $self->_add_table_node($target_node_name, $target_object->table_name);
453

454
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Accumulator')) {
455 456

            $target_node_name = _midpoint_name( $from_analysis->{'_funnel_dfr'} );
457

458
        } else {
459
            warn('Do not know how to handle the type '.ref($target_object));
460 461 462
            next;
        }

463 464
            # a rule needs a midpoint either if it HAS a funnel or if it IS a funnel
        if( $funnel_dataflow_rule or $df_rule->{'_is_a_funnel'} ) {
465
            my $midpoint_name = _midpoint_name( $df_rule );
466

467
            $graph->add_node( $midpoint_name,   # midpoint itself
468
                color       => $dataflow_colour,
469 470
                label       => '',
                shape       => 'point',
471 472 473
                fixedsize   => 1,
                width       => 0.01,
                height      => 0.01,
474
            );
475
            $graph->add_edge( $from_node_name => $midpoint_name, # first half of the two-part arrow
476
                color       => $dataflow_colour,
477
                arrowhead   => 'none',
478
                fontname    => $df_edge_fontname,
479 480
                fontcolor   => $dataflow_colour,
                label       => '#'.$branch_code,
481
            );
482
            $graph->add_edge( $midpoint_name => $target_node_name,   # second half of the two-part arrow
483
                color     => $dataflow_colour,
484
            );
485
            if($funnel_dataflow_rule) {
486
                $graph->add_edge( $midpoint_name => _midpoint_name( $funnel_dataflow_rule ),   # semaphore inter-rule link
487
                    color     => $semablock_colour,
488 489 490 491 492 493
                    style     => 'dashed',
                    arrowhead => 'tee',
                    dir       => 'both',
                    arrowtail => 'crow',
                );
            }
494
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Accumulator')) {
495
                # one-part dashed arrow:
496
            $graph->add_edge( $from_node_name => $target_node_name,
497 498
                color       => $accu_colour,
                style       => 'dashed',
499
                label       => $target_object->struct_name().'#'.$branch_code,
500 501 502 503 504
                fontname    => $df_edge_fontname,
                fontcolor   => $accu_colour,
                dir         => 'both',
                arrowtail   => 'crow',
            );
505
        } else {
506
                # one-part solid arrow:
507
            $graph->add_edge( $from_node_name => $target_node_name,
508
                color       => $dataflow_colour,
509
                fontname    => $df_edge_fontname,
510 511
                fontcolor   => $dataflow_colour,
                label       => '#'.$branch_code,
512
            );
513
        } # /if( "$df_rule needs a midpoint" )
514
    } # /foreach my $df_rule (@$dataflow_rules)
515

516 517
}

518

519
sub _add_table_node {
520
    my ($self, $table_node_name, $table_name) = @_;
521

522
    my $node_fontname    = $self->config_get('Node', 'Table', 'Font');
523
    my (@column_names, $columns, $table_data, $data_limit, $hit_limit);
524

525
    if( $data_limit = $self->config_get('DisplayData') ) {
526 527
        my $adaptor = $self->dba->get_NakedTableAdaptor();
        $adaptor->table_name( $table_name );
528

529 530
        @column_names = sort keys %{$adaptor->column_set};
        $columns = scalar(@column_names);
531 532 533 534 535 536
        $table_data = $adaptor->fetch_all( 'LIMIT '.($data_limit+1) );

        if(scalar(@$table_data)>$data_limit) {
            pop @$table_data;
            $hit_limit = 1;
        }
537 538 539 540 541 542 543 544 545 546
    }

    my $table_label = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1"><tr><td colspan="'.($columns||1).'">'.$table_name.'</td></tr>';

    if( $self->config_get('DisplayData') ) {
        $table_label .= '<tr><td colspan="'.$columns.'"> </td></tr>';
        $table_label .= '<tr>'.join('', map { qq{<td bgcolor="lightblue" border="1">$_</td>} } @column_names).'</tr>';
        foreach my $row (@$table_data) {
            $table_label .= '<tr>'.join('', map { qq{<td>$_</td>} } @{$row}{@column_names}).'</tr>';
        }
547 548 549
        if($hit_limit) {
            $table_label  .= qq{<tr><td colspan="$columns">[ more data ]</td></tr>};
        }
550 551 552
    }
    $table_label .= '</table>>';

553
    $self->graph()->add_node( $table_node_name, 
554 555 556 557 558
        label => $table_label,
        shape => 'record',
        fontname => $node_fontname,
        color => $self->config_get('Node', 'Table', 'Colour'),
    );
559 560
}

Leo Gordon's avatar
Leo Gordon committed
561
1;