=pod

=head1 NAME

5
    Bio::EnsEMBL::Hive::Utils::Graph
6 7 8

=head1 SYNOPSIS

9 10 11 12
    my $dba = get_hive_dba();
    my $g = Bio::EnsEMBL::Hive::Utils::Graph->new($dba);
    my $graphviz = $g->build();
    $graphviz->as_png('location.png');
13 14 15

=head1 DESCRIPTION

16 17 18 19 20 21 22
    This is a module for converting a hive database's flow of analyses, control 
    rules and dataflows into the GraphViz model language. This information can
    then be converted to an image or to the dot language for further manipulation
    in GraphViz.

=head1 LICENSE

23 24
    Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
    Copyright [2016] EMBL-European Bioinformatics Institute
25 26 27 28 29 30 31 32 33

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License
    is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and limitations under the License.
34

35
=head1 CONTACT
36

37
  Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates
38 39 40 41 42

=head1 APPENDIX

    The rest of the documentation details each of the object methods.
    Internal methods are usually preceded with a _
43 44 45

=cut

46 47 48

package Bio::EnsEMBL::Hive::Utils::Graph;

49 50 51
use strict;
use warnings;

52
use Bio::EnsEMBL::Hive::Utils::GraphViz;
53
use Bio::EnsEMBL::Hive::Utils::Config;
54

55 56
use base ('Bio::EnsEMBL::Hive::Configurable');

57 58 59

=head2 new()

60 61 62 63 64 65
  Arg [1] : Bio::EnsEMBL::Hive::DBSQL::DBAdaptor $dba;
              The adaptor to get information from
  Arg [2] : (optional) string $config_file_name;
                  A JSON file name to initialize the Config object with.
                  If one is not given then we don't pass anything into Config's constructor,
                  which results in loading configuration from Config's standard locations.
66 67 68 69 70 71 72
  Returntype : Graph object
  Exceptions : If the parameters are not as required
  Status     : Beta
  
=cut

sub new {
  my ($class, $dba, $config_file_name) = @_;

  # Works both as a class method and when invoked on an existing instance
  my $package = ref($class) || $class;
  my $self    = bless {}, $package;

  $self->dba($dba);

  # Only hand the file name over when one was given;
  # otherwise Config's constructor is called with an empty argument list
  my @config_args = $config_file_name ? ($config_file_name) : ();
  $self->config( Bio::EnsEMBL::Hive::Utils::Config->new( @config_args ) );

  $self->context( [ 'Graph' ] );

  return $self;
}


=head2 graph()

  Arg [1] : None
  Description : Returns the GraphViz instance used by this module, creating it on first access
  Returntype : GraphViz
  Exceptions : None
  Status     : Beta

=cut

sub graph {
    my ($self) = @_;

    # Create the GraphViz object on first access and keep it on the instance
    unless( exists $self->{graph} ) {
        my $padding = $self->config_get('Pad') || 0;

        # injection hack! The 'ratio' value closes its own quote so that
        # a 'pad' setting travels along with it
        my $ratio_with_pad = qq{compress"; pad = "$padding};

        $self->{graph} = Bio::EnsEMBL::Hive::Utils::GraphViz->new(
            name  => 'AnalysisWorkflow',
            ratio => $ratio_with_pad,
        );
    }

    return $self->{graph};
}


=head2 dba()

  Arg [1] : (optional) The DBAdaptor instance to store
  Returntype : DBAdaptor
  Exceptions : None (the given object is not validated here)
  Status     : Beta

=cut

sub dba {
    my ($self, @new_value) = @_;

    # Behave as a setter whenever an argument was passed (even an undefined one)
    $self->{dba} = $new_value[0] if @new_value;

    return $self->{dba};
}


126 127 128 129 130 131
sub _analysis_node_name {
    my ($analysis_id) = @_;

    # Node name of an analysis: the prefix 'analysis_' followed by the given id
    return "analysis_${analysis_id}";
}

132 133 134 135 136 137
sub _table_node_name {
    my ($table_name) = @_;

    # Node name of a table: the prefix 'table_' followed by the given table name
    return "table_${table_name}";
}

138

139 140 141 142 143 144
sub _midpoint_name {
    my ($rule_id) = @_;

    # Node name of the midpoint drawn on a dataflow rule: 'dfr_<rule_id>_mp'
    return "dfr_${rule_id}_mp";
}

145 146 147 148 149 150 151 152 153 154 155

=head2 build()

  Returntype : The GraphViz object built & populated
  Exceptions : Raised if there are issues with accessing the database
  Description : Builds the graph object and returns it.
  Status     : Beta

=cut

sub build {
    my ($self) = @_;

    my $all_analyses          = $self->dba()->get_AnalysisAdaptor()->fetch_all();
    my $all_ctrl_rules        = $self->dba()->get_AnalysisCtrlRuleAdaptor()->fetch_all();
    my $all_dataflow_rules    = $self->dba()->get_DataflowRuleAdaptor()->fetch_all();

    my %inflow_count = ();          # used to detect sources (nodes with zero inflow)
    my %outflow_rules = ();         # maps from analysis_node_name to a list of all dataflow rules that flow out of it
    my %dfr_flows_into_node = ();   # maps from dfr_id to target analysis_node_name

    foreach my $rule ( @$all_dataflow_rules ) {
        my $target_object = $rule->to_analysis;
            # only targets that provide a (true) dbID are counted as inflow into an analysis node
        if(my $to_id = $target_object->can('dbID') && $target_object->dbID()) {
            my $to_node_name = _analysis_node_name( $to_id );
            $inflow_count{$to_node_name}++;
            $dfr_flows_into_node{$rule->dbID()} = $to_node_name;
        }
        push @{$outflow_rules{ _analysis_node_name($rule->from_analysis_id()) }}, $rule;
    }

    my %subgraph_allocation = ();

        # NB: this is a very approximate algorithm with rough edges!
        # It will not find all start nodes in cyclic components!
    foreach my $source_analysis_node_name ( map { _analysis_node_name( $_->dbID ) } @$all_analyses ) {
        unless($inflow_count{$source_analysis_node_name}) {    # if there is no dataflow into this analysis
            $self->_allocate_to_subgraph(\%outflow_rules, \%dfr_flows_into_node, $source_analysis_node_name, \%subgraph_allocation ); # run the recursion in each component that has a non-cyclic start
        }
    }

    $self->_add_hive_details();
    foreach my $analysis (@$all_analyses) {     # deliberately not called $a, which would shadow sort's package variable
        $self->_add_analysis_node($analysis);
    }
    $self->_control_rules( $all_ctrl_rules );
    $self->_dataflow_rules( $all_dataflow_rules, \%subgraph_allocation );

    if($self->config_get('DisplayStretched') ) {

            # The invisible edges will be linked to the destination analysis instead of the midpoint,
            # so first map the midpoint name of every funnel rule to the node name of the analysis it flows into:
        my %rule_by_id = ();
        $rule_by_id{ $_->dbID() } = $_ foreach @$all_dataflow_rules;

        my %midpoint_to_analysis = ();
        foreach my $funnel_rule_id ( grep { $_ } map { $_->funnel_dataflow_rule_id() } @$all_dataflow_rules ) {
            my $funnel_rule   = $rule_by_id{$funnel_rule_id} or next;   # funnel rule is not among the fetched rules
            my $funnel_target = $funnel_rule->to_analysis()  or next;
                # targets without a dbID cannot be turned into an analysis node name
            if(my $funnel_target_id = $funnel_target->can('dbID') && $funnel_target->dbID()) {
                $midpoint_to_analysis{ _midpoint_name( $funnel_rule_id ) } = _analysis_node_name( $funnel_target_id );
            }
        }

        while( my($from, $to) = each %subgraph_allocation) {
            if($to && $from=~/^analysis/) {
                    # fall back to the allocation itself when it has no known destination analysis
                my $stretch_to = exists $midpoint_to_analysis{$to} ? $midpoint_to_analysis{$to} : $to;

                $self->graph->add_edge( $from => $stretch_to,
                    color     => 'black',
                    style     => 'invis',   # toggle visibility by changing 'invis' to 'dashed'
                );
            }
        }
    }

    if($self->config_get('DisplaySemaphoreBoxes') ) {
        $self->graph->subgraphs( \%subgraph_allocation );
        $self->graph->colour_scheme( $self->config_get('Box', 'ColourScheme') );
        $self->graph->colour_offset( $self->config_get('Box', 'ColourOffset') );
    }

    return $self->graph();
}


sub _allocate_to_subgraph {
    my ($self, $outflow_rules, $dfr_flows_into_node, $source_analysis_node_name, $subgraph_allocation ) = @_;

        # Walks all dataflow rules that leave the given source node, records in %$subgraph_allocation
        # which box (named after a funnel rule's midpoint) each target node and midpoint belongs to,
        # and recurses into every target that had not been allocated before.

    my $source_analysis_allocation = $subgraph_allocation->{ $source_analysis_node_name };  # for some analyses it will be undef

    RULE: foreach my $rule ( @{ $outflow_rules->{$source_analysis_node_name} } ) {
        my $target_object = $rule->to_analysis();
        my $target_node_name;

            # Work out the node name of the target; some kinds of target are not allocated at all:
        if(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Analysis')) {
            $target_node_name = _analysis_node_name( $rule->to_analysis->dbID() );
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::NakedTable')) {
            my $table_suffix = $self->config_get('DuplicateTables')
                ? $rule->from_analysis_id()
                : ($source_analysis_allocation||'');
            $target_node_name = _table_node_name($target_object->table_name()) . '_' . $table_suffix;
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Accumulator')) {
            next RULE;
        } else {
            warn('Do not know how to handle the type '.ref($target_object));
            next RULE;
        }

        my $funnel_dataflow_rule_id = $rule->funnel_dataflow_rule_id();
        my $proposed_allocation;    # will depend on whether we start a new semaphore

        if( $funnel_dataflow_rule_id ) {
                # if we do start a new semaphore, report to the new funnel (based on common funnel rule's midpoint)
            my $funnel_midpoint_name = _midpoint_name( $funnel_dataflow_rule_id );
            $proposed_allocation = $funnel_midpoint_name;

            $subgraph_allocation->{ _midpoint_name( $rule->dbID() ) } = $proposed_allocation;
            $subgraph_allocation->{ $funnel_midpoint_name } = $source_analysis_allocation;   # draw the funnel's midpoint outside of the box
        } else {
                # if we don't start a new semaphore, inherit the allocation of the source
            $proposed_allocation = $source_analysis_allocation;
        }

            # we allocate on first-come basis at the moment:
        unless( exists $subgraph_allocation->{ $target_node_name } ) {   # not allocated yet
            $subgraph_allocation->{ $target_node_name } = $proposed_allocation;

            $self->_allocate_to_subgraph( $outflow_rules, $dfr_flows_into_node, $target_node_name, $subgraph_allocation );
            next RULE;
        }

            # The target has already been allocated by another branch and keeps that allocation.
        if( $funnel_dataflow_rule_id ) {  # correction for multiple entries into the same box (probably needs re-thinking)
            $subgraph_allocation->{ _midpoint_name( $rule->dbID() ) } = $subgraph_allocation->{ $target_node_name };
        }
    }
}

281

282 283
sub _add_hive_details {
  my ($self) = @_;

  my $node_fontname = $self->config_get('Node', 'Details', 'Font');

  # Nothing is drawn unless the 'DisplayDetails' option is switched on
  return unless $self->config_get('DisplayDetails');

  # The node is labelled 'dbname@host', with '-' standing in for a missing host
  my $dbc    = $self->dba()->dbc();
  my $dbname = $dbc->dbname;
  my $host   = $dbc->host || '-';

  $self->graph()->add_node( 'Details',
    label     => sprintf('%s@%s', $dbname, $host),
    fontname  => $node_fontname,
    shape     => 'plaintext',
  );
}

298

299
sub _add_analysis_node {
    my ($self, $analysis) = @_;

        # Adds one 'record' node for the given analysis. Its HTML-like label shows the logic_name and dbID,
        # optionally followed by job statistics ('DisplayStats' set to 'barchart' or 'text')
        # and by a limited list of the analysis' jobs ('DisplayJobs' holds the limit).

    my $analysis_stats = $analysis->stats();

    my ($breakout_label, $total_job_count, $count_hash)   = $analysis_stats->job_count_breakout();
    my $analysis_status                                   = $analysis_stats->status;
    my $analysis_status_colour                            = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Colour');
    my $style                                             = $analysis->can_be_empty() ? 'dashed, filled' : 'filled' ;
    my $node_fontname                                     = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Font');
    my $display_stats                                     = $self->config_get('DisplayStats') || '';    # '' keeps the string comparisons below warning-free when the option is unset

    my $colspan = 0;
    my $bar_chart = '';

    if( $display_stats eq 'barchart' ) {
        foreach my $count_method (qw(SEMAPHORED READY INPROGRESS DONE FAILED)) {
            if(my $count=$count_hash->{lc($count_method).'_job_count'}) {   # only non-zero counts get a cell
                $bar_chart .= '<td bgcolor="'.$self->config_get('Node', 'JobStatus', $count_method, 'Colour').'" width="'.int(100*$count/$total_job_count).'%">'.$count.lc(substr($count_method,0,1)).'</td>';
                ++$colspan;
            }
        }
        if($colspan != 1) {     # a single cell already shows the total, so the '=total' cell would be redundant
            $bar_chart .= '<td>='.$total_job_count.'</td>';
            ++$colspan;
        }
    }

    $colspan ||= 1;
    my $analysis_label  = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1"><tr><td colspan="'.$colspan.'">'.$analysis->logic_name().' ('.$analysis->dbID().')</td></tr>';
    if( $display_stats ) {
        $analysis_label    .= qq{<tr><td colspan="$colspan"> </td></tr>};
        if( $display_stats eq 'barchart') {
            $analysis_label    .= qq{<tr>$bar_chart</tr>};
        } elsif( $display_stats eq 'text') {
            $analysis_label    .= qq{<tr><td colspan="$colspan">$breakout_label</td></tr>};
        }
    }

    if( my $job_limit = $self->config_get('DisplayJobs') ) {
        my $adaptor = $self->dba->get_AnalysisJobAdaptor();
            # ask for one job more than the limit in order to find out whether there is more to show
        my @jobs = sort {$a->dbID <=> $b->dbID} @{ $adaptor->fetch_some_by_analysis_id_limit( $analysis->dbID, $job_limit+1 )};

        my $hit_limit;
        if(scalar(@jobs)>$job_limit) {
            pop @jobs;
            $hit_limit = 1;
        }

        $analysis_label    .= '<tr><td colspan="'.$colspan.'"> </td></tr>';
        foreach my $job (@jobs) {
            my $input_id = $job->input_id;
            my $status   = $job->status;
            my $job_id   = $job->dbID;
            $input_id=~s/\&/&amp;/g;    # has to come first, otherwise the entities produced below would be escaped again
            $input_id=~s/\>/&gt;/g;
            $input_id=~s/\</&lt;/g;
            $input_id=~s/\{|\}//g;
            $analysis_label    .= qq{<tr><td colspan="$colspan" bgcolor="}.$self->config_get('Node', 'JobStatus', $status, 'Colour').qq{">$job_id [$status]: $input_id</td></tr>};
        }

        if($hit_limit) {
            $analysis_label    .= qq{<tr><td colspan="$colspan">[ and }.($total_job_count-$job_limit).qq{ more ]</td></tr>};
        }
    }
    $analysis_label    .= '</table>>';

    $self->graph->add_node( _analysis_node_name( $analysis->dbID() ),
        label       => $analysis_label,
        shape       => 'record',
        fontname    => $node_fontname,
        style       => $style,
        fillcolor   => $analysis_status_colour,
    );
}


sub _control_rules {
  my ($self, $all_ctrl_rules) = @_;

  my $control_colour = $self->config_get('Edge', 'Control', 'Colour');
  my $graph          = $self->graph();

  # Control rules always connect two analyses, so there are no other kinds of endpoints to look out for
  for my $ctrl_rule ( @$all_ctrl_rules ) {
    my $condition_node = _analysis_node_name( $ctrl_rule->condition_analysis()->dbID() );
    my $ctrled_node    = _analysis_node_name( $ctrl_rule->ctrled_analysis()->dbID() );

    # One edge per rule, pointing from the condition analysis to the controlled one
    $graph->add_edge( $condition_node, $ctrled_node,
      color     => $control_colour,
      arrowhead => 'tee',
    );
  }
}

391

392
sub _dataflow_rules {
    my ($self, $all_dataflow_rules, $subgraph_allocation) = @_;

        # Draws one arrow per dataflow rule. Rules that take part in a semaphore (fan rules and their funnels)
        # are drawn as two-part arrows through a midpoint node; fan midpoints are additionally linked to
        # the midpoint of their funnel. Table targets get their own node added on the way.

    my $graph = $self->graph();
    my $dataflow_colour     = $self->config_get('Edge', 'Data', 'Colour');
    my $semablock_colour    = $self->config_get('Edge', 'Semablock', 'Colour');
    my $accu_colour         = $self->config_get('Edge', 'Accu', 'Colour');
    my $df_edge_fontname    = $self->config_get('Edge', 'Data', 'Font');

        # both the fan rule and the funnel rule it refers to are drawn with a midpoint:
    my %needs_a_midpoint = ();
    foreach my $rule ( @$all_dataflow_rules ) {
        if(my $funnel_dataflow_rule_id = $rule->funnel_dataflow_rule_id()) {
            $needs_a_midpoint{$rule->dbID()}++;
            $needs_a_midpoint{$funnel_dataflow_rule_id}++;
        }
    }

    foreach my $rule ( @$all_dataflow_rules ) {

        my ($rule_id, $from_analysis_id, $branch_code, $funnel_dataflow_rule_id, $to) =
            ($rule->dbID(), $rule->from_analysis_id(), $rule->branch_code(), $rule->funnel_dataflow_rule_id(), $rule->to_analysis());
        my $from_node = _analysis_node_name($from_analysis_id);
        my $to_node;

            # Different treatment for analyses and tables:
        if(UNIVERSAL::isa($to, 'Bio::EnsEMBL::Hive::Analysis')) {
            $to_node = _analysis_node_name($to->dbID());
        } elsif(UNIVERSAL::isa($to, 'Bio::EnsEMBL::Hive::NakedTable')) {

            $to_node = _table_node_name($to->table_name) . '_' .
                ( $self->config_get('DuplicateTables') ? $rule->from_analysis_id() : ($subgraph_allocation->{$from_node}||''));

            $self->_add_table_node($to_node, $to->table_name);
        } elsif(UNIVERSAL::isa($to, 'Bio::EnsEMBL::Hive::Accumulator')) {
                # an accumulator is drawn as an arrow into whatever the source node has been allocated to
            $to_node = $subgraph_allocation->{$from_node};

            unless(defined $to_node) {  # without an allocation there is no node to point the arrow at
                warn("Do not know where to draw the accumulator of dataflow rule $rule_id: '$from_node' has not been allocated");
                next;
            }
        } else {
            warn('Do not know how to handle the type '.ref($to));
            next;
        }

        if($needs_a_midpoint{$rule_id}) {
            my $midpoint_name = _midpoint_name($rule_id);

            $graph->add_node( $midpoint_name,   # midpoint itself
                color       => $dataflow_colour,
                label       => '',
                shape       => 'point',
                fixedsize   => 1,
                width       => 0.01,
                height      => 0.01,
            );
            $graph->add_edge( $from_node => $midpoint_name, # first half of the two-part arrow
                color       => $dataflow_colour,
                arrowhead   => 'none',
                fontname    => $df_edge_fontname,
                fontcolor   => $dataflow_colour,
                label       => '#'.$branch_code,
            );
            $graph->add_edge( $midpoint_name => $to_node,   # second half of the two-part arrow
                color     => $dataflow_colour,
            );
            if($funnel_dataflow_rule_id) {
                $graph->add_edge( $midpoint_name => _midpoint_name($funnel_dataflow_rule_id),   # semaphore inter-rule link
                    color     => $semablock_colour,
                    style     => 'dashed',
                    arrowhead => 'tee',
                    dir       => 'both',
                    arrowtail => 'crow',
                );
            }
        } elsif(UNIVERSAL::isa($to, 'Bio::EnsEMBL::Hive::Accumulator')) {
                # one-part dashed arrow:
            $graph->add_edge( $from_node => $to_node,
                color       => $accu_colour,
                style       => 'dashed',
                label       => $to->struct_name().'#'.$branch_code,
                fontname    => $df_edge_fontname,
                fontcolor   => $accu_colour,
                dir         => 'both',
                arrowtail   => 'crow',
            );
        } else {
                # one-part solid arrow:
            $graph->add_edge( $from_node => $to_node,
                color       => $dataflow_colour,
                fontname    => $df_edge_fontname,
                fontcolor   => $dataflow_colour,
                label       => '#'.$branch_code,
            );
        } # /if($needs_a_midpoint{$rule_id})
    } # /foreach my $rule (@$all_dataflow_rules)

}

493

494
sub _add_table_node {
    my ($self, $table_node, $table_name) = @_;

        # Adds one 'record' node for a table. Its HTML-like label shows the table name and,
        # when the 'DisplayData' option holds a row limit, the column names and up to that many rows.

    my $node_fontname    = $self->config_get('Node', 'Table', 'Font');
    my $data_limit       = $self->config_get('DisplayData');
    my (@column_names, $table_data, $hit_limit);

    if( $data_limit ) {
        my $adaptor = $self->dba->get_NakedTableAdaptor();
        $adaptor->table_name( $table_name );

        @column_names = sort keys %{$adaptor->column_set};
            # ask for one row more than the limit in order to find out whether there is more to show
        $table_data = $adaptor->fetch_all( 'LIMIT '.($data_limit+1) );

        if(scalar(@$table_data)>$data_limit) {
            pop @$table_data;
            $hit_limit = 1;
        }
    }

        # a colspan of 0 is never emitted, not even for a table without columns
    my $colspan = scalar(@column_names) || 1;

        # Text placed into the HTML-like label must not contain raw '&', '<' or '>';
        # undefined values are shown as empty cells
    my $escape_html = sub {
        my ($text) = @_;
        $text = '' unless defined $text;
        $text =~ s/&/&amp;/g;   # has to come first, otherwise the entities produced below would be escaped again
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        return $text;
    };

    my $table_label = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1"><tr><td colspan="'.$colspan.'">'.$table_name.'</td></tr>';

    if( $data_limit ) {
        $table_label .= '<tr><td colspan="'.$colspan.'"> </td></tr>';
        $table_label .= '<tr>'.join('', map { '<td bgcolor="lightblue" border="1">'.$escape_html->($_).'</td>' } @column_names).'</tr>';
        foreach my $row (@$table_data) {
            $table_label .= '<tr>'.join('', map { '<td>'.$escape_html->($_).'</td>' } @{$row}{@column_names}).'</tr>';
        }
        if($hit_limit) {
            $table_label  .= qq{<tr><td colspan="$colspan">[ more data ]</td></tr>};
        }
    }
    $table_label .= '</table>>';

    $self->graph()->add_node( $table_node,
        label => $table_label,
        shape => 'record',
        fontname => $node_fontname,
        color => $self->config_get('Node', 'Table', 'Colour'),
    );
}

1;