=pod

=head1 NAME

    Bio::EnsEMBL::Hive::Utils::Graph

=head1 SYNOPSIS

    my $hive_dba = get_hive_dba();
    my $g = Bio::EnsEMBL::Hive::Utils::Graph->new($hive_dba);
    my $graphviz = $g->build();
    $graphviz->as_png('location.png');

=head1 DESCRIPTION

    This is a module for converting a hive database's flow of analyses, control
    rules and dataflows into the GraphViz model language. This information can
    then be converted to an image or to the dot language for further manipulation
    in GraphViz.

=head1 LICENSE

    Copyright [1999-2014] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

         http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License
    is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and limitations under the License.

=head1 CONTACT

  Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  to discuss Hive-related questions or to be notified of our updates.

=head1 APPENDIX

    The rest of the documentation details each of the object methods.
    Internal methods are usually prefixed with an underscore (_).

=cut

package Bio::EnsEMBL::Hive::Utils::Graph;

use strict;
use warnings;

# eHive classes referenced by name in this module
# (collections of Analysis/DataflowRule objects, the GraphViz wrapper, the Config reader):
use Bio::EnsEMBL::Hive;
use Bio::EnsEMBL::Hive::Utils::GraphViz;
use Bio::EnsEMBL::Hive::Utils::Collection;
use Bio::EnsEMBL::Hive::Utils::Config;

# NOTE(review): config(), context() and config_get() are called on $self throughout this
# module but are not defined in it -- presumably they are inherited from this base class.
use base ('Bio::EnsEMBL::Hive::Configurable');


=head2 new()

  Arg [1] : Bio::EnsEMBL::Hive::DBSQL::DBAdaptor $hive_dba;
              The adaptor to get information from
  Arg [2] : (optional) string $config_file_name;
                  A JSON file name to initialize the Config object with.
                  If one is not given then we don't pass anything into Config's constructor,
                  which results in loading configuration from Config's standard locations.
  Description : Constructor. Both arguments are positional.
                Stores the adaptor, creates the Config object and sets the configuration context to [ 'Graph' ].
  Returntype : Graph object
  Exceptions : None thrown by this method itself (the arguments are stored without validation)
  Status     : Beta

=cut

sub new {
    my ($class, $hive_dba, $config_file_name) = @_;

        # allow both Class->new(...) and $existing_object->new(...)
    my $self = bless({}, ref($class) || $class);

    $self->hive_dba($hive_dba);

        # An empty argument list (rather than an undef/empty file name) is passed
        # when no configuration file was given:
    my @config_args = $config_file_name ? ( $config_file_name ) : ();
    $self->config( Bio::EnsEMBL::Hive::Utils::Config->new( @config_args ) );

    $self->context( [ 'Graph' ] );

    return $self;
}


=head2 graph()

  Arg [1]     : none
  Description : Returns the GraphViz object this module draws into.
                The object is created on the first call (using the 'Pad' configuration value, or 0 if it is not set)
                and the same object is returned by every subsequent call.
  Returntype  : Bio::EnsEMBL::Hive::Utils::GraphViz
  Exceptions  : None
  Status      : Beta

=cut

sub graph {
    my ($self) = @_;

    unless( exists $self->{graph} ) {
        my $padding = $self->config_get('Pad') || 0;

            # Injection hack: the value given as 'ratio' deliberately contains quotes.
            # NOTE(review): presumably they close the emitted ratio attribute and open an extra
            # 'pad' attribute in the generated dot output -- confirm against Utils::GraphViz.
        $self->{graph} = Bio::EnsEMBL::Hive::Utils::GraphViz->new(
            name  => 'AnalysisWorkflow',
            ratio => qq{compress"; pad = "$padding},
        );
    }

    return $self->{graph};
}


=head2 hive_dba()

  Arg [1]     : (optional) The DBAdaptor instance to store
  Description : Getter/setter for the hive DBAdaptor. When called with an argument (even an undefined one)
                that argument replaces the stored value.
  Returntype  : DBAdaptor (whatever was stored last; undef if nothing was)
  Exceptions  : None (the argument is stored as given, its type is not checked)
  Status      : Beta

=cut

sub hive_dba {
    my $self = shift @_;

        # test the argument count rather than definedness, so that an explicit undef resets the value
    $self->{'hive_dba'} = shift @_ if @_;

    return $self->{'hive_dba'};
}


# Plain function (not a method).
# Returns the GraphViz node name of the given analysis: 'analysis_' followed by its logic_name.
sub _analysis_node_name {
    my ($analysis) = @_;

    return 'analysis_' . $analysis->logic_name;
}


# Method.
# Returns the GraphViz node name of the table that the given dataflow rule flows into:
# 'table_' followed by the table name. When the 'DuplicateTables' configuration value is true,
# the logic_name of the rule's source analysis is appended, so that each source analysis
# gets a separate node for the same table.
sub _table_node_name {
    my ($self, $df_rule) = @_;

    my $node_name = 'table_' . $df_rule->to_analysis->table_name;

    if( $self->config_get('DuplicateTables') ) {
        $node_name .= '_' . $df_rule->from_analysis->logic_name;
    }

    return $node_name;
}


# Plain function (not a method).
# Returns the GraphViz node name of the "midpoint" of the given dataflow rule: 'dfr_<id>_mp'.
# The <id> is the part in parentheses of the stringified argument (for a plain reference
# that is its address, e.g. 0x55d0c8a3), which gives a unique id of a df_rule without
# relying on dbIDs being available.
# Dies if the argument does not stringify to something containing '(<word characters>)'.
sub _midpoint_name {
    my ($df_rule) = @_;

        # the definedness check only prevents an "uninitialized value" warning before we die
    if( defined($df_rule) and "$df_rule" =~ /\((\w+)\)/ ) {
        return 'dfr_'.$1.'_mp';
    }

    die "Wrong argument to _midpoint_name";
}


=head2 build()

  Arg [1]     : none
  Returntype  : The GraphViz object built & populated
  Exceptions  : Raised if there are issues with accessing the database
  Description : Builds the graph object and returns it. In order:
                  - if 'DisplayJobs' is configured and a job adaptor is available, preloads jobs into each analysis;
                  - links dataflow rules to their target analyses, counts inflows and marks funnel rules;
                  - allocates nodes to semaphore "boxes", starting from analyses without any inflow;
                  - adds the pipeline label (if 'DisplayDetails'), analysis nodes, control and dataflow edges;
                  - adds invisible stretching edges (if 'DisplayStretched');
                  - hands the clusters over to the GraphViz object (if 'DisplaySemaphoreBoxes').
  Status      : Beta

=cut

sub build {
    my ($self) = @_;

    my $hive_dba = $self->hive_dba;

        # Preload the jobs to be displayed. One job more than the limit is requested for each analysis:
    my $job_limit   = $self->config_get('DisplayJobs');
    my $job_adaptor = $job_limit && $hive_dba && $hive_dba->get_AnalysisJobAdaptor;
    if( $job_adaptor ) {
        foreach my $analysis ( Bio::EnsEMBL::Hive->collection('Analysis')->list ) {
            my @jobs = sort {$a->dbID <=> $b->dbID} @{ $job_adaptor->fetch_some_by_analysis_id_limit( $analysis->dbID, $job_limit+1 )};
            $analysis->jobs_collection( \@jobs );
        }
    }

        # Link each dataflow rule to its target and collect per-object bookkeeping
        # (stored directly in the objects' hashes under '_inflow_count' and '_is_a_funnel'):
    foreach my $df_rule ( Bio::EnsEMBL::Hive->collection('DataflowRule')->list ) {

        if(my $target_object = Bio::EnsEMBL::Hive->collection('Analysis')->find_one_by('logic_name', $df_rule->to_analysis_url )) {
            $df_rule->to_analysis( $target_object );
            if(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Analysis')) {
                $target_object->{'_inflow_count'}++;
            }
        } # otherwise it may be a link out (unsupported at the moment)

        if( my $funnel_dataflow_rule = $df_rule->funnel_dataflow_rule ) {
            $funnel_dataflow_rule->{'_is_a_funnel'}++;
        }
    }

        # NB: this is a very approximate algorithm with rough edges!
        # It will not find all start nodes in cyclic components!
    foreach my $source_analysis ( Bio::EnsEMBL::Hive->collection('Analysis')->list ) {
        unless( $source_analysis->{'_inflow_count'} ) {    # if there is no dataflow into this analysis
                # run the recursion in each component that has a non-cyclic start:
            $self->_propagate_allocation( $source_analysis );
        }
    }

        # Optional "dbname@host" label:
    my $dbc = $self->config_get('DisplayDetails') && $hive_dba && $hive_dba->dbc;
    if( $dbc ) {
        my $pipeline_label = sprintf('%s@%s', $dbc->dbname, $dbc->host || '-');
        $self->_add_pipeline_label( $pipeline_label );
    }

        # All the nodes are added before any of the edges:
    foreach my $analysis ( Bio::EnsEMBL::Hive->collection('Analysis')->list ) {
        $self->_add_analysis_node($analysis);
    }
    foreach my $analysis ( Bio::EnsEMBL::Hive->collection('Analysis')->list ) {
        $self->_add_control_rules( $analysis->control_rules_collection );
        $self->_add_dataflow_rules( $analysis->dataflow_rules_collection );
    }

    if($self->config_get('DisplayStretched') ) {    # put each analysis before its funnel midpoint
        foreach my $analysis ( Bio::EnsEMBL::Hive->collection('Analysis')->list ) {
            if($analysis->{'_funnel_dfr'}) {    # this should only affect analyses that have a funnel
                my $from = _analysis_node_name( $analysis );
                my $to   = _midpoint_name( $analysis->{'_funnel_dfr'} );
                $self->graph->add_edge( $from => $to,
                    color     => 'black',
                    style     => 'invis',   # toggle visibility by changing 'invis' to 'dashed'
                );
            }
        }
    }

    if($self->config_get('DisplaySemaphoreBoxes') ) {
            # maps the name of a cluster (the midpoint name of its funnel rule, or '' for the top level)
            # to the list of names of the nodes that belong to it:
        my %cluster_2_nodes = ();

        foreach my $analysis (Bio::EnsEMBL::Hive->collection('Analysis')->list) {
            if(my $funnel = $analysis->{'_funnel_dfr'}) {
                push @{$cluster_2_nodes{ _midpoint_name( $funnel ) } }, _analysis_node_name( $analysis );
            }

            foreach my $df_rule ( @{ $analysis->dataflow_rules_collection } ) {
                if( $df_rule->{'_is_a_funnel'} and ! $df_rule->{'_funnel_dfr'} ) {

                    push @{$cluster_2_nodes{ '' }}, _midpoint_name( $df_rule );     # top-level funnels define clusters (top-level "boxes")

                } elsif( UNIVERSAL::isa($df_rule->to_analysis, 'Bio::EnsEMBL::Hive::NakedTable') ) {

                    if(my $funnel = $df_rule->to_analysis->{'_funnel_dfr'}) {
                        push @{$cluster_2_nodes{ _midpoint_name( $funnel ) } }, $self->_table_node_name( $df_rule );    # table belongs to the same "box" as the dataflow source
                    }
                }

                if(my $funnel = $df_rule->{'_funnel_dfr'}) {
                    push @{$cluster_2_nodes{ _midpoint_name( $funnel ) } }, _midpoint_name( $df_rule ); # midpoints of rules that have a funnel live inside "boxes"
                }
            }
        }

        $self->graph->cluster_2_nodes( \%cluster_2_nodes );
        $self->graph->colour_scheme( $self->config_get('Box', 'ColourScheme') );
        $self->graph->colour_offset( $self->config_get('Box', 'ColourOffset') );
    }

    return $self->graph();
}


# Method, recursive.
# Walks the dataflow rules going out of $source_analysis and decides which semaphore "box"
# each rule and each target (analysis or table) belongs to. The decision is recorded under
# the '_funnel_dfr' key of the objects' hashes: the value is the funnel dataflow rule that
# defines the box, or a false value for the top level.
# Targets are allocated on first-come basis; the recursion only continues through analyses
# that have not been allocated yet.
sub _propagate_allocation {
    my ($self, $source_analysis ) = @_;

    foreach my $df_rule ( @{ $source_analysis->dataflow_rules_collection } ) {    # this will only work if the analyses objects are ALL cached before loading DFRs
        my $target_object = $df_rule->to_analysis();
        my $target_node_name;   # only referred to by the (commented out) debugging messages below

        if(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Analysis')) {
            $target_node_name = _analysis_node_name( $target_object );
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::NakedTable')) {
            $target_node_name = $self->_table_node_name( $df_rule );
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Accumulator')) {
            next;   # accumulators are not allocated
        } else {
            warn('Do not know how to handle the type '.ref($target_object));
            next;
        }

        my $proposed_funnel_dfr;    # will depend on whether we start a new semaphore

        my $funnel_dataflow_rule = $df_rule->funnel_dataflow_rule();
        if( $funnel_dataflow_rule ) {   # if there is a new semaphore, the dfrs involved (their midpoints) will also have to be allocated
            $proposed_funnel_dfr = $funnel_dataflow_rule;       # if we do start a new semaphore, report to the new funnel (based on common funnel rule's midpoint)

            $df_rule->{'_funnel_dfr'} = $proposed_funnel_dfr;

            $funnel_dataflow_rule->{'_funnel_dfr'} = $source_analysis->{'_funnel_dfr'}; # draw the funnel's midpoint outside of the box
        } else {
            $proposed_funnel_dfr = $source_analysis->{'_funnel_dfr'} || ''; # if we don't start a new semaphore, inherit the allocation of the source
        }

            # we allocate on first-come basis at the moment:
        if( exists $target_object->{'_funnel_dfr'} ) {  # node is already allocated?

                # The existing allocation always wins, whether or not it agrees with the proposed one.
                # For debugging either case, compare $target_object->{'_funnel_dfr'} with $proposed_funnel_dfr here:
                # warn "analysis '$target_node_name' has already been allocated to '$target_object->{'_funnel_dfr'}', this branch proposes '$proposed_funnel_dfr'";

            if($funnel_dataflow_rule) {  # correction for multiple entries into the same box (probably needs re-thinking)
                $df_rule->{'_funnel_dfr'} = $target_object->{'_funnel_dfr'};
            }

        } else {
            # warn "allocating analysis '$target_node_name' to '$proposed_funnel_dfr'";
            $target_object->{'_funnel_dfr'} = $proposed_funnel_dfr;

            if(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Analysis')) {
                $self->_propagate_allocation( $target_object );
            }
        }
    }
}

# Method.
# Adds a border-less text node called 'Details' that shows $pipeline_label,
# using the font configured under Node/Details/Font.
sub _add_pipeline_label {
    my ($self, $pipeline_label) = @_;

    my $node_fontname = $self->config_get('Node', 'Details', 'Font');

    $self->graph()->add_node( 'Details',
        label     => $pipeline_label,
        fontname  => $node_fontname,
        shape     => 'plaintext',
    );
}

# Method.
# Adds the node that represents $analysis. The label is a GraphViz HTML-like table with:
#   - a title row: logic_name and dbID ('?' if there is no dbID);
#   - if 'DisplayStats' is 'barchart': one coloured cell per non-zero job counter, plus a total;
#   - if 'DisplayStats' is 'text': the textual breakout provided by the analysis stats;
#   - if 'DisplayJobs' is set: one row per preloaded job (at most that many), plus a "more" row.
# The fill colour and the font depend on the status of the analysis; analyses that can be empty
# get a dashed outline.
sub _add_analysis_node {
    my ($self, $analysis) = @_;

    my $analysis_stats = $analysis->stats();

    my ($breakout_label, $total_job_count, $count_hash) = $analysis_stats->job_count_breakout();
    my $analysis_status         = $analysis_stats->status;
    my $analysis_status_colour  = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Colour');
    my $style                   = $analysis->can_be_empty() ? 'dashed, filled' : 'filled';
    my $node_fontname           = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Font');

    my $display_stats           = $self->config_get('DisplayStats');
    $display_stats = '' unless defined $display_stats;     # avoid "uninitialized" warnings in the string comparisons below

    my $colspan   = 0;
    my $bar_chart = '';

    if( $display_stats eq 'barchart' ) {
        foreach my $count_method (qw(SEMAPHORED READY INPROGRESS DONE FAILED)) {
            if(my $count=$count_hash->{lc($count_method).'_job_count'}) {
                    # the cell shows the count followed by the first letter of the status, e.g. "12d" for 12 DONE jobs
                $bar_chart .= '<td bgcolor="'.$self->config_get('Node', 'JobStatus', $count_method, 'Colour').'" width="'.int(100*$count/$total_job_count).'%">'.$count.lc(substr($count_method,0,1)).'</td>';
                ++$colspan;
            }
        }
        if($colspan != 1) {     # with exactly one non-zero counter the total would just repeat it
            $bar_chart .= '<td>='.$total_job_count.'</td>';
            ++$colspan;
        }
    }

    $colspan ||= 1;
    my $analysis_label  = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1"><tr><td colspan="'.$colspan.'">'.$analysis->logic_name().' ('.($analysis->dbID || '?').')</td></tr>';
    if( $display_stats ) {
        $analysis_label    .= qq{<tr><td colspan="$colspan"> </td></tr>};
        if( $display_stats eq 'barchart') {
            $analysis_label    .= qq{<tr>$bar_chart</tr>};
        } elsif( $display_stats eq 'text') {
            $analysis_label    .= qq{<tr><td colspan="$colspan">$breakout_label</td></tr>};
        }
    }

    if( my $job_limit = $self->config_get('DisplayJobs') ) {
            # the jobs are only preloaded when a job adaptor was available, hence the fallback to an empty list
        my @jobs = @{ $analysis->jobs_collection || [] };

            # one job more than the limit is preloaded; its presence means that there are more jobs than we show
        my $hit_limit;
        if(scalar(@jobs)>$job_limit) {
            pop @jobs;
            $hit_limit = 1;
        }

        $analysis_label    .= '<tr><td colspan="'.$colspan.'"> </td></tr>';
        foreach my $job (@jobs) {
            my $input_id = $job->input_id;
            my $status   = $job->status;
            my $job_id   = $job->dbID;
            $input_id=~s/&/&amp;/g;     # must be done first, so that the entities introduced below are not escaped again
            $input_id=~s/\>/&gt;/g;
            $input_id=~s/\</&lt;/g;
            $input_id=~s/\{|\}//g;
            $analysis_label    .= qq{<tr><td colspan="$colspan" bgcolor="}.$self->config_get('Node', 'JobStatus', $status, 'Colour').qq{">$job_id [$status]: $input_id</td></tr>};
        }

        if($hit_limit) {
            $analysis_label    .= qq{<tr><td colspan="$colspan">[ and }.($total_job_count-$job_limit).qq{ more ]</td></tr>};
        }
    }
    $analysis_label    .= '</table>>';

    $self->graph->add_node( _analysis_node_name( $analysis ),
        label       => $analysis_label,
        shape       => 'record',
        fontname    => $node_fontname,
        style       => $style,
        fillcolor   => $analysis_status_colour,
    );
}


# Method.
# Adds one edge for each control rule in the given array reference. The edge goes from the
# condition analysis to the controlled analysis, is drawn in the colour configured under
# Edge/Control/Colour and ends with a 'tee' arrowhead.
sub _add_control_rules {
    my ($self, $ctrl_rules) = @_;

    my $control_colour = $self->config_get('Edge', 'Control', 'Colour');
    my $graph          = $self->graph();

        # The control rules are always from and to an analysis so no need to search for odd cases here
    foreach my $c_rule ( @$ctrl_rules ) {
        my $from_node_name = _analysis_node_name( $c_rule->condition_analysis );
        my $to_node_name   = _analysis_node_name( $c_rule->ctrled_analysis );

        $graph->add_edge( $from_node_name => $to_node_name,
            color     => $control_colour,
            arrowhead => 'tee',
        );
    }
}

# Method.
# Adds the edges (and, where needed, table nodes and midpoint nodes) for each dataflow rule
# in the given array reference:
#   - a rule that has a funnel or is a funnel is drawn as a two-part arrow through a tiny
#     "midpoint" node; a rule that has a funnel additionally gets a dashed link from its
#     midpoint to the midpoint of the funnel rule;
#   - any other rule into an accumulator is drawn as a one-part dashed arrow;
#   - any other rule is drawn as a one-part solid arrow.
# Rules whose target is of an unknown type are skipped with a warning.
sub _add_dataflow_rules {
    my ($self, $dataflow_rules) = @_;

    my $graph               = $self->graph();
    my $dataflow_colour     = $self->config_get('Edge', 'Data', 'Colour');
    my $semablock_colour    = $self->config_get('Edge', 'Semablock', 'Colour');
    my $accu_colour         = $self->config_get('Edge', 'Accu', 'Colour');
    my $df_edge_fontname    = $self->config_get('Edge', 'Data', 'Font');

    foreach my $df_rule ( @$dataflow_rules ) {

            # Each accessor is called in scalar context on purpose: in one combined list assignment
            # an accessor that returned an empty list would shift all the following values.
        my $from_analysis           = $df_rule->from_analysis;
        my $branch_code             = $df_rule->branch_code;
        my $funnel_dataflow_rule    = $df_rule->funnel_dataflow_rule;
        my $target_object           = $df_rule->to_analysis;

        my $from_node_name = _analysis_node_name( $from_analysis );
        my $target_node_name;

            # Different treatment for analyses and tables:
        if(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Analysis')) {

            $target_node_name = _analysis_node_name( $target_object );

        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::NakedTable')) {

            $target_node_name = $self->_table_node_name( $df_rule );
            $self->_add_table_node($target_node_name, $target_object->table_name);

        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Accumulator')) {

                # an accumulator has no node of its own: the arrow points at the midpoint of the funnel
                # rule that the source analysis is allocated to (_midpoint_name() dies if there is none)
            $target_node_name = _midpoint_name( $from_analysis->{'_funnel_dfr'} );

        } else {
            warn('Do not know how to handle the type '.ref($target_object));
            next;
        }

            # a rule needs a midpoint either if it HAS a funnel or if it IS a funnel
        if( $funnel_dataflow_rule or $df_rule->{'_is_a_funnel'} ) {
            my $midpoint_name = _midpoint_name( $df_rule );

            $graph->add_node( $midpoint_name,   # midpoint itself
                color       => $dataflow_colour,
                label       => '',
                shape       => 'point',
                fixedsize   => 1,
                width       => 0.01,
                height      => 0.01,
            );
            $graph->add_edge( $from_node_name => $midpoint_name, # first half of the two-part arrow
                color       => $dataflow_colour,
                arrowhead   => 'none',
                fontname    => $df_edge_fontname,
                fontcolor   => $dataflow_colour,
                label       => '#'.$branch_code,
            );
            $graph->add_edge( $midpoint_name => $target_node_name,   # second half of the two-part arrow
                color     => $dataflow_colour,
            );
            if($funnel_dataflow_rule) {
                $graph->add_edge( $midpoint_name => _midpoint_name( $funnel_dataflow_rule ),   # semaphore inter-rule link
                    color     => $semablock_colour,
                    style     => 'dashed',
                    arrowhead => 'tee',
                    dir       => 'both',
                    arrowtail => 'crow',
                );
            }
        } elsif(UNIVERSAL::isa($target_object, 'Bio::EnsEMBL::Hive::Accumulator')) {
                # one-part dashed arrow:
            $graph->add_edge( $from_node_name => $target_node_name,
                color       => $accu_colour,
                style       => 'dashed',
                label       => $target_object->struct_name().'#'.$branch_code,
                fontname    => $df_edge_fontname,
                fontcolor   => $accu_colour,
                dir         => 'both',
                arrowtail   => 'crow',
            );
        } else {
                # one-part solid arrow:
            $graph->add_edge( $from_node_name => $target_node_name,
                color       => $dataflow_colour,
                fontname    => $df_edge_fontname,
                fontcolor   => $dataflow_colour,
                label       => '#'.$branch_code,
            );
        } # /if( "$df_rule needs a midpoint" )
    } # /foreach my $df_rule (@$dataflow_rules)

}

# Method.
# Adds the node $table_node_name that represents the table $table_name. The label is a
# GraphViz HTML-like table whose first row is the table name. If the 'DisplayData'
# configuration value is set and a NakedTable adaptor is available, the label also shows
# the (alphabetically sorted) column names and up to 'DisplayData' rows of the table,
# followed by a '[ more data ]' row if the table has more rows than that.
# Column names and cell values are escaped for the HTML-like label; NULL values are shown
# as empty cells.
sub _add_table_node {
    my ($self, $table_node_name, $table_name) = @_;

    my $node_fontname = $self->config_get('Node', 'Table', 'Font');
    my $hive_dba      = $self->hive_dba;
    my $data_limit    = $self->config_get('DisplayData');

        # '&' has to be replaced first, so that the entities introduced afterwards are not escaped again
    my $escape_html = sub {
        my ($text) = @_;
        return '' unless defined $text;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        return $text;
    };

    my (@column_names, $table_data, $hit_limit);

    if( $data_limit and my $naked_table_adaptor = $hive_dba && $hive_dba->get_NakedTableAdaptor ) {
        $naked_table_adaptor->table_name( $table_name );

        @column_names = sort keys %{$naked_table_adaptor->column_set};

            # one row more than the limit is requested; its presence means that there is more data than we show
        $table_data = $naked_table_adaptor->fetch_all( 'LIMIT '.($data_limit+1) ) || [];

        if(scalar(@$table_data)>$data_limit) {
            pop @$table_data;
            $hit_limit = 1;
        }
    }

    my $colspan = scalar(@column_names) || 1;

    my $table_label = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1"><tr><td colspan="'.$colspan.'">'.$table_name.'</td></tr>';

        # The data section is only drawn if the columns are known:
        # otherwise it would consist of an empty (invalid) row and cells with an empty colspan.
    if( @column_names ) {
        $table_label .= '<tr><td colspan="'.$colspan.'"> </td></tr>';
        $table_label .= '<tr>'.join('', map { '<td bgcolor="lightblue" border="1">'.$escape_html->($_).'</td>' } @column_names).'</tr>';
        foreach my $row (@$table_data) {
            $table_label .= '<tr>'.join('', map { '<td>'.$escape_html->($_).'</td>' } @{$row}{@column_names}).'</tr>';
        }
        if($hit_limit) {
            $table_label  .= qq{<tr><td colspan="$colspan">[ more data ]</td></tr>};
        }
    }
    $table_label .= '</table>>';

    $self->graph()->add_node( $table_node_name,
        label    => $table_label,
        shape    => 'record',
        fontname => $node_fontname,
        color    => $self->config_get('Node', 'Table', 'Colour'),
    );
}

1;