package Bio::EnsEMBL::Hive::Utils::Graph;

=head1 NAME

Bio::EnsEMBL::Hive::Utils::Graph

=head1 SYNOPSIS

  my $dba = get_hive_dba();
  my $g = Bio::EnsEMBL::Hive::Utils::Graph->new($dba);
  my $graphviz = $g->build();
  $graphviz->as_png('location.png');

=head1 DESCRIPTION

This is a module for converting a hive database's flow of analyses, control 
rules and dataflows into the GraphViz model language. This information can
then be converted to an image or to the dot language for further manipulation
in GraphViz.

=head1 METHODS/SUBROUTINES

See inline

=cut

use strict;
use warnings;

use Bio::EnsEMBL::Utils::Scalar qw(check_ref assert_ref);

use Bio::EnsEMBL::Hive::Utils::GraphViz;
use Bio::EnsEMBL::Hive::Utils::Config;

use base ('Bio::EnsEMBL::Hive::Configurable');


=head2 new()

  Arg [1] : Bio::EnsEMBL::Hive::DBSQL::DBAdaptor $dba;
              The adaptor to get information from
  Arg [2] : (optional) string $config_file_name;
                  A JSON file name to initialize the Config object with.
                  If one is not given then we don't pass anything into Config's constructor,
                  which results in loading configuration from Config's standard locations.
  Returntype : Graph object
  Exceptions : If the parameters are not as required
  Status     : Beta

=cut

sub new {
  # Constructor: stores the DBAdaptor, creates the Config object and sets the
  # configuration context to [ 'Graph' ].
  #   $dba              - Bio::EnsEMBL::Hive::DBSQL::DBAdaptor to read the pipeline from
  #   $config_file_name - (optional) JSON file name handed to the Config constructor
  # Returns the new Graph object.
  my ($class, $dba, $config_file_name) = @_;

  my $self = bless({}, ref($class) || $class);

  $self->dba($dba);   # dba() type-checks any defined value via assert_ref

  # Pass a file name on only when one was given, so that Config->new() otherwise receives no arguments
  my $config = Bio::EnsEMBL::Hive::Utils::Config->new( $config_file_name ? $config_file_name : () );
  $self->config($config);
  $self->context( [ 'Graph' ] );

  return $self;
}


=head2 graph()

  Arg [1] : none
  Description : Returns the GraphViz instance used by this module (created on first call)
  Returntype : GraphViz
  Exceptions : None
  Status     : Beta

=cut

sub graph {
    # Lazily creates and caches the GraphViz wrapper object in $self->{graph}.
    # Returns the cached object on every subsequent call.
    my ($self) = @_;

    if(! exists $self->{graph}) {
        my $padding  = $self->config_get('Pad') || 0;

        # Injection hack: the 'ratio' value deliberately contains a closing quote followed by
        # a 'pad' attribute, presumably so that both attributes end up in the generated dot
        # source even though only 'ratio' is passed to the constructor.
        $self->{graph} = Bio::EnsEMBL::Hive::Utils::GraphViz->new( name => 'AnalysisWorkflow', ratio => qq{compress"; pad = "$padding}  );
    }
    return $self->{graph};
}


=head2 dba()

  Arg [1] : The DBAdaptor instance
  Returntype : DBAdaptor
  Exceptions : If the given object is not a hive DBAdaptor
  Status     : Beta

=cut

sub dba {
  # Combined getter/setter for the hive DBAdaptor.
  # A defined argument is type-checked with assert_ref and stored;
  # without one the stored value is returned untouched.
  my $self    = shift;
  my $new_dba = shift;

  return $self->{dba} unless defined $new_dba;

  assert_ref($new_dba, 'Bio::EnsEMBL::Hive::DBSQL::DBAdaptor');
  $self->{dba} = $new_dba;

  return $self->{dba};
}


sub _analysis_node_name {
    # Plain function (not a method): turns an analysis dbID into its graph node name.
    my ($analysis_id) = @_;

    return "analysis_$analysis_id";
}

sub _table_node_name {
    # Plain function (not a method): builds the node-name prefix for a table.
    my ($table_name) = @_;

    return "table_$table_name";
}

sub _midpoint_name {
    # Plain function (not a method): builds the node name of a dataflow rule's midpoint.
    my ($rule_id) = @_;

    return sprintf('dfr_%s_mp', $rule_id);
}


=head2 build()

  Returntype : The GraphViz object built & populated
  Exceptions : Raised if there are issues with accessing the database
  Description : Builds the graph object and returns it.
  Status     : Beta

=cut

sub build {
    # Fetches all analyses, control rules and dataflow rules through the DBAdaptor,
    # allocates nodes to semaphore boxes and populates the GraphViz object with
    # nodes and edges. Returns the populated GraphViz object ($self->graph()).
    my ($self) = @_;

    my $all_analyses          = $self->dba()->get_AnalysisAdaptor()->fetch_all();
    my $all_ctrl_rules        = $self->dba()->get_AnalysisCtrlRuleAdaptor()->fetch_all();
    my $all_dataflow_rules    = $self->dba()->get_DataflowRuleAdaptor()->fetch_all();

    my %inflow_count = ();    # used to detect sources (nodes with zero inflow)
    my %outflow_rules = ();   # maps from analysis_node_name to a list of all dataflow rules that flow out of it
    my %dfr_flows_into_node = ();   # maps from dfr_id to target analysis_node_name

    foreach my $rule ( @$all_dataflow_rules ) {
        my $target_object = $rule->to_analysis;

        # only targets that have a (true) dbID are counted as inflow
        if(my $to_id = $target_object->can('dbID') && $target_object->dbID()) {
            my $to_node_name = _analysis_node_name( $to_id );
            $inflow_count{$to_node_name}++;
            $dfr_flows_into_node{$rule->dbID()} = $to_node_name;
        }
        push @{$outflow_rules{ _analysis_node_name($rule->from_analysis_id()) }}, $rule;
    }

    my %subgraph_allocation = ();

        # NB: this is a very approximate algorithm with rough edges!
        # It will not find all start nodes in cyclic components!
    foreach my $source_analysis_node_name ( map { _analysis_node_name( $_->dbID ) } @$all_analyses ) {
        unless($inflow_count{$source_analysis_node_name}) {    # if there is no dataflow into this analysis
            $self->_allocate_to_subgraph(\%outflow_rules, \%dfr_flows_into_node, $source_analysis_node_name, \%subgraph_allocation ); # run the recursion in each component that has a non-cyclic start
        }
    }

    $self->_add_hive_details();
    foreach my $analysis (@$all_analyses) {     # not named $a, which would shadow sort's package variable
        $self->_add_analysis_node($analysis);
    }
    $self->_control_rules( $all_ctrl_rules );
    $self->_dataflow_rules( $all_dataflow_rules, \%subgraph_allocation );

    if($self->config_get('DisplayStretched') ) {

        # The invisible edges will be linked to the destination analysis instead of the midpoint
        # NOTE(review): $midpoint_to_analysis is built here but the loop below still links
        # $from to the raw allocation value ($to) - confirm which behaviour is intended.
        my $id_to_rule = {map { $_->dbID => $_ } @$all_dataflow_rules};
        my @all_fdr_id = grep {$_} (map {$_->funnel_dataflow_rule_id} @$all_dataflow_rules);
        my $midpoint_to_analysis = {map { _midpoint_name( $_ ) => _analysis_node_name( $id_to_rule->{$_}->to_analysis->dbID ) } @all_fdr_id};

        while( my($from, $to) = each %subgraph_allocation) {
            if($to && $from=~/^analysis/) {
                $self->graph->add_edge( $from => $to,
                    color     => 'black',
                    style     => 'invis',   # toggle visibility by changing 'invis' to 'dashed'
                );
            }
        }
    }

    if($self->config_get('DisplaySemaphoreBoxes') ) {
        $self->graph->subgraphs( \%subgraph_allocation );
        $self->graph->colour_scheme( $self->config_get('Box', 'ColourScheme') );
        $self->graph->colour_offset( $self->config_get('Box', 'ColourOffset') );
    }

    return $self->graph();
}


sub _allocate_to_subgraph {
    # Recursively walks the dataflow rules that leave $source_analysis_node_name and records
    # in %$subgraph_allocation which box (named after a funnel rule's midpoint) each reachable
    # analysis node, table node and fan/funnel midpoint belongs to.
    #   $outflow_rules             - hashref: source node name => arrayref of dataflow rules
    #   $dfr_flows_into_node       - hashref: rule dbID => target node name (currently only passed
    #                                on to the recursive call, not read here)
    #   $source_analysis_node_name - node whose outgoing rules are examined
    #   $subgraph_allocation       - hashref that is filled in place: node name => allocation
    # Allocation is on a first-come basis: a node that is already allocated is neither
    # re-allocated nor traversed again, which also stops the recursion on cycles.
    my ($self, $outflow_rules, $dfr_flows_into_node, $source_analysis_node_name, $subgraph_allocation ) = @_;

    my $source_analysis_allocation = $subgraph_allocation->{ $source_analysis_node_name };  # for some analyses it will be undef

    # '|| []' avoids autovivifying an empty entry in %$outflow_rules for nodes without outflow
    foreach my $rule ( @{ $outflow_rules->{$source_analysis_node_name} || [] } ) {
        my $target_object = $rule->to_analysis();
        my $target_node_name;

        if(check_ref($target_object, 'Bio::EnsEMBL::Hive::Analysis')) {
            $target_node_name = _analysis_node_name( $target_object->dbID() );
        } elsif(check_ref($target_object, 'Bio::EnsEMBL::Hive::NakedTable')) {
            # the suffix must match the one used when the table node is created in _dataflow_rules()
            $target_node_name = _table_node_name($target_object->table_name()) . '_' .
                ($self->config_get('DuplicateTables') ?  $rule->from_analysis_id() : ($source_analysis_allocation||''));
        } elsif(check_ref($target_object, 'Bio::EnsEMBL::Hive::Accumulator')) {
            next;   # accumulators are not allocated
        } else {
            warn('Do not know how to handle the type '.ref($target_object));
            next;
        }

        my $proposed_allocation;    # will depend on whether we start a new semaphore
        my $fan_midpoint_name;      # only set for semaphored (fan) rules
        my $funnel_dataflow_rule_id  = $rule->funnel_dataflow_rule_id();
        if( $funnel_dataflow_rule_id ) {
            my $funnel_midpoint_name = _midpoint_name( $funnel_dataflow_rule_id );
            $fan_midpoint_name       = _midpoint_name( $rule->dbID() );

            $proposed_allocation = $funnel_midpoint_name;   # if we do start a new semaphore, report to the new funnel (based on common funnel rule's midpoint)

            $subgraph_allocation->{ $fan_midpoint_name }    = $proposed_allocation;
            $subgraph_allocation->{ $funnel_midpoint_name } = $source_analysis_allocation;   # draw the funnel's midpoint outside of the box
        } else {
            $proposed_allocation = $source_analysis_allocation;   # if we don't start a new semaphore, inherit the allocation of the source
        }

            # we allocate on first-come basis at the moment:
        if( exists $subgraph_allocation->{ $target_node_name } ) {  # already allocated?
            # A conflicting proposal from this branch is silently ignored; the known allocation wins.

            if($funnel_dataflow_rule_id) {  # correction for multiple entries into the same box (probably needs re-thinking)
                $subgraph_allocation->{ $fan_midpoint_name } = $subgraph_allocation->{ $target_node_name };
            }

        } else {
            $subgraph_allocation->{ $target_node_name } = $proposed_allocation;

            $self->_allocate_to_subgraph( $outflow_rules, $dfr_flows_into_node, $target_node_name, $subgraph_allocation );
        }
    }

    return;
}

sub _add_hive_details {
  # Adds a plaintext 'Details' node labelled "dbname@host" (host shown as '-' when it is
  # empty) to the graph, but only if the 'DisplayDetails' option is switched on.
  my ($self) = @_;

  my $node_fontname  = $self->config_get('Node', 'Details', 'Font');

  return unless $self->config_get('DisplayDetails');

  my $dbc   = $self->dba()->dbc();
  my $label = sprintf('%s@%s', $dbc->dbname, $dbc->host || '-');

  $self->graph()->add_node( 'Details',
    label     => $label,
    fontname  => $node_fontname,
    shape     => 'plaintext',
  );

  return;
}

sub _add_analysis_node {
    # Adds one 'record' node for the given analysis. The label is an HTML-like table holding
    # the logic_name and dbID, optionally followed by job statistics ('DisplayStats' set to
    # 'barchart' or 'text') and by a listing of up to 'DisplayJobs' jobs.
    #   $analysis - analysis object; must provide stats(), can_be_empty(), logic_name() and dbID()
    my ($self, $analysis) = @_;

    my $analysis_stats = $analysis->stats();

    my ($breakout_label, $total_job_count, $count_hash)   = $analysis_stats->job_count_breakout();
    my $analysis_status                                   = $analysis_stats->status;
    my $analysis_status_colour                            = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Colour');
    my $style                                             = $analysis->can_be_empty() ? 'dashed, filled' : 'filled' ;
    my $node_fontname                                     = $self->config_get('Node', 'AnalysisStatus', $analysis_status, 'Font');
    my $display_stats                                     = $self->config_get('DisplayStats');

    # an unset option must not trigger "uninitialized value" warnings in the 'eq' tests below
    $display_stats = '' unless defined $display_stats;

    my $colspan = 0;
    my $bar_chart = '';

    if( $display_stats eq 'barchart' ) {
        foreach my $count_method (qw(SEMAPHORED READY INPROGRESS DONE FAILED)) {
            if(my $count=$count_hash->{lc($count_method).'_job_count'}) {
                # one cell per non-zero status; its width is that status' share of all jobs
                $bar_chart .= '<td bgcolor="'.$self->config_get('Node', 'JobStatus', $count_method, 'Colour').'" width="'.int(100*$count/$total_job_count).'%">'.$count.lc(substr($count_method,0,1)).'</td>';
                ++$colspan;
            }
        }
        if($colspan != 1) {     # the total is only shown when it is not identical to a single cell
            $bar_chart .= '<td>='.$total_job_count.'</td>';
            ++$colspan;
        }
    }

    $colspan ||= 1;
    my $analysis_label  = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1"><tr><td colspan="'.$colspan.'">'.$analysis->logic_name().' ('.$analysis->dbID().')</td></tr>';
    if( $display_stats ) {
        $analysis_label    .= qq{<tr><td colspan="$colspan"> </td></tr>};
        if( $display_stats eq 'barchart') {
            $analysis_label    .= qq{<tr>$bar_chart</tr>};
        } elsif( $display_stats eq 'text') {
            $analysis_label    .= qq{<tr><td colspan="$colspan">$breakout_label</td></tr>};
        }
    }

    if( my $job_limit = $self->config_get('DisplayJobs') ) {
        my $adaptor = $self->dba->get_AnalysisJobAdaptor();

        # ask for one job more than the limit to find out whether there are more than we display
        my @jobs = sort {$a->dbID <=> $b->dbID} @{ $adaptor->fetch_some_by_analysis_id_limit( $analysis->dbID, $job_limit+1 )};

        my $hit_limit;
        if(scalar(@jobs)>$job_limit) {
            pop @jobs;
            $hit_limit = 1;
        }

        $analysis_label    .= '<tr><td colspan="'.$colspan.'"> </td></tr>';
        foreach my $job (@jobs) {
            my $input_id = $job->input_id;
            my $status   = $job->status;
            my $job_id   = $job->dbID;

            # escape for the HTML-like label; '&' has to go first so that the entities
            # produced for '<' and '>' are not escaped a second time
            $input_id=~s/&/&amp;/g;
            $input_id=~s/\>/&gt;/g;
            $input_id=~s/\</&lt;/g;
            $input_id=~s/\{|\}//g;      # curly braces are dropped altogether
            $analysis_label    .= qq{<tr><td colspan="$colspan" bgcolor="}.$self->config_get('Node', 'JobStatus', $status, 'Colour').qq{">$job_id [$status]: $input_id</td></tr>};
        }

        if($hit_limit) {
            $analysis_label    .= qq{<tr><td colspan="$colspan">[ and }.($total_job_count-$job_limit).qq{ more ]</td></tr>};
        }
    }
    $analysis_label    .= '</table>>';

    $self->graph->add_node( _analysis_node_name( $analysis->dbID() ),
        label       => $analysis_label,
        shape       => 'record',
        fontname    => $node_fontname,
        style       => $style,
        fillcolor   => $analysis_status_colour,
    );
}


sub _control_rules {
  # Draws one edge with a 'tee' arrowhead per control rule, from the condition analysis
  # to the controlled analysis, in the colour configured under Edge/Control/Colour.
  #   $all_ctrl_rules - arrayref of control rule objects
  my ($self, $all_ctrl_rules) = @_;

  my $control_colour = $self->config_get('Edge', 'Control', 'Colour');
  my $graph = $self->graph();

  #The control rules are always from and to an analysis so no need to search for odd cases here
  foreach my $rule ( @$all_ctrl_rules ) {
    my $from = _analysis_node_name( $rule->condition_analysis()->dbID() );
    my $to   = _analysis_node_name( $rule->ctrled_analysis()->dbID() );

    $graph->add_edge( $from => $to,
      color => $control_colour,
      arrowhead => 'tee',
    );
  }
}

sub _dataflow_rules {
    # Draws the edges (and, where needed, midpoint and table nodes) for all dataflow rules.
    #   $all_dataflow_rules  - arrayref of dataflow rule objects
    #   $subgraph_allocation - hashref: node name => allocation, as filled by _allocate_to_subgraph()
    # A rule that takes part in a semaphore (as a fan or as the funnel) is drawn as a two-part
    # arrow through a midpoint node; each fan midpoint is linked to its funnel's midpoint by a
    # dashed edge. Other rules are drawn as a single arrow (dashed for accumulators).
    my ($self, $all_dataflow_rules, $subgraph_allocation) = @_;

    my $graph = $self->graph();
    my $dataflow_colour     = $self->config_get('Edge', 'Data', 'Colour');
    my $semablock_colour    = $self->config_get('Edge', 'Semablock', 'Colour');
    my $accu_colour         = $self->config_get('Edge', 'Accu', 'Colour');
    my $df_edge_fontname    = $self->config_get('Edge', 'Data', 'Font');

    # first pass: mark both ends of every semaphore (fan rule and funnel rule)
    my %needs_a_midpoint = ();
    foreach my $rule ( @$all_dataflow_rules ) {
        if(my $funnel_dataflow_rule_id = $rule->funnel_dataflow_rule_id()) {
            $needs_a_midpoint{$rule->dbID()}++;
            $needs_a_midpoint{$funnel_dataflow_rule_id}++;
        }
    }

    # second pass: draw
    foreach my $rule ( @$all_dataflow_rules ) {

        my ($rule_id, $from_analysis_id, $branch_code, $funnel_dataflow_rule_id, $to) =
            ($rule->dbID(), $rule->from_analysis_id(), $rule->branch_code(), $rule->funnel_dataflow_rule_id(), $rule->to_analysis());
        my $from_node = _analysis_node_name($from_analysis_id);
        my $to_node;

            # Different treatment for analyses and tables:
        if(check_ref($to, 'Bio::EnsEMBL::Hive::Analysis')) {
            $to_node = _analysis_node_name($to->dbID());
        } elsif(check_ref($to, 'Bio::EnsEMBL::Hive::NakedTable')) {

            # the suffix must match the one computed in _allocate_to_subgraph()
            $to_node = _table_node_name($to->table_name) . '_' .
                ( $self->config_get('DuplicateTables') ? $from_analysis_id : ($subgraph_allocation->{$from_node}||''));

            $self->_add_table_node($to_node, $to->table_name);
        } elsif(check_ref($to, 'Bio::EnsEMBL::Hive::Accumulator')) {
            # an accumulator edge points at whatever the source analysis is allocated to
            # NOTE(review): this is undef for a source that has no allocation - confirm that is intended
            $to_node = $subgraph_allocation->{$from_node};

        } else {
            warn('Do not know how to handle the type '.ref($to));
            next;
        }

        if($needs_a_midpoint{$rule_id}) {
            my $midpoint_name = _midpoint_name($rule_id);

            $graph->add_node( $midpoint_name,   # midpoint itself
                color       => $dataflow_colour,
                label       => '',
                shape       => 'point',
                fixedsize   => 1,
                width       => 0.01,
                height      => 0.01,
            );
            $graph->add_edge( $from_node => $midpoint_name, # first half of the two-part arrow
                color       => $dataflow_colour,
                arrowhead   => 'none',
                fontname    => $df_edge_fontname,
                fontcolor   => $dataflow_colour,
                label       => '#'.$branch_code,
            );
            $graph->add_edge( $midpoint_name => $to_node,   # second half of the two-part arrow
                color     => $dataflow_colour,
            );
            if($funnel_dataflow_rule_id) {
                $graph->add_edge( $midpoint_name => _midpoint_name($funnel_dataflow_rule_id),   # semaphore inter-rule link
                    color     => $semablock_colour,
                    style     => 'dashed',
                    arrowhead => 'tee',
                    dir       => 'both',
                    arrowtail => 'crow',
                );
            }
        } elsif(check_ref($to, 'Bio::EnsEMBL::Hive::Accumulator')) {
                # one-part dashed arrow:
            $graph->add_edge( $from_node => $to_node,
                color       => $accu_colour,
                style       => 'dashed',
                label       => $to->struct_name().'#'.$branch_code,
                fontname    => $df_edge_fontname,
                fontcolor   => $accu_colour,
                dir         => 'both',
                arrowtail   => 'crow',
            );
        } else {
                # one-part solid arrow:
            $graph->add_edge( $from_node => $to_node,
                color       => $dataflow_colour,
                fontname    => $df_edge_fontname,
                fontcolor   => $dataflow_colour,
                label       => '#'.$branch_code,
            );
        } # /if($needs_a_midpoint{$rule_id})
    } # /foreach my $rule (@$all_dataflow_rules)

}

sub _add_table_node {
    # Adds one 'record' node for a table that is the target of a dataflow rule.
    #   $table_node - name of the graph node to create
    #   $table_name - name of the database table; shown as the node's title
    # If the 'DisplayData' option is set to a number N, the column names and up to N rows
    # of the table are shown as well, followed by a '[ more data ]' row if the table
    # holds more than N rows.
    my ($self, $table_node, $table_name) = @_;

    my $node_fontname    = $self->config_get('Node', 'Table', 'Font');
    my $data_limit       = $self->config_get('DisplayData');
    my (@column_names, $columns, $table_data, $hit_limit);

    if( $data_limit ) {
        my $adaptor = $self->dba->get_NakedTableAdaptor();
        $adaptor->table_name( $table_name );

        @column_names = sort keys %{$adaptor->column_set};
        $columns = scalar(@column_names);

        # ask for one row more than the limit to find out whether there is more than we display
        $table_data = $adaptor->fetch_all( 'LIMIT '.($data_limit+1) );

        if(scalar(@$table_data)>$data_limit) {
            pop @$table_data;
            $hit_limit = 1;
        }
    }

    # Cell values go into an HTML-like label, so markup characters have to be escaped
    # ('&' first, so that the produced entities are not escaped again); NULLs become empty cells.
    my $escape = sub {
        my ($text) = @_;
        return '' unless defined $text;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        return $text;
    };

    my $table_label = '<<table border="0" cellborder="0" cellspacing="0" cellpadding="1"><tr><td colspan="'.($columns||1).'">'.$table_name.'</td></tr>';

    if( $data_limit ) {
        $table_label .= '<tr><td colspan="'.$columns.'"> </td></tr>';
        $table_label .= '<tr>'.join('', map { qq{<td bgcolor="lightblue" border="1">$_</td>} } @column_names).'</tr>';
        foreach my $row (@$table_data) {
            $table_label .= '<tr>'.join('', map { '<td>'.$escape->($_).'</td>' } @{$row}{@column_names}).'</tr>';
        }
        if($hit_limit) {
            $table_label  .= qq{<tr><td colspan="$columns">[ more data ]</td></tr>};
        }
    }
    $table_label .= '</table>>';

    $self->graph()->add_node( $table_node,
        label => $table_label,
        shape => 'record',
        fontname => $node_fontname,
        color => $self->config_get('Node', 'Table', 'Colour'),
    );
}

1;