Commit fb990f2a authored by Leo Gordon

replaced the overloaded legacy 'analysis' table with a slimmer 'analysis_base'

parent c3a7441f
docs/hive_schema.png (binary image, no preview): updated, 253 KB → 227 KB
......@@ -65,14 +65,13 @@ use Sys::Hostname;
use Bio::EnsEMBL::Utils::Argument;
use Bio::EnsEMBL::Utils::Exception;
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor;
use Bio::EnsEMBL::Hive::DBSQL::AnalysisStatsAdaptor;
use Bio::EnsEMBL::Hive::DBSQL::DataflowRuleAdaptor;
use Bio::EnsEMBL::Hive::DBSQL::AnalysisCtrlRuleAdaptor;
use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor;
use Bio::EnsEMBL::Hive::Analysis;
use Bio::EnsEMBL::Hive::Extensions;
use Bio::EnsEMBL::Hive::Queen;
use Bio::EnsEMBL::Hive::URLFactory;
......
=pod
=head1 NAME
Bio::EnsEMBL::Hive::Analysis
=head1 SYNOPSIS
=head1 DESCRIPTION
An Analysis object represents a "stage" of the Hive pipeline that groups together
all jobs that share the same module and the same common parameters.
Individual Jobs are said to "belong" to an Analysis.
Control rules unblock when their condition Analyses are done.
=head1 CONTACT
Please contact ehive-users@ebi.ac.uk mailing list with questions/suggestions.
=cut
package Bio::EnsEMBL::Hive::Analysis;
use strict;
use Bio::EnsEMBL::Utils::Argument; # import 'rearrange()'
use base ( 'Bio::EnsEMBL::Storable', # inherit dbID(), adaptor() and new() methods
);
sub new {
my $class = shift @_;
my $self = $class->SUPER::new( @_ ); # deal with Storable stuff
my ($logic_name, $module, $parameters) =
rearrange([qw(logic_name module parameters) ], @_);
$self->logic_name($logic_name) if($logic_name);
$self->module($module) if($module);
$self->parameters($parameters) if($parameters);
return $self;
}
sub logic_name {
my $self = shift @_;
if(@_) {
$self->{'_logic_name'} = shift @_;
}
return $self->{'_logic_name'};
}
sub module {
my $self = shift @_;
if(@_) {
$self->{'_module'} = shift @_;
}
return $self->{'_module'};
}
sub parameters {
my $self = shift @_;
if(@_) {
$self->{'_parameters'} = shift @_;
}
return $self->{'_parameters'};
}
=head2 process
Arg [1] : none
Example : $process = $analysis->process;
Description: construct a Process object from the $analysis->module name
Returntype : Bio::EnsEMBL::Hive::Process subclass
Exceptions : none
Caller : general
=cut
sub process {
my $self = shift;
my $process_class = $self->module
or die "Analysis '".$self->logic_name."' does not have its 'module' defined";
if($process_class!~/::/) {
$process_class = 'Bio::EnsEMBL::Hive::Runnable::'.$process_class;
}
my $file = $process_class;
$file =~ s/::/\//g;
require "${file}.pm";
my $process_object = $process_class->new(
-db => $self->adaptor->db,
-input_id => '1',
-analysis => $self,
);
return $process_object;
}
=head2 url
Arg [1] : none
Example : $url = $analysis->url;
Description: Constructs a URL string for this database connection,
following the general URL rules.
Returntype : string of format
mysql://<user>:<pass>@<host>:<port>/<dbname>/analysis?logic_name=<name>
Exceptions : none
Caller : general
=cut
sub url {
my $self = shift;
return undef unless($self->adaptor);
return $self->adaptor->db->dbc->url . '/analysis?logic_name=' . $self->logic_name;
}
=head2 stats
Arg [1] : none
Example : $stats = $analysis->stats;
Description: returns the AnalysisStats object associated with this Analysis
object. Does not cache, but pulls it from the database via the
Analysis object's adaptor->db.
Returntype : Bio::EnsEMBL::Hive::AnalysisStats object
Exceptions : none
Caller : general
=cut
sub stats {
my $self = shift;
# Not cached internally since we want it to always be in sync with the database.
# Otherwise the user application would need to be aware of the sync state and send explicit 'sync' calls.
my $stats = $self->adaptor->db->get_AnalysisStatsAdaptor->fetch_by_analysis_id($self->dbID);
return $stats;
}
sub toString {
my $self = shift @_;
return (ref($self).': '.join(', ', map { $_.'="'.$self->$_().'"' } qw(dbID logic_name module parameters) ));
}
1;
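
For illustration, the new class could be exercised roughly like this (a sketch only: $hive_dba is assumed to be an already connected Bio::EnsEMBL::Hive::DBSQL::DBAdaptor, and the logic_name/module/parameters values are placeholders; the constructor arguments, get_AnalysisAdaptor(), store() and toString() are the calls that appear in this commit):

    use Bio::EnsEMBL::Hive::Analysis;

    # build an in-memory Analysis carrying only the three slimmed-down columns
    my $analysis = Bio::EnsEMBL::Hive::Analysis->new(
        -logic_name => 'my_analysis',                               # placeholder
        -module     => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd', # placeholder
        -parameters => '{}',
    );

    # persist it; after this commit the row lands in the 'analysis_base' table
    $hive_dba->get_AnalysisAdaptor->store($analysis);

    print $analysis->toString, "\n";   # e.g. Bio::EnsEMBL::Hive::Analysis: dbID="1", logic_name="my_analysis", ...
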
......@@ -131,11 +131,11 @@ sub condition_analysis_url {
=head2 ctrled_analysis
Arg[1] : (optional) Bio::EnsEMBL::Analysis object
Arg[1] : (optional) Bio::EnsEMBL::Hive::Analysis object
Usage : $self->ctrled_analysis($anal);
Function: Get/set method for the analysis which will be BLOCKED until all
of its condition analyses are 'DONE'
Returns : Bio::EnsEMBL::Analysis
Returns : Bio::EnsEMBL::Hive::Analysis
=cut
......@@ -144,9 +144,9 @@ sub ctrled_analysis {
# setter mode
if( defined $analysis ) {
unless ($analysis->isa('Bio::EnsEMBL::Analysis')) {
unless ($analysis->isa('Bio::EnsEMBL::Hive::Analysis')) {
throw(
"ctrled_analysis arg must be a [Bio::EnsEMBL::Analysis]".
"ctrled_analysis arg must be a [Bio::EnsEMBL::Hive::Analysis]".
"not a [$analysis]");
}
$self->{'_ctrled_analysis'} = $analysis;
......@@ -158,8 +158,7 @@ sub ctrled_analysis {
and defined($self->ctrled_analysis_id)
and defined($self->adaptor))
{
$self->{'_ctrled_analysis'} =
$self->adaptor->db->get_AnalysisAdaptor->fetch_by_dbID($self->ctrled_analysis_id);
$self->{'_ctrled_analysis'} = $self->adaptor->db->get_AnalysisAdaptor->fetch_by_dbID($self->ctrled_analysis_id);
}
return $self->{'_ctrled_analysis'};
}
......@@ -167,11 +166,11 @@ sub ctrled_analysis {
=head2 condition_analysis
Arg[1] : (optional) Bio::EnsEMBL::Analysis object
Arg[1] : (optional) Bio::EnsEMBL::Hive::Analysis object
Usage : $self->condition_analysis($anal);
Function: Get/set method for the analysis which must be 'DONE' in order for
the controlled analysis to be un-BLOCKED
Returns : Bio::EnsEMBL::Analysis
Returns : Bio::EnsEMBL::Hive::Analysis
=cut
......@@ -179,9 +178,9 @@ sub condition_analysis {
my ($self,$analysis) = @_;
if( defined $analysis ) {
unless ($analysis->isa('Bio::EnsEMBL::Analysis')) {
unless ($analysis->isa('Bio::EnsEMBL::Hive::Analysis')) {
throw(
"condition_analysis arg must be a [Bio::EnsEMBL::Analysis]".
"condition_analysis arg must be a [Bio::EnsEMBL::Hive::Analysis]".
"not a [$analysis]");
}
$self->{'_condition_analysis'} = $analysis;
......
......@@ -28,9 +28,9 @@ package Bio::EnsEMBL::Hive::AnalysisStats;
use strict;
use Scalar::Util ('weaken');
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive::Worker;
#use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive::Analysis;
#use Bio::EnsEMBL::Hive::Worker;
## Minimum amount of time in msec that a worker should run before reporting
## back to the hive. This is used when setting the batch_size automatically.
......
......@@ -8,9 +8,12 @@
$analysis_adaptor = $db_adaptor->get_AnalysisAdaptor;
$analysis_adaptor = $analysis_object->adaptor;
=head1 DESCRIPTION
This module extends EnsEMBL Core's AnalysisAdaptor, adding some Hive-specific stuff.
Module to encapsulate all db access for persistent class Analysis.
There should be just one such adaptor per application and database connection.
=head1 CONTACT
......@@ -22,10 +25,25 @@
package Bio::EnsEMBL::Hive::DBSQL::AnalysisAdaptor;
use strict;
use Bio::EnsEMBL::Hive::Analysis;
use Bio::EnsEMBL::Hive::URLFactory;
use base ('Bio::EnsEMBL::DBSQL::AnalysisAdaptor');
use base ('Bio::EnsEMBL::Hive::DBSQL::ObjectAdaptor');
use Bio::EnsEMBL::Hive::URLFactory;
sub default_table_name {
return 'analysis_base';
}
sub default_insertion_method {
return 'INSERT';
}
sub object_class {
return 'Bio::EnsEMBL::Hive::Analysis';
}
=head2 fetch_by_logic_name_or_url
......
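
Fetching goes through the same ObjectAdaptor machinery as the subs above; roughly (again assuming a connected Bio::EnsEMBL::Hive::DBSQL::DBAdaptor in $hive_dba, and assuming fetch_by_logic_name_or_url() accepts a plain logic_name; the dbID and logic_name are placeholders):

    my $analysis_adaptor = $hive_dba->get_AnalysisAdaptor;   # now an ObjectAdaptor over 'analysis_base'

    my $by_id   = $analysis_adaptor->fetch_by_dbID(1);                           # placeholder dbID
    my $by_name = $analysis_adaptor->fetch_by_logic_name_or_url('my_analysis');  # placeholder logic_name

    print $by_name->toString, "\n" if $by_name;
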
......@@ -57,7 +57,7 @@ use base ('Bio::EnsEMBL::DBSQL::BaseAdaptor');
=head2 CreateNewJob
Args : -input_id => string of input_id which will be passed to run the job (or a Perl hash that will be automagically stringified)
-analysis => Bio::EnsEMBL::Analysis object from a database
-analysis => Bio::EnsEMBL::Hive::Analysis object from a database
-block => int(0,1) set blocking state of job (default = 0)
-input_job_id => (optional) job_id of job that is creating this
job. Used purely for book keeping.
......@@ -86,8 +86,8 @@ sub CreateNewJob {
throw("must define input_id") unless($input_id);
throw("must define analysis") unless($analysis);
throw("analysis must be [Bio::EnsEMBL::Analysis] not a [$analysis]")
unless($analysis->isa('Bio::EnsEMBL::Analysis'));
throw("analysis must be [Bio::EnsEMBL::Hive::Analysis] not a [$analysis]")
unless($analysis->isa('Bio::EnsEMBL::Hive::Analysis'));
throw("analysis must have adaptor connected to database")
unless($analysis->adaptor and $analysis->adaptor->db);
throw("Please specify prev_job object instead of input_job_id if available") if ($prev_job_id); # 'obsolete' message
......
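
With this change a job-creation call would look roughly like the following (a sketch: $analysis is assumed to be a stored Bio::EnsEMBL::Hive::Analysis whose adaptor is connected, exactly as the checks above require; the -input_id hash is a placeholder):

    use Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor;

    my $job_id = Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor->CreateNewJob(
        -input_id => { 'chunk_number' => 7 },   # a Perl hash gets stringified automatically
        -analysis => $analysis,                 # must now be a Bio::EnsEMBL::Hive::Analysis
    );                                          # assumed to return the dbID of the new job
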
......@@ -209,7 +209,7 @@ sub primary_key_constraint {
sub fetch_by_dbID {
my $self = shift @_; # the rest in @_ should be primary_key column values
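# the '1' added to the call below presumably tells fetch_all() to return a single object for this unique key instead of a listref (assumption, not stated in this diff)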
return $self->fetch_all( $self->primary_key_constraint( @_ ) );
return $self->fetch_all( $self->primary_key_constraint( @_ ), 1 );
}
......
......@@ -183,8 +183,8 @@ sub to_analysis_url {
Usage : $self->from_analysis($analysis);
Function: Get/set method for the condition analysis object of this rule.
Returns : Bio::EnsEMBL::Analysis
Args : Bio::EnsEMBL::Analysis
Returns : Bio::EnsEMBL::Hive::Analysis
Args : Bio::EnsEMBL::Hive::Analysis
=cut
......@@ -193,9 +193,9 @@ sub from_analysis {
# setter mode
if( defined $analysis ) {
unless ($analysis->isa('Bio::EnsEMBL::Analysis')) {
unless ($analysis->isa('Bio::EnsEMBL::Hive::Analysis')) {
throw(
"from_analysis arg must be a [Bio::EnsEMBL::Analysis]".
"from_analysis arg must be a [Bio::EnsEMBL::Hive::Analysis]".
"not a [$analysis]");
}
$self->{'_from_analysis'} = $analysis;
......@@ -218,8 +218,8 @@ sub from_analysis {
Usage : $self->to_analysis($analysis);
Function: Get/set method for the goal analysis object of this rule.
Returns : Bio::EnsEMBL::Analysis
Args : Bio::EnsEMBL::Analysis
Returns : Bio::EnsEMBL::Hive::Analysis
Args : Bio::EnsEMBL::Hive::Analysis
=cut
......
......@@ -47,10 +47,12 @@ package Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Argument; # import 'rearrange()'
use Bio::EnsEMBL::Hive::Utils 'stringify'; # import 'stringify()'
use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor;
use Bio::EnsEMBL::Hive::Analysis;
use Bio::EnsEMBL::Hive::Extensions;
use base ('Bio::EnsEMBL::Hive::DependentOptions');
......@@ -377,8 +379,8 @@ sub run {
my %seen_logic_name = ();
foreach my $aha (@{$self->pipeline_analyses}) {
my ($logic_name, $module, $parameters_hash, $program_file, $input_ids, $blocked, $batch_size, $hive_capacity, $failed_job_tolerance, $max_retry_count, $can_be_empty, $rc_id, $rc_name, $priority) =
rearrange([qw(logic_name module parameters program_file input_ids blocked batch_size hive_capacity failed_job_tolerance max_retry_count can_be_empty rc_id rc_name priority)], %$aha);
my ($logic_name, $module, $parameters_hash, $input_ids, $blocked, $batch_size, $hive_capacity, $failed_job_tolerance, $max_retry_count, $can_be_empty, $rc_id, $rc_name, $priority) =
rearrange([qw(logic_name module parameters input_ids blocked batch_size hive_capacity failed_job_tolerance max_retry_count can_be_empty rc_id rc_name priority)], %$aha);
unless($logic_name) {
die "logic_name' must be defined in every analysis";
......@@ -412,11 +414,10 @@ sub run {
$rc_id = $rc->dbID();
}
$analysis = Bio::EnsEMBL::Analysis->new(
$analysis = Bio::EnsEMBL::Hive::Analysis->new(
-logic_name => $logic_name,
-module => $module,
-parameters => stringify($parameters_hash || {}), # have to stringify it here, because Analysis code is external wrt Hive code
-program_file => $program_file,
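# note: -program_file is no longer passed, as the slimmer 'analysis_base' table (see the schema patch in this commit) has no program_file column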
);
$analysis_adaptor->store($analysis);
......
......@@ -428,7 +428,7 @@ sub go_figure_dbc {
Usage : $self->analysis;
Function: Returns the Analysis object associated with this
instance of the Process.
Returns : Bio::EnsEMBL::Analysis object
Returns : Bio::EnsEMBL::Hive::Analysis object
=cut
......@@ -436,8 +436,8 @@ sub analysis {
my ($self, $analysis) = @_;
if($analysis) {
throw("Not a Bio::EnsEMBL::Analysis object")
unless ($analysis->isa("Bio::EnsEMBL::Analysis"));
throw("Not a Bio::EnsEMBL::Hive::Analysis object")
unless ($analysis->isa("Bio::EnsEMBL::Hive::Analysis"));
$self->{'_analysis'} = $analysis;
}
return $self->{'_analysis'};
......
......@@ -695,7 +695,7 @@ sub count_running_workers {
=head2 schedule_workers
Arg[1] : Bio::EnsEMBL::Analysis object (optional)
Arg[1] : Bio::EnsEMBL::Hive::Analysis object (optional)
Example : $count = $queen->schedule_workers();
Description: Runs through the analyses in the system which are waiting
for workers to be created for them. Calculates the maximum
......@@ -842,9 +842,12 @@ sub print_running_worker_counts {
my $self = shift;
print "\n===== Stats of live Workers according to the Queen: ======\n";
my $sql = "SELECT logic_name, count(*) FROM worker, analysis ".
"WHERE worker.analysis_id=analysis.analysis_id AND worker.cause_of_death='' ".
"GROUP BY worker.analysis_id";
my $sql = qq{ SELECT logic_name, count(*)
FROM worker
JOIN analysis_base USING(analysis_id)
WHERE worker.cause_of_death=''
GROUP BY worker.analysis_id
};
my $total_workers = 0;
my $sth = $self->prepare($sql);
......@@ -882,7 +885,7 @@ sub monitor {
sum(work_done/(UNIX_TIMESTAMP()-UNIX_TIMESTAMP(born)))/count(*), }
). qq{
group_concat(DISTINCT logic_name)
FROM worker left join analysis USING (analysis_id)
FROM worker left join analysis_base USING (analysis_id)
WHERE cause_of_death = ''
};
......
......@@ -24,11 +24,11 @@ See inline
=head1 AUTHOR
$Author: mm14 $
$Author: lg4 $
=head1 VERSION
$Revision: 1.21 $
$Revision: 1.22 $
=cut
......@@ -346,7 +346,7 @@ sub _dataflow_rules {
my ($from_node, $to_id, $to_node) = ( _analysis_node_name($from_analysis_id) );
# Different treatment for analyses and tables:
if(check_ref($to, 'Bio::EnsEMBL::Analysis')) {
if(check_ref($to, 'Bio::EnsEMBL::Hive::Analysis')) {
$to_id = $to->dbID();
$to_node = _analysis_node_name($to_id);
} elsif(check_ref($to, 'Bio::EnsEMBL::Hive::NakedTable')) {
......
......@@ -65,7 +65,7 @@ package Bio::EnsEMBL::Hive::Worker;
use strict;
use POSIX;
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::Hive::Analysis;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Utils::Argument;
use Bio::EnsEMBL::Utils::Exception;
......@@ -143,13 +143,13 @@ sub execute_writes {
=head2 analysis
Arg [1] : (optional) Bio::EnsEMBL::Analysis $value
Arg [1] : (optional) Bio::EnsEMBL::Hive::Analysis $value
Title : analysis
Usage : $value = $self->analysis;
$self->analysis($analysis);
Description: Get/Set analysis object of this Worker
DefaultValue : undef
Returntype : Bio::EnsEMBL::Analysis object
Returntype : Bio::EnsEMBL::Hive::Analysis object
=cut
......@@ -158,8 +158,8 @@ sub analysis {
my $analysis = shift;
if(defined($analysis)) {
throw("analysis arg must be a [Bio::EnsEMBL::Analysis] not a [$analysis]")
unless($analysis->isa('Bio::EnsEMBL::Analysis'));
throw("analysis arg must be a [Bio::EnsEMBL::Hive::Analysis] not a [$analysis]")
unless($analysis->isa('Bio::EnsEMBL::Hive::Analysis'));
$self->{'_analysis'} = $analysis;
}
......
......@@ -171,13 +171,10 @@ sub create_analysis {
# No existing analysis with this logic_name. Create a new one.
print("creating analysis '$logic_name' to be computed using module '$module' with parameters '$parameters'\n");
$self->{_analysis} = Bio::EnsEMBL::Analysis->new (
-db => '',
-db_file => '',
-db_version => '1',
-parameters => $parameters,
$self->{_analysis} = Bio::EnsEMBL::Hive::Analysis->new (
-logic_name => $logic_name,
-module => $module,
-parameters => $parameters,
);
$DBA->get_AnalysisAdaptor()->store($self->{_analysis});
......
......@@ -58,7 +58,7 @@ sub main {
count(*) workers,
min(mem), avg(mem), max(mem),
min(swap), avg(swap), max(swap)
FROM analysis
FROM analysis_base
JOIN analysis_stats USING(analysis_id)
JOIN resource_class rc USING(resource_class_id)
LEFT JOIN worker USING(analysis_id)
......
# introducing the FOREIGN KEY constraints as a separate file (so that they could be optionally switched on or off):
ALTER TABLE analysis_description ADD FOREIGN KEY (analysis_id) REFERENCES analysis(analysis_id);
ALTER TABLE worker ADD FOREIGN KEY (analysis_id) REFERENCES analysis(analysis_id);
ALTER TABLE dataflow_rule ADD FOREIGN KEY (from_analysis_id) REFERENCES analysis(analysis_id);
ALTER TABLE analysis_ctrl_rule ADD FOREIGN KEY (ctrled_analysis_id) REFERENCES analysis(analysis_id);
ALTER TABLE job ADD FOREIGN KEY (analysis_id) REFERENCES analysis(analysis_id);
ALTER TABLE analysis_stats ADD FOREIGN KEY (analysis_id) REFERENCES analysis(analysis_id);
ALTER TABLE analysis_stats_monitor ADD FOREIGN KEY (analysis_id) REFERENCES analysis(analysis_id);
ALTER TABLE worker ADD FOREIGN KEY (analysis_id) REFERENCES analysis_base(analysis_id);
ALTER TABLE dataflow_rule ADD FOREIGN KEY (from_analysis_id) REFERENCES analysis_base(analysis_id);
ALTER TABLE analysis_ctrl_rule ADD FOREIGN KEY (ctrled_analysis_id) REFERENCES analysis_base(analysis_id);
ALTER TABLE job ADD FOREIGN KEY (analysis_id) REFERENCES analysis_base(analysis_id);
ALTER TABLE analysis_stats ADD FOREIGN KEY (analysis_id) REFERENCES analysis_base(analysis_id);
ALTER TABLE analysis_stats_monitor ADD FOREIGN KEY (analysis_id) REFERENCES analysis_base(analysis_id);
ALTER TABLE job ADD FOREIGN KEY (worker_id) REFERENCES worker(worker_id);
ALTER TABLE job_message ADD FOREIGN KEY (worker_id) REFERENCES worker(worker_id);
......
# Substitute the legacy 'analysis' table with the slimmer version 'analysis_base',
# but do not bother deleting the original table in the patch
# because of the many already established foreign key relationships.
CREATE TABLE analysis_base (
analysis_id int(10) unsigned NOT NULL AUTO_INCREMENT,
logic_name VARCHAR(40) NOT NULL,
module VARCHAR(255),
parameters TEXT,
PRIMARY KEY (analysis_id),
UNIQUE KEY logic_name_idx (logic_name)
) COLLATE=latin1_swedish_ci ENGINE=InnoDB;
INSERT INTO analysis_base (analysis_id, logic_name, module, parameters) SELECT analysis_id, logic_name, module, parameters FROM analysis;
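
After applying the patch, a quick consistency check could look like this (a sketch using plain DBI; the connection settings are placeholders):

    use DBI;

    my $dbh = DBI->connect('DBI:mysql:database=my_hive_db;host=localhost', 'user', 'pass')
        or die $DBI::errstr;

    my ($n_old) = $dbh->selectrow_array('SELECT count(*) FROM analysis');
    my ($n_new) = $dbh->selectrow_array('SELECT count(*) FROM analysis_base');

    die "analysis_base is missing rows ($n_new of $n_old copied)" unless $n_new == $n_old;
    print "all $n_new analyses copied into analysis_base\n";
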
......@@ -19,7 +19,7 @@ CREATE OR REPLACE VIEW progress AS
j.retry_count,
CASE WHEN j.status IS NULL THEN 0 ELSE count(*) END cnt,
job_id example_job_id
FROM analysis a LEFT JOIN job j USING (analysis_id)
FROM analysis_base a LEFT JOIN job j USING (analysis_id)
GROUP BY a.analysis_id, j.status, j.retry_count
ORDER BY a.analysis_id, j.status;
......@@ -35,7 +35,7 @@ CREATE OR REPLACE VIEW msg AS
SELECT a.analysis_id, a.logic_name, m.*
FROM job_message m
JOIN job j USING (job_id)
JOIN analysis a USING (analysis_id);
JOIN analysis_base a USING (analysis_id);
#### time an analysis or group of analyses (given by a name pattern) ######################################
......@@ -53,7 +53,7 @@ READS SQL DATA
(UNIX_TIMESTAMP(max(last_check_in))-UNIX_TIMESTAMP(min(born)))/60 AS measured_in_minutes,
(UNIX_TIMESTAMP(max(last_check_in))-UNIX_TIMESTAMP(min(born)))/3600 AS measured_in_hours,
(UNIX_TIMESTAMP(max(last_check_in))-UNIX_TIMESTAMP(min(born)))/3600/24 AS measured_in_days
FROM worker JOIN analysis USING (analysis_id)
FROM worker JOIN analysis_base USING (analysis_id)
WHERE logic_name like param_logic_name_pattern;