Commit dbaad5bf authored by Leo Gordon's avatar Leo Gordon
Browse files

Meadow branch merged into trunk

parent 004f42af
......@@ -16,6 +16,23 @@ Summary:
Bio::EnsEMBL::Analysis::RunnableDB perl wrapper objects as nodes/blocks in
the graphs but could be adapted more generally.
13 July, 2009 : Leo Gordon
Merging the "Meadow" code from this March's development branch.
Because it separates LSF-specific code from the higher-level logic, it will be easier to update.
-------------------------------------------------------------------------------------------------------
Albert, sorry - in the process of merging into the development branch I had to remove your HIGHMEM code.
I hope it is a temporary measure and we will be having hive-wide queue control soon.
If not - you can restore the pre-merger state by updating with the following command:
cvs update -r lg4_pre_merger_20090713
('maximise_concurrency' option was carried over)
-------------------------------------------------------------------------------------------------------
3 April, 2009 : Albert Vilella
Added a new maximise_concurrency 1/0 option. When set to 1, it will
......
......@@ -482,12 +482,11 @@ sub claim_jobs_for_worker {
my $uuid = $ug->create();
my $claim = $ug->to_string( $uuid );
#print("claiming jobs for hive_id=", $worker->hive_id, " with uuid $claim\n");
my $status = 'READY';
$status = 'HIGHMEM' if (defined($worker->{HIGHMEM}));
my $sql_base = "UPDATE analysis_job SET job_claim='$claim'".
" , hive_id='". $worker->hive_id ."'".
" , status='CLAIMED'".
" WHERE job_claim='' and status='" . $status . "'".
" WHERE job_claim='' and status='READY'".
" AND analysis_id='" .$worker->analysis->dbID. "'";
my $sql_virgin = $sql_base .
......@@ -608,20 +607,6 @@ sub reset_dead_job_by_dbID {
#print(" done update BROKEN jobs\n");
}
sub reset_highmem_job_by_dbID {
    my ($self, $job_id) = @_;

    # Clear the claim and return the job to the HIGHMEM state so that a
    # high-memory worker can pick it up again.  The retry_count is left
    # untouched here because the job was merely claimed, not attempted.
    # (The hive_id index on analysis_job keeps this update fast.)
    my $sql = "UPDATE analysis_job SET job_claim='', status='HIGHMEM'".
              " WHERE analysis_job_id=$job_id";
    $self->dbc->do($sql);
}
=head2 reset_job_by_dbID
......
......@@ -93,20 +93,21 @@ sub fetch_by_needed_workers {
my $maximise_concurrency = shift;
my $constraint = "ast.num_required_workers>0 AND ast.status in ('READY','WORKING')";
my $first_order_by;
my $order_by;
if ($maximise_concurrency) {
$first_order_by = 'ORDER BY num_running_workers';
# print STDERR "###> Maximising concurrency\n";
$order_by = 'ORDER BY num_running_workers';
} else {
$first_order_by = 'ORDER BY num_required_workers DESC';
$order_by = 'ORDER BY num_required_workers DESC';
}
$order_by .= ', hive_capacity DESC, analysis_id';
if($limit) {
$self->_final_clause("$first_order_by, hive_capacity DESC, analysis_id LIMIT $limit");
} else {
$self->_final_clause("$first_order_by, hive_capacity DESC, analysis_id");
$order_by .= " LIMIT $limit";
}
$self->_final_clause($order_by);
my $results = $self->_generic_fetch($constraint);
$self->_final_clause(""); #reset final clause for other fetches
$self->_final_clause(''); #reset final clause for other fetches
return $results;
}
......@@ -218,7 +219,7 @@ sub update {
$sql .= ",num_running_workers=" . $stats->num_running_workers();
$sql .= ",num_required_workers=" . $stats->num_required_workers();
$sql .= ",last_update=NOW()";
$sql .= ",sync_lock=''";
$sql .= ",sync_lock='0'";
$sql .= " WHERE analysis_id='".$stats->analysis_id."' ";
my $sth = $self->prepare($sql);
......
......@@ -108,7 +108,6 @@ sub create_new_worker {
$analysis_id = $job->analysis_id if(defined($job));
my $analysisStats;
if($analysis_id) {
$analysisStats = $analStatsDBA->fetch_by_analysis_id($analysis_id);
$self->safe_synchronize_AnalysisStats($analysisStats);
......@@ -167,6 +166,7 @@ sub register_worker_death {
my ($self, $worker) = @_;
return unless($worker);
# if called without a defined cause_of_death, assume catastrophic failure
$worker->cause_of_death('FATALITY') unless(defined($worker->cause_of_death));
unless ($worker->cause_of_death() eq "HIVE_OVERLOAD") {
......@@ -352,6 +352,7 @@ sub fetch_failed_workers {
=head2 synchronize_hive
Arg [1] : $this_analysis (optional)
Example : $queen->synchronize_hive();
Description: Runs through all analyses in the system and synchronizes
the analysis_stats summary with the states in the analysis_job
......@@ -363,19 +364,22 @@ sub fetch_failed_workers {
=cut
sub synchronize_hive {
my $self = shift;
my $self = shift;
my $this_analysis = shift; # optional parameter
my $start_time = time();
my $allAnalysis = $self->db->get_AnalysisAdaptor->fetch_all;
print("analyze ", scalar(@$allAnalysis), "\n");
foreach my $analysis (@$allAnalysis) {
my $stats = $analysis->stats;
$self->synchronize_AnalysisStats($stats);
my $list_of_analyses = $this_analysis ? [$this_analysis] : $self->db->get_AnalysisAdaptor->fetch_all;
print "Synchronizing the hive (".scalar(@$list_of_analyses)." analyses this time) \n";
foreach my $analysis (@$list_of_analyses) {
$self->synchronize_AnalysisStats($analysis->stats);
}
foreach my $analysis (@$allAnalysis) {
foreach my $analysis (@$list_of_analyses) {
$self->check_blocking_control_rules_for_AnalysisStats($analysis->stats);
}
print((time() - $start_time), " secs to synchronize_hive\n");
}
......@@ -436,13 +440,6 @@ sub synchronize_AnalysisStats {
my $self = shift;
my $analysisStats = shift;
# Trying to make hive not synchronize if there is a high load in the
# server, e.g. during blasts (max 450 workers). The best thing I
# could find is the combination of these two numbers
if (($self->get_hive_current_load("silent") > 0.9) && $self->get_num_running_workers("silent") > 400) {
return $analysisStats;
}
return $analysisStats unless($analysisStats);
return $analysisStats unless($analysisStats->analysis_id);
......@@ -453,12 +450,8 @@ sub synchronize_AnalysisStats {
$analysisStats->failed_job_count(0);
$analysisStats->num_required_workers(0);
# my $sql = "SELECT status, count(*) FROM analysis_job ".
# "WHERE analysis_id=? GROUP BY status";
# This should be better in terms of performance
# http://www.mysqlperformanceblog.com/2007/08/16/how-much-overhead-is-caused-by-on-disk-temporary-tables/
my $sql = "SELECT status, count(status) FROM analysis_job ".
"WHERE analysis_id=? GROUP BY status ORDER BY NULL LIMIT 10";
my $sql = "SELECT status, count(*) FROM analysis_job ".
"WHERE analysis_id=? GROUP BY status";
my $sth = $self->prepare($sql);
$sth->execute($analysisStats->analysis_id);
......@@ -588,8 +581,7 @@ sub get_num_failed_analyses
sub get_hive_current_load {
my $self = shift;
my $silent = shift;
my $sql = "SELECT /*! SQL_BUFFER_RESULT */ sum(1/analysis_stats.hive_capacity) FROM hive, analysis_stats ".
my $sql = "SELECT sum(1/analysis_stats.hive_capacity) FROM hive, analysis_stats ".
"WHERE hive.analysis_id=analysis_stats.analysis_id and cause_of_death ='' ".
"AND analysis_stats.hive_capacity>0";
my $sth = $self->prepare($sql);
......@@ -597,22 +589,20 @@ sub get_hive_current_load {
(my $load)=$sth->fetchrow_array();
$sth->finish;
$load=0 unless($load);
print("current hive load = $load\n") unless (defined($silent));
print("*") if ($silent eq 'silent');
print("current hive load = $load\n");
return $load;
}
sub get_num_running_workers {
my $self = shift;
my $silent = shift;
my $sql = "SELECT count(*) FROM hive WHERE cause_of_death =''";
my $sth = $self->prepare($sql);
$sth->execute();
(my $runningCount)=$sth->fetchrow_array();
$sth->finish;
$runningCount=0 unless($runningCount);
print("current hive num_running_workers = $runningCount\n") unless (defined($silent));
print("current hive num_running_workers = $runningCount\n");
return $runningCount;
}
......@@ -688,11 +678,36 @@ sub get_num_needed_workers {
return $numWorkers;
}
sub get_needed_workers_failed_analyses_resync_if_necessary {
    my ($self, $this_analysis) = @_;

    # Sample the current hive activity.  NOTE: the order of these calls is
    # kept as-is because each helper also prints its own status line.
    my $num_running  = $self->get_num_running_workers();
    my $current_load = $self->get_hive_current_load();
    my $needed       = $self->get_num_needed_workers($this_analysis);
    my $failed       = $self->get_num_failed_analyses($this_analysis);

    # An idle hive with no pending work may simply be out of sync:
    # force a full resynchronization and then re-sample the counters.
    if($current_load==0 and $needed==0 and $num_running==0) {
        print "*** nothing is running and nothing to do => perform a hard resync\n" ;

        $self->synchronize_hive($this_analysis);

        $needed = $self->get_num_needed_workers($this_analysis);
        $failed = $self->get_num_failed_analyses($this_analysis);

        if($needed==0 and $failed==0) {
            print "Nothing left to do".($this_analysis ? (' for analysis '.$this_analysis->logic_name) : '').". DONE!!\n\n";
        }
    }

    return ($needed, $failed);
}
sub get_hive_progress
{
my $self = shift;
my $sql = "SELECT /*! SQL_BUFFER_RESULT */ sum(done_job_count), sum(failed_job_count), sum(total_job_count), ".
my $sql = "SELECT sum(done_job_count), sum(failed_job_count), sum(total_job_count), ".
"sum(unclaimed_job_count * analysis_stats.avg_msec_per_job)/1000/60/60 ".
"FROM analysis_stats";
my $sth = $self->prepare($sql);
......@@ -707,26 +722,24 @@ sub get_hive_progress
my $remaining = $total - $done - $failed;
printf("hive %1.3f%% complete (< %1.3f CPU_hrs) (%d todo + %d done + %d failed = %d total)\n",
$completed, $cpuhrs, $remaining, $done, $failed, $total);
return $done, $total, $cpuhrs;
return $remaining;
}
sub print_hive_status
{
my $self = shift;
$self->print_analysis_status;
$self->print_running_worker_status;
sub print_hive_status {
my ($self, $this_analysis) = @_;
$self->print_analysis_status($this_analysis);
$self->print_running_worker_status;
}
sub print_analysis_status
{
my $self = shift;
sub print_analysis_status {
my ($self, $this_analysis) = @_;
my $allStats = $self->db->get_AnalysisStatsAdaptor->fetch_all();
foreach my $analysis_stats (@{$allStats}) {
$analysis_stats->print_stats($self->{'verbose_stats'});
}
my $list_of_analyses = $this_analysis ? [$this_analysis] : $self->db->get_AnalysisAdaptor->fetch_all;
foreach my $analysis (sort {$a->dbID <=> $b->dbID} @$list_of_analyses) {
$analysis->stats->print_stats($self->{'verbose_stats'});
}
}
......@@ -779,6 +792,24 @@ sub monitor
$sth->execute();
}
=head2 register_all_workers_dead
Example : $queen->register_all_workers_dead();
Description: Registers all workers as dead
Exceptions : none
Caller : beekeepers and other external processes
=cut
sub register_all_workers_dead {
    my ($self) = @_;

    # Fetching "overdue" workers with a zero-minute threshold yields every
    # live worker; each one is then registered as dead.
    for my $worker ( @{ $self->fetch_overdue_workers(0) } ) {
        $self->register_worker_death($worker);
    }
}
#
# INTERNAL METHODS
......
......@@ -4,199 +4,229 @@ use warnings;
use strict;
use DBI;
use Getopt::Long;
use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Hive::Worker;
use Bio::EnsEMBL::Hive::Queen;
use Bio::EnsEMBL::Hive::URLFactory;
use Sys::Hostname;
use Bio::EnsEMBL::Hive::DBSQL::AnalysisCtrlRuleAdaptor;
Bio::EnsEMBL::Registry->no_version_check(1);
# ok this is a hack, but I'm going to pretend I've got an object here
# by creating a blessed hash ref and passing it around like an object
# this is to avoid using global variables in functions, and to consolidate
# the globals into a nice '$self' package
my $self = bless {};
$self->{'db_conf'} = {};
$self->{'db_conf'}->{'-user'} = 'ensro';
$self->{'db_conf'}->{'-port'} = 3306;
$self->{'max_loops'} = 0; #unlimited
$self->{'beekeeper_type'} = 'LSF';
$self->{'local_cpus'} = 2;
$| = 1;
my $conf_file;
my ($help, $host, $user, $pass, $dbname, $port, $adaptor, $url);
my ($job_limit, $lifespan, $batch_size);
my $maximise_concurrency = 0;
my $loopit=0;
my $worker_limit = 50;
my $sleep_time = 2;
my $sync=0;
my $highmem=undef;
my $local=undef;
$self->{'overdue_limit'} = 60; #minutes
$self->{'no_analysis_stats'} = undef;
$self->{'show_worker_stats'} = undef;
$self->{'verbose_stats'} = 1;
$self->{'lsf_options'} = "";
$self->{'monitor'} = undef;
my $regfile = undef;
my $reg_alias = 'hive';
GetOptions('help' => \$help,
'url=s' => \$url,
'conf=s' => \$conf_file,
'dbhost=s' => \$host,
'dbport=i' => \$port,
'dbuser=s' => \$user,
'dbpass=s' => \$pass,
'dbname=s' => \$dbname,
'local' => \$local,
'lsf' => \$self->{'lsf_mode'},
'dead' => \$self->{'check_for_dead'},
'killworker=i' => \$self->{'kill_worker_id'},
'overdue' => \$self->{'overdue_limit'},
'alldead' => \$self->{'all_dead'},
'run' => \$self->{'run'},
'run_job_id=i' => \$self->{'run_job_id'},
'lifespan=i' => \$lifespan,
'maximise_concurrency=i' => \$maximise_concurrency,
'highmem' => \$highmem,
'jlimit=i' => \$job_limit,
'wlimit=i' => \$worker_limit,
'batch_size=i' => \$batch_size,
'loop' => \$loopit,
'no_pend' => \$self->{'no_pend_adjust'},
'sync' => \$sync,
'no_analysis_stats' => \$self->{'no_analysis_stats'},
'verbose_stats=i' => \$self->{'verbose_stats'},
'worker_stats=i' => \$self->{'show_worker_stats'},
'sleep=f' => \$sleep_time,
'logic_name=s' => \$self->{'logic_name'},
'failed_jobs' => \$self->{'show_failed_jobs'},
'reset_job_id=i' => \$self->{'reset_job_id'},
'reset_all|reset_all_jobs_for_analysis=s' => \$self->{'reset_all_jobs_for_analysis'},
'delete|remove=s' => \$self->{'remove_analysis_id'}, # careful
'lsf_options=s' => \$self->{'lsf_options'},
'job_output=i' => \$self->{'show_job_output'},
'regfile=s' => \$regfile,
'regname=s' => \$reg_alias,
'monitor!' => \$self->{'monitor'},
);
if ($help) { usage(); }
if($local) {
$self->{'beekeeper_type'} ='LOCAL';
}
parse_conf($self, $conf_file);
use Bio::EnsEMBL::Hive::Meadow::LSF;
use Bio::EnsEMBL::Hive::Meadow::LOCAL;
main();
sub main {
$| = 1;
Bio::EnsEMBL::Registry->no_version_check(1);
# ok this is a hack, but I'm going to pretend I've got an object here
# by creating a hash ref and passing it around like an object
# this is to avoid using global variables in functions, and to consolidate
# the globals into a nice '$self' package
my $self = {};
$self->{'db_conf'} = {
-host => '',
-port => 3306,
-user => 'ensro',
-pass => '',
-dbname => '',
};
my ($help, $conf_file);
my $loopit = 0;
my $sync = 0;
my $local = 0;
my $show_failed_jobs = 0;
my $no_pend_adjust = 0;
my $worker_limit = 50;
my $local_cpus = 2;
my $lsf_options = '';
my $max_loops = 0; # not running by default
my $run = 0;
my $check_for_dead = 0;
my $all_dead = 0;
my $remove_analysis_id = 0;
my $job_id_for_output = 0;
my $show_worker_stats = 0;
my $kill_worker_id = 0;
my $reset_job_id = 0;
my $reset_all_jobs_for_analysis = 0;
$self->{'sleep_minutes'} = 2;
$self->{'overdue_minutes'} = 60;
$self->{'verbose_stats'} = 1;
$self->{'monitor'} = undef;
$self->{'reg_file'} = undef;
$self->{'reg_name'} = 'hive';
$self->{'maximise_concurrency'} = 0;
GetOptions('help' => \$help,
# connection parameters
'conf=s' => \$conf_file,
'regfile=s' => \$self->{'reg_file'},
'regname=s' => \$self->{'reg_name'},
'url=s' => \$self->{'url'},
'dbhost=s' => \$self->{'db_conf'}->{'-host'},
'dbport=i' => \$self->{'db_conf'}->{'-port'},
'dbuser=s' => \$self->{'db_conf'}->{'-user'},
'dbpass=s' => \$self->{'db_conf'}->{'-pass'},
'dbname=s' => \$self->{'db_conf'}->{'-dbname'},
# loop control
'loop' => \$loopit,
'max_loops=i' => \$max_loops,
'run' => \$run,
'run_job_id=i' => \$self->{'run_job_id'},
'sleep=f' => \$self->{'sleep_minutes'},
# meadow control
'local!' => \$local,
'local_cpus=i' => \$local_cpus,
'wlimit=i' => \$worker_limit,
'no_pend' => \$no_pend_adjust,
'lsf_options=s' => \$lsf_options,
# worker control
'jlimit=i' => \$self->{'job_limit'},
'batch_size=i' => \$self->{'batch_size'},
'logic_name=s' => \$self->{'logic_name'},
'maximise_concurrency' => \$self->{'maximise_concurrency'},
# other commands/options
'sync' => \$sync,
'dead' => \$check_for_dead,
'killworker=i' => \$kill_worker_id,
'overdue' => \$self->{'overdue_minutes'},
'alldead' => \$all_dead,
'no_analysis_stats' => \$self->{'no_analysis_stats'},
'verbose_stats=i' => \$self->{'verbose_stats'},
'worker_stats' => \$show_worker_stats,
'failed_jobs' => \$show_failed_jobs,
'reset_job_id=i' => \$reset_job_id,
'reset_all|reset_all_jobs_for_analysis=s' => \$reset_all_jobs_for_analysis,
'delete|remove=s' => \$remove_analysis_id, # careful
'job_output=i' => \$job_id_for_output,
'monitor!' => \$self->{'monitor'},
);
if ($help) { usage(); }
if($local) {
$self->{'meadow'} = Bio::EnsEMBL::Hive::Meadow::LOCAL->new();
$self->{'meadow'} -> total_running_workers_limit($local_cpus);
} else {
$self->{'meadow'} = Bio::EnsEMBL::Hive::Meadow::LSF->new();
$self->{'meadow'} -> lsf_options($lsf_options);
}
$self->{'meadow'} -> pending_adjust(not $no_pend_adjust);
$self->{'meadow'} -> submitted_workers_limit($worker_limit);
if($self->{'run'} or $self->{'run_job_id'}) {
$loopit = 1;
$self->{'max_loops'} = 1;
} elsif ($loopit) {
$self->{'monitor'} = 1 if (!defined($self->{'monitor'}));
}
parse_conf($self, $conf_file);
my $DBA;
if($regfile) {
Bio::EnsEMBL::Registry->load_all($regfile);
$DBA = Bio::EnsEMBL::Registry->get_DBAdaptor($reg_alias, 'hive');
}
elsif($url) {
$DBA = Bio::EnsEMBL::Hive::URLFactory->fetch($url);
die("Unable to connect to $url\n") unless($DBA);
} else {
if($host) { $self->{'db_conf'}->{'-host'} = $host; }
if($port) { $self->{'db_conf'}->{'-port'} = $port; }
if($dbname) { $self->{'db_conf'}->{'-dbname'} = $dbname; }
if($user) { $self->{'db_conf'}->{'-user'} = $user; }
if($pass) { $self->{'db_conf'}->{'-pass'} = $pass; }
unless(defined($self->{'db_conf'}->{'-host'})
and defined($self->{'db_conf'}->{'-user'})
and defined($self->{'db_conf'}->{'-dbname'}))
{
print "\nERROR : must specify host, user, and database to connect\n\n";
usage();
}
if($run or $self->{'run_job_id'}) {
$max_loops = 1;
} elsif ($loopit) {
unless($max_loops) {
$max_loops = -1; # unlimited
}
unless(defined($self->{'monitor'})) {
$self->{'monitor'} = 1;
}
}
# connect to database specified
$DBA = new Bio::EnsEMBL::Hive::DBSQL::DBAdaptor(%{$self->{'db_conf'}});
$url = $DBA->dbc->url;
}
$self->{'dba'} = $DBA;
my $queen = $DBA->get_Queen;
$queen->{maximise_concurrency} = 1 if ($maximise_concurrency);
$queen->{HIGHMEM} = 1 if ($highmem);
$self->{name} = $DBA->get_MetaContainer->list_value_by_key("name")->[0];
if($self->{'reg_file'}) {
Bio::EnsEMBL::Registry->load_all($self->{'reg_file'});
$self->{'dba'} = Bio::EnsEMBL::Registry->get_DBAdaptor($self->{'reg_name'}, 'hive');
} elsif($self->{'url'}) {
$self->{'dba'} = Bio::EnsEMBL::Hive::URLFactory->fetch($self->{'url'}) || die("Unable to connect to $self->{'url'}\n");
} elsif ( $self->{'db_conf'}->{'-host'}
and $self->{'db_conf'}->{'-user'}
and $self->{'db_conf'}->{'-dbname'}) { # connect to database specified
$self->{'dba'} = new Bio::EnsEMBL::Hive::DBSQL::DBAdaptor(%{$self->{'db_conf'}});
$self->{'url'} = $self->{'dba'}->dbc->url;
} else {
print "\nERROR : Connection parameters (regfile+regname, url or dbhost+dbuser+dbname) need to be specified\n\n";
usage();
}
if($self->{'reset_job_id'}) { $queen->reset_and_fetch_job_by_dbID($self->{'reset_job_id'}); };
if($self->{'show_job_output'}) { print_job_output($self); }
my $queen = $self->{'dba'}->get_Queen;
$queen->{'maximise_concurrency'} = 1 if ($self->{'maximise_concurrency'});
$queen->{'verbose_stats'} = $self->{'verbose_stats'};
if($self->{'reset_all_jobs_for_analysis'}) {
reset_all_jobs_for_analysis($self, $self->{'reset_all_jobs_for_analysis'})
}