From cc6a319ae402bfc7a9f64e3d3e5400f759cb7204 Mon Sep 17 00:00:00 2001
From: Leo Gordon <lg4@ebi.ac.uk>
Date: Mon, 9 May 2011 15:34:07 +0000
Subject: [PATCH] EXPERIMENTAL sqlite support in eHive

---
 .../Bio/EnsEMBL/Hive/DBSQL/AnalysisAdaptor.pm |   2 +-
 .../EnsEMBL/Hive/DBSQL/AnalysisDataAdaptor.pm |   2 +-
 .../EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm  |  53 +--
 .../Hive/DBSQL/AnalysisStatsAdaptor.pm        |  15 +-
 modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm |  29 +-
 modules/Bio/EnsEMBL/Hive/Extensions.pm        |  28 +-
 modules/Bio/EnsEMBL/Hive/NakedTable.pm        |   2 +-
 .../Hive/PipeConfig/HiveGeneric_conf.pm       |  71 ++-
 .../EnsEMBL/Hive/PipeConfig/LongMult_conf.pm  |  14 +-
 modules/Bio/EnsEMBL/Hive/Queen.pm             |  20 +-
 .../Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm |   4 +-
 modules/Bio/EnsEMBL/Hive/RunnableDB/SqlCmd.pm |   5 +-
 modules/Bio/EnsEMBL/Hive/URLFactory.pm        |  32 +-
 sql/tables.sqlite                             | 408 ++++++++++++++++++
 14 files changed, 594 insertions(+), 91 deletions(-)
 create mode 100644 sql/tables.sqlite

diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisAdaptor.pm
index 97057685b..33d5617b6 100644
--- a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisAdaptor.pm
+++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisAdaptor.pm
@@ -36,7 +36,7 @@ sub fetch_by_logic_name_or_url {
     my $self              = shift @_;
     my $logic_name_or_url = shift @_;
 
-    if($logic_name_or_url =~ m{^\w+://}) {
+    if($logic_name_or_url =~ m{^\w*://}) {
         return Bio::EnsEMBL::Hive::URLFactory->fetch($logic_name_or_url, $self->db);
     } else {
         return $self->fetch_by_logic_name($logic_name_or_url);
diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisDataAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisDataAdaptor.pm
index 3a8f52605..f206ee486 100644
--- a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisDataAdaptor.pm
+++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisDataAdaptor.pm
@@ -71,7 +71,7 @@ sub store {
     my $sth2 = $self->prepare("INSERT INTO analysis_data (data) VALUES (?)");
     $sth2->execute($data);
-    $data_id = $sth2->{'mysql_insertid'};
+    $data_id = ($self->dbc->driver eq 'sqlite') ? $self->dbc->db_handle->func('last_insert_rowid') : $sth2->{'mysql_insertid'};
     $sth2->finish;
 
     return $data_id;
diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm
index c6c5322c7..fc92e91c3 100644
--- a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm
+++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisJobAdaptor.pm
@@ -99,16 +99,20 @@ sub CreateNewJob {
       $input_id = "_ext_input_analysis_data_id $input_data_id";
   }
 
-  my $sql = q{INSERT ignore into job
+  my $dbc = $analysis->adaptor->db->dbc;
+  my $insertion_method = ($dbc->driver eq 'sqlite') ? 'INSERT OR IGNORE' : 'INSERT IGNORE';
+  $insertion_method = 'INSERT';   # use plain INSERT for now: the insert is expected to succeed, and a collision should raise an error rather than be silently ignored
+
+  my $status = $blocked ? 'BLOCKED' : 'READY';
+
+  my $sql = qq{$insertion_method INTO job
               (input_id, prev_job_id,analysis_id,status,semaphore_count,semaphored_job_id)
               VALUES (?,?,?,?,?,?)};
 
-  my $status = $blocked ? 'BLOCKED' : 'READY';
-
-  my $dbc = $analysis->adaptor->db->dbc;
   my $sth = $dbc->prepare($sql);
-  $sth->execute($input_id, $prev_job_id, $analysis->dbID, $status, $semaphore_count, $semaphored_job_id);
-  my $job_id = $sth->{'mysql_insertid'};
+
+  $sth->execute($input_id, $prev_job_id, $analysis->dbID, $status, $semaphore_count || 0, $semaphored_job_id);
+  my $job_id = ($dbc->driver eq 'sqlite') ? $dbc->db_handle->func('last_insert_rowid') : $sth->{'mysql_insertid'};
   $sth->finish;
 
   $dbc->do("UPDATE analysis_stats SET ".
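Two SQLite accommodations introduced above recur throughout the rest of the patch: the driver-dependent last-insert-id lookup (DBD::mysql exposes it as the statement-handle attribute `mysql_insertid`, while DBD::SQLite exposes a `last_insert_rowid` function), and the rewriting of MySQL-only SQL — `INSERT IGNORE` becomes `INSERT OR IGNORE`, and `UPDATE ... LIMIT`, which SQLite lacks, is emulated in grab_jobs_for_worker() below with a `job_id IN (SELECT ... LIMIT n)` subquery. Here is a minimal sketch of the first pattern factored into one place; the helper name is hypothetical (the patch inlines the equivalent ternary at each call site), and DBI's portable `$dbh->last_insert_id()` would be an alternative worth considering:

```perl
# Hypothetical helper (not part of the patch) showing the driver dispatch that
# AnalysisDataAdaptor, AnalysisJobAdaptor, BaseAdaptor, Queen and SqlCmd inline:
sub _last_insert_id {
    my ($dbc, $sth) = @_;   # $dbc: an EnsEMBL DBConnection; $sth: the just-executed INSERT handle

    return ($dbc->driver eq 'sqlite')
        ? $dbc->db_handle->func('last_insert_rowid')   # DBD::SQLite-specific function call
        : $sth->{'mysql_insertid'};                    # DBD::mysql statement-handle attribute
}
```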
@@ -399,11 +403,11 @@ sub update_status {
     my $sql = "UPDATE job SET status='".$job->status."' ";
 
     if($job->status eq 'DONE') {
-        $sql .= ",completed=now()";
+        $sql .= ",completed=CURRENT_TIMESTAMP";
         $sql .= ",runtime_msec=".$job->runtime_msec;
         $sql .= ",query_count=".$job->query_count;
     } elsif($job->status eq 'PASSED_ON') {
-        $sql .= ", completed=now()";
+        $sql .= ", completed=CURRENT_TIMESTAMP";
     } elsif($job->status eq 'READY') {
     }
 
@@ -450,7 +454,9 @@ sub store_out_files {
     $self->dbc->do($sql);
     return unless($job->stdout_file or $job->stderr_file);
 
-    $sql = "INSERT ignore INTO job_file (job_id, worker_id, retry, type, path) VALUES ";
+    my $insertion_method = ($self->dbc->driver eq 'sqlite') ? 'INSERT OR IGNORE' : 'INSERT IGNORE';
+
+    $sql = "$insertion_method INTO job_file (job_id, worker_id, retry, type, path) VALUES ";
     if($job->stdout_file) {
         $sql .= sprintf("(%d,%d,%d,'STDOUT','%s')", $job->dbID, $job->worker_id, $job->retry_count, $job->stdout_file);
@@ -485,23 +491,22 @@ sub grab_jobs_for_worker {
 
     my $analysis_id = $worker->analysis->dbID();
     my $worker_id   = $worker->dbID();
+    my $batch_size  = $worker->batch_size();
 
-    my $sql_base = qq{
-        UPDATE job
-        SET worker_id='$worker_id', status='CLAIMED'
-        WHERE analysis_id='$analysis_id' AND status='READY' AND semaphore_count<=0
-    };
+    my $update_sql          = "UPDATE job SET worker_id='$worker_id', status='CLAIMED'";
+    my $selection_start_sql = " WHERE analysis_id='$analysis_id' AND status='READY' AND semaphore_count<=0";
 
-    my $sql_virgin = $sql_base .
-                     " AND retry_count=0".
-                     " LIMIT " . $worker->batch_size;
+    my $virgin_selection_sql = $selection_start_sql . " AND retry_count=0 LIMIT $batch_size";
+    my $any_selection_sql    = $selection_start_sql . " LIMIT $batch_size";
 
-    my $sql_any = $sql_base .
-                  " LIMIT " . $worker->batch_size;
-
-    my $claim_count = $self->dbc->do($sql_virgin);
-    if($claim_count == 0) {
-        $claim_count = $self->dbc->do($sql_any);
+    if($self->dbc->driver eq 'sqlite') {
+        if((my $claim_count = $self->dbc->do( $update_sql . " WHERE job_id IN (SELECT job_id FROM job $virgin_selection_sql) AND status='READY'" )) == 0) {   # NB: test numerically, since do() returns the true value '0E0' when no rows were claimed
+            $claim_count = $self->dbc->do( $update_sql . " WHERE job_id IN (SELECT job_id FROM job $any_selection_sql) AND status='READY'" );
+        }
+    } else {
+        if((my $claim_count = $self->dbc->do( $update_sql . $virgin_selection_sql )) == 0) {
+            $claim_count = $self->dbc->do( $update_sql . $any_selection_sql );
+        }
     }
 
     my $constraint = "j.analysis_id='$analysis_id' AND j.worker_id='$worker_id' AND j.status='CLAIMED'";
@@ -607,7 +612,7 @@ sub release_and_age_job {
 #
     $self->dbc->do( qq{
         UPDATE job
-            SET status=IF( $may_retry AND (retry_count<$max_retry_count), 'READY', 'FAILED'), retry_count=retry_count+1
+            SET status=(CASE WHEN $may_retry AND (retry_count<$max_retry_count) THEN 'READY' ELSE 'FAILED' END), retry_count=retry_count+1
         WHERE job_id=$job_id AND status in ('COMPILATION','GET_INPUT','RUN','WRITE_OUTPUT')
     } );
diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisStatsAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisStatsAdaptor.pm
index 9075de724..6ed787904 100644
--- a/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisStatsAdaptor.pm
+++ b/modules/Bio/EnsEMBL/Hive/DBSQL/AnalysisStatsAdaptor.pm
@@ -209,7 +209,7 @@ sub update {
   $sql .= ",failed_job_tolerance=" . $stats->failed_job_tolerance();
   $sql .= ",num_running_workers=" . $stats->num_running_workers();
   $sql .= ",num_required_workers=" .
$stats->num_required_workers(); - $sql .= ",last_update=NOW()"; + $sql .= ",last_update=CURRENT_TIMESTAMP"; $sql .= ",sync_lock='0'"; $sql .= ",rc_id=". $stats->rc_id(); $sql .= ",can_be_empty=". $stats->can_be_empty(); @@ -218,7 +218,7 @@ sub update { my $sth = $self->prepare($sql); $sth->execute(); $sth->finish; - $sth = $self->prepare("INSERT INTO analysis_stats_monitor SELECT now(), analysis_stats.* from analysis_stats WHERE analysis_id = ".$stats->analysis_id); + $sth = $self->prepare("INSERT INTO analysis_stats_monitor SELECT CURRENT_TIMESTAMP, analysis_stats.* from analysis_stats WHERE analysis_id = ".$stats->analysis_id); $sth->execute(); $sth->finish; $stats->seconds_since_last_update(0); #not exact but good enough :) @@ -285,7 +285,7 @@ sub decrease_hive_capacity my $sql = "UPDATE analysis_stats ". " SET hive_capacity = hive_capacity - 1, ". - " num_required_workers = IF(num_required_workers > 0, num_required_workers - 1, 0) ". + " num_required_workers = (CASE WHEN num_required_workers > 0 THEN num_required_workers - 1 ELSE 0 END) ". " WHERE analysis_id='$analysis_id' and hive_capacity > 1"; $self->dbc->do($sql); @@ -469,7 +469,10 @@ sub _columns { ast.rc_id ast.can_be_empty ); - push @columns , "UNIX_TIMESTAMP()-UNIX_TIMESTAMP(ast.last_update) seconds_since_last_update "; + + push @columns , ($self->dbc->driver eq 'sqlite') + ? "strftime('%s','now')-strftime('%s',ast.last_update) seconds_since_last_update " + : "UNIX_TIMESTAMP()-UNIX_TIMESTAMP(ast.last_update) seconds_since_last_update "; return @columns; } @@ -534,7 +537,9 @@ sub _create_new_for_analysis_id { my $sql; - $sql = "INSERT ignore INTO analysis_stats (analysis_id) VALUES ($analysis_id)"; + my $insertion_method = ($self->dbc->driver eq 'sqlite') ? 'INSERT OR IGNORE' : 'INSERT IGNORE'; + + $sql = "$insertion_method INTO analysis_stats (analysis_id) VALUES ($analysis_id)"; #print("$sql\n"); my $sth = $self->prepare($sql); $sth->execute(); diff --git a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm index 69d4363ad..e2de2d04f 100644 --- a/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm +++ b/modules/Bio/EnsEMBL/Hive/DBSQL/BaseAdaptor.pm @@ -90,26 +90,37 @@ sub _table_info_loader { my $dbc = $self->dbc(); my $dbname = $dbc->dbname(); + my $driver = $dbc->driver(); my $table_name = $self->table_name(); my %column_set = (); my @primary_key = (); my $autoinc_id = ''; - my $sql = "SELECT column_name,column_key,extra FROM information_schema.columns WHERE table_schema='$dbname' and table_name='$table_name'"; + my $sql = { + 'mysql' => "SELECT column_name AS name, column_key='PRI' AS pk, extra='auto_increment' AS ai FROM information_schema.columns WHERE table_schema='$dbname' and table_name='$table_name'", + 'sqlite'=> "PRAGMA table_info('$table_name')", + }->{$driver} or die "could not find column info for driver='$driver'"; + my $sth = $self->prepare($sql); - $sth->execute; - while(my ($column_name, $column_key, $extra) = $sth->fetchrow ) { + $sth->execute; + while(my $row = $sth->fetchrow_hashref ) { + my $column_name = $row->{'name'}; + $column_set{$column_name} = 1; - if($column_key eq 'PRI') { + if($row->{'pk'}) { push @primary_key, $column_name; - if($extra eq 'auto_increment') { + if($row->{'ai'}) { $autoinc_id = $column_name; } } } $sth->finish; + if(($driver eq 'sqlite') and scalar(@primary_key)==1) { + $autoinc_id = $primary_key[0]; + } + $self->column_set( \%column_set ); $self->primary_key( \@primary_key ); $self->autoinc_id( $autoinc_id ); @@ -275,8 +286,12 @@ sub store { 
     my $table_name = $self->table_name();
     my $column_set = $self->column_set();
     my $autoinc_id = $self->autoinc_id();
+    my $driver     = $self->dbc->driver();
 
     my $insertion_method = $self->insertion_method;  # INSERT, INSERT_IGNORE or REPLACE
     $insertion_method =~ s/_/ /g;
+    if($driver eq 'sqlite') {
+        $insertion_method =~ s/INSERT IGNORE/INSERT OR IGNORE/ig;
+    }
 
         # NB: here we assume all hashes will have the same keys:
     my $non_autoinc_columns = [ grep { $_ ne $autoinc_id } keys %$column_set ];
@@ -289,7 +304,7 @@ sub store {
         if($check_presence_in_db_first and my $present = $self->check_object_present_in_db($object)) {
             $self->mark_stored($object, $present);
         } else {
-            #print "STORE: $sql\n";
+            # print "STORE: $sql\n";
             $sth ||= $self->prepare( $sql );    # only prepare (once) if we get here
 
             #print "NON_AUTOINC_COLUMNS: ".join(', ', @$non_autoinc_columns)."\n";
@@ -300,7 +315,7 @@ sub store {
                 # using $return_code in boolean context allows us to skip the value '0E0' ('no rows affected') that Perl treats as zero but regards as true:
                     or die "Could not perform\n\t$sql\nwith data:\n\t(".join(',', @$non_autoinc_values).')';
             if($return_code > 0) {     # <--- for the same reason we have to be explicitly numeric here
-                $self->mark_stored($object, $sth->{'mysql_insertid'});
+                $self->mark_stored($object, ($driver eq 'sqlite') ? $self->dbc->db_handle->func('last_insert_rowid') : $sth->{'mysql_insertid'});
             }
         }
     }
diff --git a/modules/Bio/EnsEMBL/Hive/Extensions.pm b/modules/Bio/EnsEMBL/Hive/Extensions.pm
index 535c79250..d0f4baefe 100755
--- a/modules/Bio/EnsEMBL/Hive/Extensions.pm
+++ b/modules/Bio/EnsEMBL/Hive/Extensions.pm
@@ -90,17 +90,26 @@ sub Bio::EnsEMBL::Analysis::process {
 
 =cut
 
-sub Bio::EnsEMBL::DBSQL::DBConnection::url
-{
+sub Bio::EnsEMBL::DBSQL::DBConnection::url {
     my $self = shift;
-    return undef unless($self->host and $self->port and $self->dbname);
-    my $url = "mysql://";
+
+    return undef unless($self->driver and $self->dbname);
+
+    my $url = $self->driver . '://';
+
     if($self->username) {
         $url .= $self->username;
         $url .= ":".$self->password if($self->password);
         $url .= "@";
     }
-    $url .= $self->host .":". $self->port ."/" . $self->dbname;
+    if($self->host) {
+        $url .= $self->host;
+        if($self->port) {
+            $url .= ':'.$self->port;
+        }
+    }
+    $url .= '/' . $self->dbname;
+
     return $url;
 }
 
@@ -118,15 +127,12 @@ sub Bio::EnsEMBL::DBSQL::DBConnection::url
 
 =cut
 
-sub Bio::EnsEMBL::Analysis::url
-{
+sub Bio::EnsEMBL::Analysis::url {
     my $self = shift;
-    my $url;
 
     return undef unless($self->adaptor);
-    $url = $self->adaptor->db->dbc->url;
-    $url .= "/analysis?logic_name=" . $self->logic_name;
-    return $url;
+
+    return $self->adaptor->db->dbc->url . '/analysis?logic_name=' . $self->logic_name;
 }
 
diff --git a/modules/Bio/EnsEMBL/Hive/NakedTable.pm b/modules/Bio/EnsEMBL/Hive/NakedTable.pm
index a080b092d..fb40d4df1 100644
--- a/modules/Bio/EnsEMBL/Hive/NakedTable.pm
+++ b/modules/Bio/EnsEMBL/Hive/NakedTable.pm
@@ -69,7 +69,7 @@ sub url {
     my $ref_dba = shift @_;  # if reference dba is the same as 'our' dba, a shorter url can be generated
 
     if(my $adaptor = $self->adaptor) {
-        my $conn_prefix = ($adaptor->db == $ref_dba) ? 'mysql:///' : $adaptor->db->dbc->url();
+        my $conn_prefix = ($adaptor->db == $ref_dba) ? ':///' : $adaptor->db->dbc->url();
         return $conn_prefix .'/'. $self->table_name() . '?insertion_method=' .
$self->insertion_method(); } else { return; diff --git a/modules/Bio/EnsEMBL/Hive/PipeConfig/HiveGeneric_conf.pm b/modules/Bio/EnsEMBL/Hive/PipeConfig/HiveGeneric_conf.pm index 636b9eabb..6ce2aa7a3 100644 --- a/modules/Bio/EnsEMBL/Hive/PipeConfig/HiveGeneric_conf.pm +++ b/modules/Bio/EnsEMBL/Hive/PipeConfig/HiveGeneric_conf.pm @@ -52,8 +52,12 @@ use Bio::EnsEMBL::Hive::DBSQL::DBAdaptor; use Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor; use Bio::EnsEMBL::Hive::Extensions; +our ($hive_default_driver) = 'mysql'; + + # ---------------------------[the following methods will be overridden by specific pipelines]------------------------- + =head2 default_options Description : Interface method that should return a hash of option_name->default_option_value pairs. @@ -74,10 +78,12 @@ sub default_options { -user => 'ensadmin', -pass => $self->o('password'), -dbname => $ENV{'USER'}.'_'.$self->o('pipeline_name'), # example of a linked definition (resolved via saturation) + -driver => $hive_default_driver, }, }; } + =head2 pipeline_create_commands Description : Interface method that should return a list of command lines to be run in order to create and set up the pipeline database. @@ -86,17 +92,24 @@ sub default_options { =cut sub pipeline_create_commands { - my ($self) = @_; - return [ - 'mysql '.$self->dbconn_2_mysql('pipeline_db', 0)." -e 'CREATE DATABASE ".$self->o('pipeline_db', '-dbname')."'", + my $self = shift @_; - # standard eHive tables, foreign_keys and procedures: - 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/tables.sql', - 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/foreign_keys.sql', - 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1).' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/procedures.sql', - ]; + return ($hive_default_driver eq 'sqlite') + ? [ + # standard eHive tables and unique/non-unique indices: + $self->db_connect_command('pipeline_db').' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/tables.sqlite', + ] + : [ + 'mysql '.$self->dbconn_2_mysql('pipeline_db', 0)." -e 'CREATE DATABASE ".$self->o('pipeline_db', '-dbname')."'", + + # standard eHive tables, foreign_keys and procedures: + $self->db_connect_command('pipeline_db').' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/tables.sql', + $self->db_connect_command('pipeline_db').' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/foreign_keys.sql', + $self->db_connect_command('pipeline_db').' <'.$self->o('ensembl_cvs_root_dir').'/ensembl-hive/sql/procedures.sql', + ]; } + =head2 pipeline_wide_parameters Description : Interface method that should return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs. @@ -112,6 +125,7 @@ sub pipeline_wide_parameters { }; } + =head2 resource_classes Description : Interface method that should return a hash of resource_description_id->resource_description_hash. @@ -127,6 +141,7 @@ sub resource_classes { }; } + =head2 pipeline_analyses Description : Interface method that should return a list of hashes that define analysis bundled with corresponding jobs, dataflow and analysis_ctrl rules and resource_id. @@ -160,6 +175,7 @@ sub new { return $self; } + =head2 o Description : This is the method you call in the interface methods when you need to substitute an option: $self->o('password') . 
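To make the driver branching in pipeline_create_commands() above concrete, here is a standalone sketch (not eHive code; all connection values are illustrative) of the shell command each driver ends up with, mirroring the db_connect_command() helper introduced further down in this patch:

```perl
#!/usr/bin/env perl
use strict;
use warnings;

# Illustrative connection hash in the same shape as the 'pipeline_db' option above:
my %pipeline_db = (
    -driver => 'sqlite',          # or 'mysql'
    -host   => 'compara2',
    -port   => 3306,
    -user   => 'ensadmin',
    -pass   => 'secret',
    -dbname => 'my_pipeline',     # under sqlite this is simply the database file path
);

my $connect_cmd = ($pipeline_db{-driver} eq 'sqlite')
    ? "sqlite3 $pipeline_db{-dbname}"
    : "mysql -h$pipeline_db{-host} -P$pipeline_db{-port} -u$pipeline_db{-user} -p$pipeline_db{-pass} $pipeline_db{-dbname}";

print "$connect_cmd < ensembl-hive/sql/tables.sqlite\n";   # how the schema gets piped in
```

Note that the sqlite branch needs no separate CREATE DATABASE step: sqlite3 creates the database file on first use, which is why pipeline_create_commands() can go straight to loading tables.sqlite.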
@@ -187,6 +203,7 @@ sub o {   # descends the option hash structure (vivifying all enco
     return $value;
 }
 
+
 =head2 dbconn_2_mysql
 
     Description : A convenience method used to stringify a connection-parameters hash into a parameter string that both mysql and beekeeper.pl can understand
@@ -203,6 +220,37 @@ sub dbconn_2_mysql {    # will save you a lot of typing
            .($with_db ? ($self->o($db_conn,'-dbname').' ') : '');
 }
 
+
+=head2 db_connect_command
+
+    Description : A convenience method used to stringify a command to connect to the db OR pipe an sql file into it.
+
+=cut
+
+sub db_connect_command {
+    my ($self, $db_conn) = @_;
+
+    return ($hive_default_driver eq 'sqlite')
+        ? 'sqlite3 '.$self->o($db_conn, '-dbname')
+        : 'mysql '.$self->dbconn_2_mysql($db_conn, 1);
+}
+
+
+=head2 db_execute_command
+
+    Description : A convenience method used to stringify a command to execute a given sql command against the db.
+
+=cut
+
+sub db_execute_command {
+    my ($self, $db_conn, $sql_command) = @_;
+
+    return ($hive_default_driver eq 'sqlite')
+        ? 'sqlite3 '.$self->o($db_conn, '-dbname')." '$sql_command'"
+        : 'mysql '.$self->dbconn_2_mysql($db_conn, 1)." -e '$sql_command'";
+}
+
+
 =head2 dbconn_2_url
 
     Description : A convenience method used to stringify a connection-parameters hash into a 'url' that beekeeper.pl will understand
@@ -212,9 +260,12 @@ sub dbconn_2_mysql {    # will save you a lot of typing
 sub dbconn_2_url {
     my ($self, $db_conn) = @_;
 
-    return 'mysql://'.$self->o($db_conn,'-user').':'.$self->o($db_conn,'-pass').'@'.$self->o($db_conn,'-host').':'.$self->o($db_conn,'-port').'/'.$self->o($db_conn,'-dbname');
+    return ($hive_default_driver eq 'sqlite')
+        ? $self->o($db_conn, '-driver').':///'.$self->o($db_conn,'-dbname')
+        : $self->o($db_conn, '-driver').'://'.$self->o($db_conn,'-user').':'.$self->o($db_conn,'-pass').'@'.$self->o($db_conn,'-host').':'.$self->o($db_conn,'-port').'/'.$self->o($db_conn,'-dbname');
 }
 
+
 =head2 process_options
 
     Description : The method that does all the parameter parsing magic.
@@ -460,7 +511,7 @@ sub run {
         print "  beekeeper.pl -url $url -run\t\t# (run one step of the pipeline - useful for debugging/learning)\n";
 
     print "\n\n\tTo connect to your pipeline database use the following line:\n\n";
-    print "  mysql ".$self->dbconn_2_mysql('pipeline_db',1)."\n\n";
+    print "  ".$self->db_connect_command('pipeline_db')."\n\n";
 }
 
diff --git a/modules/Bio/EnsEMBL/Hive/PipeConfig/LongMult_conf.pm b/modules/Bio/EnsEMBL/Hive/PipeConfig/LongMult_conf.pm
index dc53fb4a1..5ccfa30c3 100644
--- a/modules/Bio/EnsEMBL/Hive/PipeConfig/LongMult_conf.pm
+++ b/modules/Bio/EnsEMBL/Hive/PipeConfig/LongMult_conf.pm
@@ -57,6 +57,11 @@ use warnings;
 
 use base ('Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf');  # All Hive databases configuration files should inherit from HiveGeneric, directly or indirectly
 
+
+    # EXPERIMENTAL: choose either 'mysql' or 'sqlite' and do not forget to provide the correct connection parameters:
+$Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf::hive_default_driver = 'mysql';
+
+
 =head2 default_options
 
     Description : Implements default_options() interface method of Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf that is used to initialize default options.
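For reference, the URL shapes that dbconn_2_url() above can now emit are parsed back by the URLFactory.pm changes below, whose regexp captures a leading driver and also accepts an empty one (the ':///' shorthand used for same-database dataflow targets). A few example values (all illustrative), plus a hedged note on the raw DBI DSNs that JobFactory.pm and SqlCmd.pm assemble further down:

```perl
# Example eHive URLs under the two drivers (values illustrative):
my $mysql_url  = 'mysql://ensadmin:password@compara2:3306/my_user_long_mult';
my $sqlite_url = 'sqlite:///my_user_long_mult';   # no user/host/port; the dbname doubles as the file path
my $local_url  = ':////final_result';             # driverless shorthand: same database, naked table 'final_result'

# JobFactory.pm/SqlCmd.pm (below) interpolate a DSN as
#     DBI:$db_conn->{-driver}:$db_conn->{-dbname}:$db_conn->{-host}:$db_conn->{-port}
# which for sqlite yields something like 'DBI:sqlite:my_pipeline::'. DBI driver
# names are case-sensitive and stock DBD::SQLite registers itself as 'SQLite',
# expecting the canonical form below, so the lowercase spelling presumably
# relies on an alias or on normalization elsewhere -- worth verifying:
my $canonical_sqlite_dsn = 'DBI:SQLite:dbname=my_user_long_mult';
```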
@@ -72,6 +77,7 @@ sub default_options { 'pipeline_name' => 'long_mult', # name used by the beekeeper to prefix job names on the farm 'pipeline_db' => { # connection parameters + -driver => $Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf::hive_default_driver, -host => 'compara2', -port => 3306, -user => 'ensadmin', @@ -98,8 +104,8 @@ sub pipeline_create_commands { @{$self->SUPER::pipeline_create_commands}, # inheriting database and hive tables' creation # additional tables needed for long multiplication pipeline's operation: - 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1)." -e 'CREATE TABLE intermediate_result (a_multiplier char(40) NOT NULL, digit tinyint NOT NULL, result char(41) NOT NULL, PRIMARY KEY (a_multiplier, digit))'", - 'mysql '.$self->dbconn_2_mysql('pipeline_db', 1)." -e 'CREATE TABLE final_result (a_multiplier char(40) NOT NULL, b_multiplier char(40) NOT NULL, result char(80) NOT NULL, PRIMARY KEY (a_multiplier, b_multiplier))'", + $self->db_execute_command('pipeline_db', 'CREATE TABLE intermediate_result (a_multiplier char(40) NOT NULL, digit tinyint NOT NULL, result char(41) NOT NULL, PRIMARY KEY (a_multiplier, digit))'), + $self->db_execute_command('pipeline_db', 'CREATE TABLE final_result (a_multiplier char(40) NOT NULL, b_multiplier char(40) NOT NULL, result char(80) NOT NULL, PRIMARY KEY (a_multiplier, b_multiplier))'), ]; } @@ -143,7 +149,7 @@ sub pipeline_analyses { # (jobs for this analysis will be flown_into via branch-2 from 'start' jobs above) ], -flow_into => { - 'MAIN' => [ 'mysql:////intermediate_result' ], + 'MAIN' => [ ':////intermediate_result' ], }, }, @@ -155,7 +161,7 @@ sub pipeline_analyses { ], -wait_for => [ 'part_multiply' ], # we can only start adding when all partial products have been computed -flow_into => { - 'MAIN' => [ 'mysql:////final_result' ], + 'MAIN' => [ ':////final_result' ], }, }, ]; diff --git a/modules/Bio/EnsEMBL/Hive/Queen.pm b/modules/Bio/EnsEMBL/Hive/Queen.pm index f8b48707b..2fa5f3c3e 100755 --- a/modules/Bio/EnsEMBL/Hive/Queen.pm +++ b/modules/Bio/EnsEMBL/Hive/Queen.pm @@ -182,11 +182,11 @@ sub create_new_worker { my $sql = q{INSERT INTO worker (born, last_check_in, meadow_type, process_id, host, analysis_id) - VALUES (NOW(), NOW(), ?,?,?,?)}; + VALUES (CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, ?,?,?,?)}; my $sth = $self->prepare($sql); $sth->execute($meadow_type, $process_id, $exec_host, $analysisStats->analysis_id); - my $worker_id = $sth->{'mysql_insertid'}; + my $worker_id = ($self->dbc->driver eq 'sqlite') ? $self->dbc->db_handle->func('last_insert_rowid') : $sth->{'mysql_insertid'}; $sth->finish; my $worker = $self->fetch_by_dbID($worker_id); @@ -242,7 +242,7 @@ sub register_worker_death { $worker->analysis->stats->adaptor->decrease_running_workers($worker->analysis->stats->analysis_id); } - my $sql = "UPDATE worker SET died=now(), last_check_in=now()"; + my $sql = "UPDATE worker SET died=CURRENT_TIMESTAMP, last_check_in=CURRENT_TIMESTAMP"; $sql .= " ,status='DEAD'"; $sql .= " ,work_done='" . $worker->work_done . "'"; $sql .= " ,cause_of_death='$cod'"; @@ -332,7 +332,7 @@ sub worker_check_in { my ($self, $worker) = @_; return unless($worker); - my $sql = "UPDATE worker SET last_check_in=now()"; + my $sql = "UPDATE worker SET last_check_in=CURRENT_TIMESTAMP"; $sql .= " ,work_done='" . $worker->work_done . "'"; $sql .= " WHERE worker_id='" . $worker->dbID ."'"; @@ -391,8 +391,10 @@ sub fetch_overdue_workers { $overdue_secs = 3600 unless(defined($overdue_secs)); - my $constraint = "w.cause_of_death='' ". 
-                     "AND (UNIX_TIMESTAMP()-UNIX_TIMESTAMP(w.last_check_in))>$overdue_secs";
+    my $constraint = "w.cause_of_death='' AND ".
+                     ( ($self->dbc->driver eq 'sqlite')
+                        ? "(strftime('%s','now')-strftime('%s',w.last_check_in))>$overdue_secs"
+                        : "(UNIX_TIMESTAMP()-UNIX_TIMESTAMP(w.last_check_in))>$overdue_secs");
 
     return $self->_generic_fetch($constraint);
 }
@@ -803,10 +805,10 @@ sub monitor
     my $sql = qq{
         INSERT INTO monitor
         SELECT
-            now(),
+            CURRENT_TIMESTAMP,
             count(*),
-            sum(work_done/TIME_TO_SEC(TIMEDIFF(now(),born))),
-            sum(work_done/TIME_TO_SEC(TIMEDIFF(now(),born)))/count(*),
+            sum(work_done/TIME_TO_SEC(TIMEDIFF(CURRENT_TIMESTAMP,born))),
+            sum(work_done/TIME_TO_SEC(TIMEDIFF(CURRENT_TIMESTAMP,born)))/count(*),
             group_concat(DISTINCT logic_name)
         FROM worker left join analysis USING (analysis_id)
         WHERE cause_of_death = ""};
diff --git a/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm b/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm
index 157b5f79f..c4b8c6ecf 100644
--- a/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm
+++ b/modules/Bio/EnsEMBL/Hive/RunnableDB/JobFactory.pm
@@ -179,7 +179,9 @@ sub _make_list_from_query {
 
     my $dbc;
     if(my $db_conn = $self->param('db_conn')) {
-        $dbc = DBI->connect("DBI:mysql:$db_conn->{-dbname}:$db_conn->{-host}:$db_conn->{-port}", $db_conn->{-user}, $db_conn->{-pass}, { RaiseError => 1 });
+        $db_conn->{-driver} ||= 'mysql';
+
+        $dbc = DBI->connect("DBI:$db_conn->{-driver}:$db_conn->{-dbname}:$db_conn->{-host}:$db_conn->{-port}", $db_conn->{-user}, $db_conn->{-pass}, { RaiseError => 1 });
     } else {
         $dbc = $self->db->dbc;
     }
diff --git a/modules/Bio/EnsEMBL/Hive/RunnableDB/SqlCmd.pm b/modules/Bio/EnsEMBL/Hive/RunnableDB/SqlCmd.pm
index 42630dc33..5d4d3b5bf 100755
--- a/modules/Bio/EnsEMBL/Hive/RunnableDB/SqlCmd.pm
+++ b/modules/Bio/EnsEMBL/Hive/RunnableDB/SqlCmd.pm
@@ -86,8 +86,9 @@ sub fetch_input {
         # Use connection parameters to another database if supplied, otherwise use the current database as default:
         #
     if(my $db_conn = $self->param('db_conn')) {
+        $db_conn->{-driver} ||= 'mysql';
 
-        $self->param('dbh', DBI->connect("DBI:mysql:$db_conn->{-dbname}:$db_conn->{-host}:$db_conn->{-port}", $db_conn->{-user}, $db_conn->{-pass}, { RaiseError => 1 }) );
+        $self->param('dbh', DBI->connect("DBI:$db_conn->{-driver}:$db_conn->{-dbname}:$db_conn->{-host}:$db_conn->{-port}", $db_conn->{-user}, $db_conn->{-pass}, { RaiseError => 1 }) );
     } else {
         $self->param('dbh', $self->db->dbc->db_handle );
     }
@@ -119,7 +120,7 @@ sub run {
         $dbh->do( $sql );
 
         my $insert_id_name  = '_insert_id_'.$counter++;
-        my $insert_id_value = $dbh->{'mysql_insertid'};
+        my $insert_id_value = ($dbh->{Driver}{Name} eq 'SQLite') ?
$dbh->func('last_insert_rowid') : $dbh->{'mysql_insertid'}; $output_id{$insert_id_name} = $insert_id_value; $self->param($insert_id_name, $insert_id_value); # for templates } diff --git a/modules/Bio/EnsEMBL/Hive/URLFactory.pm b/modules/Bio/EnsEMBL/Hive/URLFactory.pm index 6f5df42e8..67a06c3ae 100755 --- a/modules/Bio/EnsEMBL/Hive/URLFactory.pm +++ b/modules/Bio/EnsEMBL/Hive/URLFactory.pm @@ -89,15 +89,15 @@ sub fetch { Bio::EnsEMBL::Hive::URLFactory->new(); # make sure global instance is created - if( my ($conn, $user, $pass, $host, $port, $dbname, $table_name, $tparam_name, $tparam_value, $conn_param_string) = - $url =~ m{^(mysql://(?:(\w+)(?:\:([^/\@]*))?\@)?(?:([\w\-\.]+)(?:\:(\d+))?)?/(\w*))(?:/(\w+)(?:\?(\w+)=(\w+))?)?((?:;(\w+)=(\w+))*)$} ) { + if( my ($conn, $driver, $user, $pass, $host, $port, $dbname, $table_name, $tparam_name, $tparam_value, $conn_param_string) = + $url =~ m{^((\w*)://(?:(\w+)(?:\:([^/\@]*))?\@)?(?:([\w\-\.]+)(?:\:(\d+))?)?/(\w*))(?:/(\w+)(?:\?(\w+)=(\w+))?)?((?:;(\w+)=(\w+))*)$} ) { my %conn_param = split(/[;=]/, 'type=hive;discon=0'.$conn_param_string ); -#warn "URLPARSER: conn='$conn', user='$user', pass='$pass', host='$host', port='$port', dbname='$dbname', table_name='$table_name', tparam_name='$tparam_name', tparam_value='$tparam_value'"; +#warn "URLPARSER: conn='$conn', driver='$driver', user='$user', pass='$pass', host='$host', port='$port', dbname='$dbname', table_name='$table_name', tparam_name='$tparam_name', tparam_value='$tparam_value'"; #warn "CONN_PARAMS: ".Dumper(\%conn_param); - my $dba = ($conn eq 'mysql:///') ? $default_dba : $class->create_cached_dba($user, $pass, $host, $port, $dbname, %conn_param); + my $dba = ($conn =~ m{^\w*:///$} ) ? $default_dba : $class->create_cached_dba($driver, $user, $pass, $host, $port, $dbname, %conn_param); if(not $table_name) { @@ -124,18 +124,19 @@ sub fetch { } sub create_cached_dba { - my $class = shift @_; - my $user = shift @_ || 'ensro'; - my $pass = shift @_ || ''; - my $host = shift @_ || ''; - my $port = shift @_ || 3306; - my $dbname = shift @_; - my %conn_param = @_; + my ($class, $driver, $user, $pass, $host, $port, $dbname, %conn_param) = @_; + + if($driver eq 'mysql') { + $user ||= 'ensro'; + $pass ||= ''; + $host ||= ''; + $port ||= 3306; + } my $type = $conn_param{'type'}; my $discon = $conn_param{'discon'}; - my $connectionKey = "$user:$pass\@$host:$port/$dbname;$type"; + my $connectionKey = "$driver://$user:$pass\@$host:$port/$dbname;$type"; my $dba = $_URLFactory_global_instance->{$connectionKey}; unless($dba) { @@ -150,13 +151,14 @@ sub create_cached_dba { eval "require $module"; $_URLFactory_global_instance->{$connectionKey} = $dba = $module->new ( - -disconnect_when_inactive => $discon, - -user => $user, - -pass => $pass, + -driver => $driver, -host => $host, -port => $port, + -user => $user, + -pass => $pass, -dbname => $dbname, -species => $dbname, + -disconnect_when_inactive => $discon, ); } return $dba; diff --git a/sql/tables.sqlite b/sql/tables.sqlite new file mode 100644 index 000000000..812203168 --- /dev/null +++ b/sql/tables.sqlite @@ -0,0 +1,408 @@ + +-- The first 3 tables are from the ensembl core schema: meta, analysis and analysis_description. +-- We create them with the 'IF NOT EXISTS' option in case they already exist in the DB. 
+
+-- ----------------------------------------------------------------------------------------------
+--
+-- Table structure for table 'meta' (FROM THE CORE SCHEMA)
+--
+
+CREATE TABLE IF NOT EXISTS meta (
+  meta_id          INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+  species_id       INT UNSIGNED DEFAULT 1,
+  meta_key         VARCHAR(40) NOT NULL,
+  meta_value       TEXT NOT NULL
+);
+CREATE UNIQUE INDEX IF NOT EXISTS species_key_value_idx ON meta (species_id, meta_key, meta_value);
+CREATE INDEX IF NOT EXISTS species_value_idx ON meta (species_id, meta_value);
+
+
+-- ----------------------------------------------------------------------------------------------
+--
+-- Table structure for table 'analysis' (FROM THE CORE SCHEMA)
+--
+-- semantics:
+--
+-- analysis_id - internal id
+-- created
+--   - date to distinguish newer and older versions of the same analysis. Not
+--     well maintained so far.
+-- logic_name - string to identify the analysis. Used mainly inside pipeline.
+-- db, db_version, db_file
+--   - db should be a database name, db_version the version of that db,
+--     db_file the file system location of that database;
+--     probably wiser to generate from just db and configurations
+-- program, program_version, program_file
+--   - The binary used to create a feature. Similar semantics to above.
+-- module, module_version
+--   - Perl module names (RunnableDBs usually) executing this analysis.
+-- parameters - a parameter string which is processed by the Perl module
+-- gff_source, gff_feature
+--   - how to make a gff dump from features with this analysis
+
+
+CREATE TABLE IF NOT EXISTS analysis (
+  analysis_id      INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+  created          datetime DEFAULT '0000-00-00 00:00:00' NOT NULL,
+  logic_name       VARCHAR(40) NOT NULL,
+  db               VARCHAR(120),
+  db_version       VARCHAR(40),
+  db_file          VARCHAR(255),
+  program          VARCHAR(255),
+  program_version  VARCHAR(40),
+  program_file     VARCHAR(255),
+  parameters       TEXT,
+  module           VARCHAR(255),
+  module_version   VARCHAR(40),
+  gff_source       VARCHAR(40),
+  gff_feature      VARCHAR(40)
+);
+CREATE UNIQUE INDEX IF NOT EXISTS logic_name_idx ON analysis (logic_name);
+
+
+-- ----------------------------------------------------------------------------------------------
+--
+-- Table structure for table 'analysis_description' (FROM THE CORE SCHEMA)
+--
+
+CREATE TABLE IF NOT EXISTS analysis_description (
+  analysis_id      INTEGER NOT NULL PRIMARY KEY,
+  description      TEXT,
+  display_label    VARCHAR(255),
+  displayable      BOOLEAN NOT NULL DEFAULT 1,
+  web_data         TEXT
+);
+CREATE UNIQUE INDEX IF NOT EXISTS analysis_idx ON analysis_description (analysis_id);
+
+
+
+-- ------------------- now, to the 'proper' Hive tables: ----------------------------------------
+
+
+
+-- ----------------------------------------------------------------------------------------------
+--
+-- Table structure for table 'worker'
+--
+-- overview:
+--   Table which tracks the workers of a hive as they exist out in the world.
+--   Workers are created by inserting into this table so that there is only ever
+--   one instance of a worker object in the world.  As workers live and do work,
+--   they update this table, and when they die they make one final update.
+--
+-- semantics:
+--
+
+CREATE TABLE worker (
+  worker_id        INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+  analysis_id      INTEGER NOT NULL,
+  meadow_type      TEXT NOT NULL, /* enum('LSF', 'LOCAL') NOT NULL, */
+  host             varchar(40) DEFAULT NULL,
+  process_id       varchar(40) DEFAULT NULL,
+  work_done        int(11) DEFAULT '0' NOT NULL,
+  status           TEXT DEFAULT 'READY' NOT NULL, /* enum('READY','COMPILATION','GET_INPUT','RUN','WRITE_OUTPUT','DEAD') DEFAULT 'READY' NOT NULL, */
+  born             timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  last_check_in    datetime NOT NULL,
+  died             datetime DEFAULT NULL,
+  cause_of_death   TEXT DEFAULT '' NOT NULL /* enum('', 'NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'MEMLIMIT', 'RUNLIMIT', 'FATALITY') DEFAULT '' NOT NULL */
+);
+CREATE INDEX IF NOT EXISTS analysis_id_status_idx ON worker (analysis_id, status);
+
+
+-- ---------------------------------------------------------------------------------
+--
+-- Table structure for table 'dataflow_rule'
+--
+-- overview:
+--   Extension of the simple_rule design, except that goal(to) is now in extended URL format e.g.
+--   mysql://ensadmin:<pass>@ecs2:3361/compara_hive_test?analysis.logic_name='blast_NCBI34'
+--   (full network address of an analysis).  The only requirement is that there are rows in
+--   the job, analysis, dataflow_rule, and worker tables so that the following join
+--   works on the same database
+--     WHERE analysis.analysis_id = dataflow_rule.from_analysis_id
+--     AND   analysis.analysis_id = job.analysis_id
+--     AND   analysis.analysis_id = worker.analysis_id
+--
+--   These are the rules used to create entries in the job table where the
+--   input_id (control data) is passed from one analysis to the next to define work.
+--
+--   The analysis table will be extended so that it can specify different read and write
+--   databases, with the default being the database the analysis is on.
+--
+-- semantics:
+--   dataflow_rule_id  - internal ID
+--   from_analysis_id  - foreign key to analysis table analysis_id
+--   to_analysis_url   - foreign key to net distributed analysis logic_name reference
+--   branch_code       - joined to job.branch_code to allow branching
+--   input_id_template - a template for generating a new input_id (not necessarily a hashref) in this dataflow; if undefined, the original input_id is kept
+
+CREATE TABLE dataflow_rule (
+  dataflow_rule_id   INTEGER PRIMARY KEY AUTOINCREMENT,
+  from_analysis_id   INTEGER NOT NULL,
+  to_analysis_url    varchar(255) default '' NOT NULL,
+  branch_code        int(10) default 1 NOT NULL,
+  input_id_template  TEXT DEFAULT NULL
+);
+CREATE UNIQUE INDEX IF NOT EXISTS from_to_branch_template_idx ON dataflow_rule (from_analysis_id, to_analysis_url, branch_code, input_id_template);
+
+
+-- ---------------------------------------------------------------------------------
+--
+-- Table structure for table 'analysis_ctrl_rule'
+--
+-- overview:
+--   These rules define a higher level of control.  These rules are used to turn
+--   whole analysis nodes on/off (READY/BLOCKED).
+--   If any of the condition_analyses are not 'DONE', the ctrled_analysis is set to BLOCKED.
+--   When all conditions become 'DONE', the ctrled_analysis is set to READY.
+--   The workers switch the analysis.status to 'WORKING' and 'DONE'.
+--   But at any moment, if a condition goes false, the analysis is reset to BLOCKED.
+--
+--   This process of watching conditions and flipping the ctrled_analysis state
+--   will be accomplished by another autonomous agent (CtrlWatcher.pm)
+--
+-- semantics:
+--   condition_analysis_url  - foreign key to net distributed analysis reference
+--   ctrled_analysis_id      - foreign key to analysis table analysis_id
+
+CREATE TABLE analysis_ctrl_rule (
+  condition_analysis_url   VARCHAR(255) DEFAULT '' NOT NULL,
+  ctrled_analysis_id       UNSIGNED INTEGER NOT NULL
+);
+CREATE UNIQUE INDEX IF NOT EXISTS condition_ctrled_idx ON analysis_ctrl_rule (condition_analysis_url, ctrled_analysis_id);
+
+
+-- ---------------------------------------------------------------------------------
+--
+-- Table structure for table 'job'
+--
+-- overview:
+--   The job is the heart of this system.  It is the kiosk or blackboard
+--   where workers find things to do and then post work for other workers to do.
+--   These jobs are created prior to work being done, are claimed by workers,
+--   are updated as the work is done, with a final update on completion.
+--
+-- semantics:
+--   job_id            - autoincrement id
+--   prev_job_id       - previous job which created this one (and passed input_id)
+--   analysis_id       - the analysis_id needed to accomplish this job.
+--   input_id          - input data passed into Analysis:RunnableDB to control the work
+--   worker_id         - link to worker table to define which worker claimed this job
+--   status            - state the job is in
+--   retry_count       - number of times the job had to be reset when a worker failed to run it
+--   completed         - datetime when job was completed
+--
+--   semaphore_count   - if this count is >0, the job is conditionally blocked (until this count drops to 0 or below).
+--                       Default=0 means "nothing is blocking me by default".
+--   semaphored_job_id - the job_id of job S that is waiting for this job to decrease S's semaphore_count.
+--                       Default=NULL means "I'm not blocking anything by default".
+
+CREATE TABLE job (
+  job_id             INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+  prev_job_id        INTEGER DEFAULT NULL,  /* the job that created this one using a dataflow rule */
+  analysis_id        INTEGER NOT NULL,
+  input_id           char(255) NOT NULL,
+  worker_id          INTEGER DEFAULT NULL,
+  status             TEXT DEFAULT 'READY' NOT NULL, /* enum('READY','BLOCKED','CLAIMED','COMPILATION','GET_INPUT','RUN','WRITE_OUTPUT','DONE','FAILED','PASSED_ON') DEFAULT 'READY' NOT NULL, */
+  retry_count        int(10) DEFAULT 0 NOT NULL,
+  completed          datetime DEFAULT NULL,
+  runtime_msec       int(10) DEFAULT NULL,
+  query_count        int(10) DEFAULT NULL,
+
+  semaphore_count    INTEGER DEFAULT 0 NOT NULL,
+  semaphored_job_id  INTEGER DEFAULT NULL
+);
+CREATE UNIQUE INDEX IF NOT EXISTS input_id_analysis_id_idx ON job (input_id, analysis_id);
+CREATE INDEX IF NOT EXISTS analysis_status_sema_retry_idx ON job (analysis_id, status, semaphore_count, retry_count);
+CREATE INDEX IF NOT EXISTS job_worker_idx ON job (worker_id);
+
+
+-- ---------------------------------------------------------------------------------
+--
+-- Table structure for table 'job_message'
+--
+-- overview:
+--   In case a job throws a message (via die/throw), this message is registered in this table.
+--   It may or may not indicate that the job was unsuccessful, via the is_error flag.
+--
+-- semantics:
+--   job_id    - the id of the job that threw the message
+--   worker_id - the worker in charge of the job at the moment
+--   time      - when the message was thrown
+--   retry     - retry_count of the job when the message was thrown
+--   status    - of the job when the message was thrown
+--   msg       - string that contains the message
+--   is_error  - binary flag
+
+CREATE TABLE job_message (
+  job_id         INTEGER NOT NULL,
+  worker_id      INTEGER NOT NULL,
+  time           timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  retry          int(10) DEFAULT 0 NOT NULL,
+  status         TEXT DEFAULT 'UNKNOWN', /* enum('UNKNOWN', 'COMPILATION', 'GET_INPUT', 'RUN', 'WRITE_OUTPUT') DEFAULT 'UNKNOWN', */
+  msg            TEXT,
+  is_error       BOOLEAN,
+
+  PRIMARY KEY (job_id, worker_id, time)
+);
+-- NB: index names share one namespace in SQLite, so they are prefixed per table here
+-- (a bare 'worker_idx' would collide with the index of the same name on 'job' above
+-- and be silently skipped because of IF NOT EXISTS):
+CREATE INDEX IF NOT EXISTS job_message_worker_idx ON job_message (worker_id);
+CREATE INDEX IF NOT EXISTS job_message_job_idx ON job_message (job_id);
+
+
+-- ---------------------------------------------------------------------------------
+--
+-- Table structure for table 'job_file'
+--
+-- overview:
+--   Table which holds paths to files created by jobs,
+--   e.g. STDOUT, STDERR, temp directory,
+--   or output data files created by the RunnableDB.
+--   There can only be one entry of a certain type for a given job.
+--
+-- semantics:
+--   job_id    - foreign key
+--   worker_id - link to worker table to define which worker claimed this job
+--   retry     - copy of retry_count of job as it was run
+--   type      - type of file e.g. STDOUT, STDERR, TMPDIR, ...
+--   path      - path to file or directory
+
+CREATE TABLE job_file (
+  job_id         INTEGER NOT NULL,
+  worker_id      INTEGER NOT NULL,
+  retry          int(10) NOT NULL,
+  type           varchar(16) NOT NULL default '',
+  path           varchar(255) NOT NULL
+);
+CREATE UNIQUE INDEX IF NOT EXISTS job_worker_type_idx ON job_file (job_id, worker_id, type);
+CREATE INDEX IF NOT EXISTS job_file_worker_idx ON job_file (worker_id);
+
+
+-- ---------------------------------------------------------------------------------
+--
+-- Table structure for table 'analysis_data'
+--
+-- overview:
+--   Table which holds LONGTEXT data for use by the analysis system.
+--   This data is general purpose and it's up to each analysis to
+--   determine how to use it.
+--
+-- semantics:
+--   analysis_data_id - primary id
+--   data             - text blob which holds the data
+
+CREATE TABLE analysis_data (
+  analysis_data_id  INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+  data              longtext
+);
+CREATE INDEX IF NOT EXISTS data_idx ON analysis_data (data);
+
+
+CREATE TABLE resource_description (
+  rc_id            INTEGER NOT NULL,
+  meadow_type      TEXT, /* enum('LSF', 'LOCAL') DEFAULT 'LSF' NOT NULL, */
+  parameters       varchar(255) DEFAULT '' NOT NULL,
+  description      varchar(255),
+  PRIMARY KEY(rc_id, meadow_type)
+);
+
+
+-- ---------------------------------------------------------------------------------
+--
+-- Table structure for table 'analysis_stats'
+--
+-- overview:
+--   Parallel table to analysis which provides high level statistics on the
+--   state of an analysis and its jobs.  Used to provide a fast overview, and to
+--   provide final approval of 'DONE' which is used by the blocking rules to determine
+--   when to unblock other analyses.
+--   Also provides tuning parameters such as batch_size and hive_capacity.
+--
+-- semantics:
+--   analysis_id          - foreign key to analysis table
+--   status               - overview status of the jobs (cached state)
+--   failed_job_tolerance - % of tolerated failed jobs
+--   rc_id                - resource class id (analyses are grouped into disjoint classes)
+
+CREATE TABLE analysis_stats (
+  analysis_id              INTEGER NOT NULL,
+  status                   TEXT DEFAULT 'READY' NOT NULL, /* enum('BLOCKED', 'LOADING', 'SYNCHING', 'READY', 'WORKING', 'ALL_CLAIMED', 'DONE', 'FAILED') DEFAULT 'READY' NOT NULL, */
+  batch_size               int(10) default 1 NOT NULL,
+  avg_msec_per_job         int(10) default 0 NOT NULL,
+  avg_input_msec_per_job   int(10) default 0 NOT NULL,
+  avg_run_msec_per_job     int(10) default 0 NOT NULL,
+  avg_output_msec_per_job  int(10) default 0 NOT NULL,
+  hive_capacity            int(10) default 1 NOT NULL,
+  behaviour                TEXT DEFAULT 'STATIC' NOT NULL, /* enum('STATIC', 'DYNAMIC') DEFAULT 'STATIC' NOT NULL, */
+  input_capacity           int(10) NOT NULL DEFAULT 4,
+  output_capacity          int(10) NOT NULL DEFAULT 4,
+  total_job_count          int(10) NOT NULL DEFAULT 0,
+  unclaimed_job_count      int(10) NOT NULL DEFAULT 0,
+  done_job_count           int(10) NOT NULL DEFAULT 0,
+  max_retry_count          int(10) NOT NULL DEFAULT 3,
+  failed_job_count         int(10) NOT NULL DEFAULT 0,
+  failed_job_tolerance     int(10) NOT NULL DEFAULT 0,
+  num_running_workers      int(10) NOT NULL DEFAULT 0,
+  num_required_workers     int(10) NOT NULL DEFAULT 0,
+  last_update              datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  sync_lock                int(10) NOT NULL DEFAULT 0,
+  rc_id                    INTEGER NOT NULL DEFAULT 0,
+  can_be_empty             TINYINT UNSIGNED DEFAULT 0 NOT NULL
+);
+-- NB: named with a table prefix to avoid silently colliding with the 'analysis_idx'
+-- already defined on analysis_description above (SQLite index names are schema-wide):
+CREATE UNIQUE INDEX IF NOT EXISTS analysis_stats_analysis_idx ON analysis_stats (analysis_id);
+
+
+CREATE TABLE analysis_stats_monitor (
+  time                     datetime NOT NULL default '0000-00-00 00:00:00',
+  analysis_id              INTEGER NOT NULL,
+  status                   TEXT DEFAULT 'READY' NOT NULL, /* enum('BLOCKED', 'LOADING', 'SYNCHING', 'READY', 'WORKING', 'ALL_CLAIMED', 'DONE', 'FAILED') DEFAULT 'READY' NOT NULL, */
+  batch_size               int(10) default 1 NOT NULL,
+  avg_msec_per_job         int(10) default 0 NOT NULL,
+  avg_input_msec_per_job   int(10) default 0 NOT NULL,
+  avg_run_msec_per_job     int(10) default 0 NOT NULL,
+  avg_output_msec_per_job  int(10) default 0 NOT NULL,
+  hive_capacity            int(10) default 1 NOT NULL,
+  behaviour                TEXT DEFAULT 'STATIC' NOT NULL, /* enum('STATIC', 'DYNAMIC') DEFAULT 'STATIC' NOT NULL, */
+  input_capacity           int(10) default 4 NOT NULL,
+  output_capacity          int(10) default 4 NOT NULL,
+  total_job_count          int(10) NOT NULL DEFAULT 0,
+  unclaimed_job_count      int(10) NOT NULL DEFAULT 0,
+  done_job_count           int(10) NOT NULL DEFAULT 0,
+  max_retry_count          int(10) default 3 NOT NULL,
+  failed_job_count         int(10) NOT NULL DEFAULT 0,
+  failed_job_tolerance     int(10) default 0 NOT NULL,
+  num_running_workers      int(10) default 0 NOT NULL,
+  num_required_workers     int(10) NOT NULL DEFAULT 0,
+  last_update              datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  sync_lock                int(10) default 0 NOT NULL,
+  rc_id                    INTEGER DEFAULT 0 NOT NULL,
+  can_be_empty             TINYINT UNSIGNED DEFAULT 0 NOT NULL
+);
+
+-- ---------------------------------------------------------------------------------
+--
+-- Table structure for table 'monitor'
+--
+-- overview:
+--   This table stores information about hive performance.
+--
+-- semantics:
+--   time       - datetime
+--   workers    - number of running workers
+--   throughput - average number of completed jobs per sec. of the hive
+--                (this number is calculated using running workers only)
+--   per_worker - average number of completed jobs per sec.
+--                per worker (this number is calculated using running workers only)
+--   analysis   - analysis(es) running at that time
+
+CREATE TABLE monitor (
+  time           datetime NOT NULL default '0000-00-00 00:00:00',
+  workers        int(10) NOT NULL default '0',
+  throughput     float default NULL,
+  per_worker     float default NULL,
+  analysis       varchar(255) default NULL  /* not just one, but a list of logic_names */
+);
+
+
+-- Auto add schema version to database (should be overridden by Compara's table.sql)
+INSERT OR IGNORE INTO meta (species_id, meta_key, meta_value) VALUES (NULL, 'schema_version', '62');
+
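A quick way to exercise the new schema file end to end is to load sql/tables.sqlite into a scratch database and read back the schema_version row inserted at the very bottom. A minimal sketch, assuming the sqlite3 client and DBD::SQLite are installed (the scratch file name is illustrative):

```perl
#!/usr/bin/env perl
# Smoke test (not part of the patch): load the schema, then query it via DBI.
use strict;
use warnings;
use DBI;

my $db_file = 'test_hive.sqlite';   # illustrative scratch file
unlink $db_file;

system("sqlite3 $db_file < sql/tables.sqlite") == 0
    or die "could not load sql/tables.sqlite into $db_file";

my $dbh = DBI->connect("DBI:SQLite:dbname=$db_file", '', '', { RaiseError => 1 });

my ($version) = $dbh->selectrow_array(
    "SELECT meta_value FROM meta WHERE meta_key='schema_version'"
);
print "schema_version=$version\n";  # expect: 62

$dbh->disconnect;
```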