
-- ----------------------------------------------------------------------------------
--
-- Table structure for table 'hive'
--
-- overview:
--   Table which tracks the workers of a hive as they exist out in the world.
--   Workers are created by inserting into this table so that there is only ever
--   one instance of a worker object in the world.  As workers live and do work
--   they update this table, and when they die they record the time and cause of
--   death (see the illustrative statements after the table definition below).
--
-- semantics:
--

CREATE TABLE hive (
  hive_id          int(10) NOT NULL auto_increment,
  analysis_id      int(10) NOT NULL,
  beekeeper        varchar(80) DEFAULT '' NOT NULL,
  host             varchar(40) DEFAULT '' NOT NULL,
  process_id       varchar(40) DEFAULT '' NOT NULL,
  work_done        int(11) DEFAULT '0' NOT NULL,
  status           enum('READY','GET_INPUT','RUN','WRITE_OUTPUT','DEAD') DEFAULT 'READY' NOT NULL,
  born             datetime NOT NULL,
  last_check_in    datetime NOT NULL,
  died             datetime DEFAULT NULL,
  cause_of_death   enum('', 'NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'FATALITY') DEFAULT '' NOT NULL,
  PRIMARY KEY (hive_id),
  INDEX analysis_status (analysis_id, status)
);
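
-- Example (illustrative sketch only; the analysis_id, host and process_id values
-- below are hypothetical):
--   -- a new worker registers itself when it is created
--   INSERT INTO hive (analysis_id, beekeeper, host, process_id, born, last_check_in)
--   VALUES (1, 'my_beekeeper', 'ecs2a', '12345', NOW(), NOW());
--   -- and records how and when it died
--   UPDATE hive SET status = 'DEAD', died = NOW(), cause_of_death = 'NO_WORK'
--   WHERE hive_id = 1;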


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'dataflow_rule'
--
-- overview:
--   Extension of simple_rule design except that goal(to) is now in extended URL format e.g.
--   mysql://ensadmin:<pass>@ecs2:3361/compara_hive_test?analysis.logic_name='blast_NCBI34'
--   (full network address of an analysis).  The only requirement is that there are rows in 
--   the analysis_job, analysis, dataflow_rule, and hive tables so that the following join
--   works on the same database 
--   WHERE analysis.analysis_id = dataflow_rule.from_analysis_id 
--   AND   analysis.analysis_id = analysis_job.analysis_id
--   AND   analysis.analysis_id = hive.analysis_id
--
--   These are the rules used to create entries in the analysis_job table where the
--   input_id (control data) is passed from one analysis to the next to define work.
--  
--   The analysis table will be extended so that it can specify different read and write
--   databases, with the default being the database the analysis is on
--
-- semantics:
--   dataflow_rule_id     - internal ID
--   from_analysis_id     - foreign key to analysis table analysis_id
--   to_analysis_url      - foreign key to net distributed analysis logic_name reference
--   branch_code          - joined to analysis_job.branch_code to allow branching

CREATE TABLE dataflow_rule (
  dataflow_rule_id    int(10) unsigned not null auto_increment,
  from_analysis_id    int(10) unsigned NOT NULL,
  to_analysis_url     varchar(255) default '' NOT NULL,
  branch_code         int(10) default 1 NOT NULL,

  PRIMARY KEY (dataflow_rule_id),
  UNIQUE (from_analysis_id, to_analysis_url)
);
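
-- Example (illustrative sketch; reuses the URL format shown in the overview above,
-- with <pass> as a placeholder and hypothetical logic_names):
--   INSERT INTO dataflow_rule (from_analysis_id, to_analysis_url, branch_code)
--   VALUES (1, 'mysql://ensadmin:<pass>@ecs2:3361/compara_hive_test?analysis.logic_name=''blast_NCBI34''', 1);
--   -- rules flowing out of a given analysis:
--   SELECT df.to_analysis_url, df.branch_code
--   FROM   analysis a JOIN dataflow_rule df ON a.analysis_id = df.from_analysis_id
--   WHERE  a.logic_name = 'blast_NCBI34';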


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_ctrl_rule'
--
-- overview:
--   These rules define a higher level of control.  They are used to turn whole
--   analysis nodes on/off (READY/BLOCKED).
--   If any of the condition analyses are not 'DONE' the ctrled_analysis is set to
--   BLOCKED.  When all conditions become 'DONE' the ctrled_analysis is set to READY.
--   The workers switch the analysis.status to 'WORKING' and 'DONE', but if at any
--   moment a condition goes false, the analysis is reset to BLOCKED.
--
--   This process of watching conditions and flipping the ctrled_analysis state
--   will be accomplished by another autonomous agent (CtrlWatcher.pm).
--
-- semantics:
--   condition_analysis_url  - foreign key to net distributed analysis reference
--   ctrled_analysis_id      - foreign key to analysis table analysis_id

CREATE TABLE analysis_ctrl_rule (
  condition_analysis_url     varchar(255) default '' NOT NULL,
  ctrled_analysis_id         int(10) unsigned NOT NULL,

  UNIQUE (condition_analysis_url, ctrled_analysis_id)
);
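
-- Example (illustrative sketch; the URL and analysis_id are hypothetical):
--   -- analysis 2 may only run once the analysis named 'load_input' is DONE
--   INSERT INTO analysis_ctrl_rule (condition_analysis_url, ctrled_analysis_id)
--   VALUES ('mysql://ensadmin:<pass>@ecs2:3361/compara_hive_test?analysis.logic_name=''load_input''', 2);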


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_job'
--
-- overview:
--   The analysis_job is the heart of this system.  It is the kiosk or blackboard
--   where workers find things to do and then post work for other workers to do.
--   The job_claim is a UUID set with an UPDATE ... LIMIT statement by workers as
--   they fight over the work (see the example after the table definition below).
--   These jobs are created prior to work being done, are claimed by workers, are
--   updated as the work is done, with a final update on completion.
--
-- semantics:
--   analysis_job_id         - autoincrement id
--   prev_analysis_job_id    - previous analysis_job which created this one (and passed input_id)
--   analysis_id             - the analysis_id needed to accomplish this job.
--   input_id                - input data passed into Analysis::RunnableDB to control the work
--   job_claim               - UUID set by workers as they fight over jobs
--   hive_id                 - link to hive table to define which worker claimed this job
--   status                  - state the job is in
--   retry_count             - number of times the job had to be reset when a worker failed to run it
--   completed               - timestamp when job was completed
--   branch_code             - switch-like branching control, default=1 (ie true)

CREATE TABLE analysis_job (
  analysis_job_id           int(10) NOT NULL auto_increment,
  prev_analysis_job_id      int(10) NOT NULL,  #analysis_job which created this from rules
  analysis_id               int(10) NOT NULL,
  input_id                  char(255) not null,
  job_claim                 char(40) NOT NULL default '', #UUID
  hive_id                   int(10) NOT NULL,
  status                    enum('READY','BLOCKED','CLAIMED','GET_INPUT','RUN','WRITE_OUTPUT','DONE','FAILED') DEFAULT 'READY' NOT NULL,
  retry_count               int(10) default 0 not NULL,
  completed                 datetime NOT NULL,
  branch_code               int(10) default 1 NOT NULL,
  runtime_msec              int(10) default 0 NOT NULL,
  query_count               int(10) default 0 NOT NULL,

  PRIMARY KEY                  (analysis_job_id),
  UNIQUE KEY input_id_analysis (input_id, analysis_id),
  INDEX claim_analysis_status  (job_claim, analysis_id, status),
  INDEX analysis_status        (analysis_id, status),
  INDEX hive_id                (hive_id)
);
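
-- Example (illustrative sketch of the claim protocol described above; <uuid>,
-- hive_id and analysis_id values are placeholders):
--   UPDATE analysis_job
--   SET    job_claim = '<uuid>', hive_id = 7, status = 'CLAIMED'
--   WHERE  analysis_id = 3 AND status = 'READY' AND job_claim = ''
--   LIMIT  5;
--   -- the worker then fetches exactly the rows it won
--   SELECT analysis_job_id, input_id
--   FROM   analysis_job
--   WHERE  job_claim = '<uuid>' AND hive_id = 7;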


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_job_file'
--
-- overview:
--   Table which holds paths to files created by an analysis_job,
--   e.g. STDOUT, STDERR, the temp directory,
--   or output data files created by the RunnableDB.
--   There can only be one entry of a given type for a given analysis_job.
--
-- semantics:
--   analysis_job_id    - foreign key
--   hive_id            - link to hive table to define which worker claimed this job
--   retry              - copy of retry_count of job as it was run
--   type               - type of file e.g. STDOUT, STDERR, TMPDIR, ...
--   path               - path to file or directory

CREATE TABLE analysis_job_file (
  analysis_job_id         int(10) NOT NULL,
  hive_id                 int(10) NOT NULL,
  retry                   int(10) NOT NULL,
  type                    varchar(16) NOT NULL default '',
  path                    varchar(255) NOT NULL,

  UNIQUE KEY job_hive_type  (analysis_job_id, hive_id, type),
  INDEX hive_id             (hive_id)
);
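
-- Example (illustrative sketch; the ids and path are hypothetical):
--   INSERT INTO analysis_job_file (analysis_job_id, hive_id, retry, type, path)
--   VALUES (42, 7, 0, 'STDOUT', '/tmp/worker_7/job_42.out');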


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_data'
--
-- overview:
--   Table which holds LONGTEXT data for use by the analysis system.
--   This data is general purpose and it's up to each analysis to
--   determine how to use it
--
-- semantics:
--   analysis_data_id   - primary id
--   data               - text blob which holds the data

CREATE TABLE analysis_data (
  analysis_data_id  int(10) NOT NULL auto_increment,
  data              longtext,

  PRIMARY KEY (analysis_data_id),
  KEY data (data(100))
);
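
-- Example (illustrative sketch):
--   INSERT INTO analysis_data (data) VALUES ('a large parameter string');
--   SELECT LAST_INSERT_ID();  -- the analysis_data_id that can be handed to an analysis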


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_stats'
--
-- overview:
--   Parallel table to analysis which provides high level statistics on the
--   state of an analysis and its jobs.  Used to provide a fast overview, and to
--   provide final approval of 'DONE', which is used by the blocking rules to
--   determine when to unblock other analyses.
--
-- semantics:
--   analysis_id          - foreign key to analysis table
--   status               - overview status of the analysis_jobs (cached state)
--   failed_job_tolerance - % of tolerated failed jobs

CREATE TABLE analysis_stats (
  analysis_id           int(10) NOT NULL,
  status                enum('BLOCKED', 'LOADING', 'SYNCHING', 'READY', 'WORKING', 'ALL_CLAIMED', 'DONE', 'FAILED')
                          DEFAULT 'READY' NOT NULL,
  batch_size            int(10) default 1 NOT NULL,
  avg_msec_per_job      int(10) default 0 NOT NULL,
  avg_input_msec_per_job   int(10) default 0 NOT NULL,
  avg_run_msec_per_job     int(10) default 0 NOT NULL,
  avg_output_msec_per_job  int(10) default 0 NOT NULL,
  hive_capacity         int(10) default 1 NOT NULL,
  behaviour             enum('STATIC', 'DYNAMIC') DEFAULT 'STATIC' NOT NULL,
  input_capacity        int(10) default 4 NOT NULL,
  output_capacity       int(10) default 4 NOT NULL,
  total_job_count       int(10) NOT NULL,
  unclaimed_job_count   int(10) NOT NULL,
  done_job_count        int(10) NOT NULL,
  max_retry_count       int(10) default 3 NOT NULL,
  failed_job_count      int(10) NOT NULL,
  failed_job_tolerance  int(10) default 0 NOT NULL,
  num_running_workers   int(10) default 0 NOT NULL,
  num_required_workers  int(10) NOT NULL,
  last_update           datetime NOT NULL,
  sync_lock             int(10) default 0 NOT NULL,
  
  UNIQUE KEY   (analysis_id)
);
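
-- Example (illustrative sketch; assumes failed_job_tolerance is compared against
-- the percentage of failed jobs, as described in the semantics above):
--   SELECT analysis_id
--   FROM   analysis_stats
--   WHERE  total_job_count > 0
--     AND  100 * failed_job_count / total_job_count > failed_job_tolerance;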

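-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_stats_monitor'
--
-- overview:
--   Same columns as analysis_stats with an extra leading 'time' column, presumably
--   so that timestamped snapshots of the statistics can be kept (a sketch of such a
--   snapshot follows the table definition below).
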
CREATE TABLE analysis_stats_monitor (
  time                  datetime NOT NULL default '0000-00-00 00:00:00',
  analysis_id           int(10) NOT NULL,
  status                enum('BLOCKED', 'LOADING', 'SYNCHING', 'READY', 'WORKING', 'ALL_CLAIMED', 'DONE', 'FAILED')
                          DEFAULT 'READY' NOT NULL,
  batch_size            int(10) default 1 NOT NULL,
  avg_msec_per_job      int(10) default 0 NOT NULL,
  avg_input_msec_per_job   int(10) default 0 NOT NULL,
  avg_run_msec_per_job     int(10) default 0 NOT NULL,
  avg_output_msec_per_job  int(10) default 0 NOT NULL,
  hive_capacity         int(10) default 1 NOT NULL,
  behaviour             enum('STATIC', 'DYNAMIC') DEFAULT 'STATIC' NOT NULL,
  input_capacity        int(10) default 4 NOT NULL,
  output_capacity       int(10) default 4 NOT NULL,
  total_job_count       int(10) NOT NULL,
  unclaimed_job_count   int(10) NOT NULL,
  done_job_count        int(10) NOT NULL,
  max_retry_count       int(10) default 3 NOT NULL,
  failed_job_count      int(10) NOT NULL,
  failed_job_tolerance  int(10) default 0 NOT NULL,
  num_running_workers   int(10) default 0 NOT NULL,
  num_required_workers  int(10) NOT NULL,
  last_update           datetime NOT NULL,
  sync_lock             int(10) default 0 NOT NULL
);
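
-- Example (illustrative sketch: taking a timestamped snapshot of analysis_stats;
-- the column order of analysis_stats_monitor is 'time' followed by the
-- analysis_stats columns, so the bare SELECT lines up):
--   INSERT INTO analysis_stats_monitor
--   SELECT NOW(), s.* FROM analysis_stats s;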

-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'monitor'
--
-- overview:
--   This table stores information about hive performance.
--
-- semantics:
--   time           - timestamp
--   workers        - number of running workers
--   throughput     - average number of completed jobs per second across the hive
--                    (this number is calculated using running workers only)
--   per_worker     - average number of completed jobs per second per worker
--                    (this number is calculated using running workers only)
--   analysis       - analysis(es) running at that time

CREATE TABLE monitor (
  time                  datetime NOT NULL default '0000-00-00 00:00:00',
  workers               int(10) NOT NULL default '0',
  throughput            float default NULL,
  per_worker            float default NULL,
  analysis              varchar(255) default NULL
);
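
-- Example (illustrative sketch; how the hive actually populates this table is not
-- specified here, and the throughput figures are left NULL):
--   INSERT INTO monitor (time, workers, throughput, per_worker, analysis)
--   SELECT NOW(), count(*), NULL, NULL, GROUP_CONCAT(DISTINCT a.logic_name)
--   FROM   hive h JOIN analysis a USING (analysis_id)
--   WHERE  h.cause_of_death = '';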


-- The last 3 tables are from the ensembl core schema: meta, analysis and analysis_description.
-- We create them with the 'IF NOT EXISTS' option in case they already exist in the DB.

################################################################################
#
# Table structure for table 'meta' (FROM THE CORE SCHEMA)
#

CREATE TABLE IF NOT EXISTS meta (

  meta_id                     INT NOT NULL AUTO_INCREMENT,
  species_id                  INT UNSIGNED DEFAULT 1,
  meta_key                    VARCHAR(40) NOT NULL,
  meta_value                  VARCHAR(255) BINARY NOT NULL,

  PRIMARY   KEY (meta_id),
  UNIQUE    KEY species_key_value_idx (species_id, meta_key, meta_value),
            KEY species_value_idx (species_id, meta_value)

) COLLATE=latin1_swedish_ci TYPE=MyISAM;


################################################################################
#
# Table structure for table 'analysis' (FROM THE CORE SCHEMA)
#
# semantics:
#
# analysis_id - internal id
# created
#   - date to distinguish newer and older versions of the same analysis. Not
#     well maintained so far.
# logic_name - string to identify the analysis. Used mainly inside pipeline.
# db, db_version, db_file
#  - db should be a database name, db_version the version of that db, and
#    db_file the file system location of that database;
#    probably wiser to generate from just db and configurations
# program, program_version, program_file
#  - The binary used to create a feature. Similar semantics to the above.
# module, module_version
#  - Perl module names (RunnableDBs usually) executing this analysis.
# parameters - a parameter string which is processed by the Perl module
# gff_source, gff_feature
#  - how to make a gff dump from features with this analysis


CREATE TABLE IF NOT EXISTS analysis (

  analysis_id                 SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT,
  created                     datetime DEFAULT '0000-00-00 00:00:00' NOT NULL,
  logic_name                  VARCHAR(40) NOT NULL,
  db                          VARCHAR(120),
  db_version                  VARCHAR(40),
  db_file                     VARCHAR(120),
  program                     VARCHAR(80),
  program_version             VARCHAR(40),
  program_file                VARCHAR(80),
  parameters                  VARCHAR(255),
  module                      VARCHAR(80),
  module_version              VARCHAR(40),
  gff_source                  VARCHAR(40),
  gff_feature                 VARCHAR(40),

  PRIMARY KEY (analysis_id),
  KEY logic_name_idx (logic_name),
  UNIQUE (logic_name)

) COLLATE=latin1_swedish_ci TYPE=MyISAM;
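
# Example (illustrative; the logic_name matches the URL example earlier in this file,
# the module is a hypothetical RunnableDB package name):
#   INSERT INTO analysis (created, logic_name, module, parameters)
#   VALUES (NOW(), 'blast_NCBI34', 'My::RunnableDB::Blast', '');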


################################################################################
#
# Table structure for table 'analysis_description' (FROM THE CORE SCHEMA)
#

CREATE TABLE IF NOT EXISTS analysis_description (

  analysis_id                  SMALLINT UNSIGNED NOT NULL,
  description                  TEXT,
  display_label                VARCHAR(255),
  displayable                  BOOLEAN NOT NULL DEFAULT 1,
  web_data                     TEXT,

  UNIQUE KEY analysis_idx (analysis_id)

) COLLATE=latin1_swedish_ci TYPE=MyISAM;