-- tables.sqlite : SQLite version of the Hive pipeline schema
-- (plus three tables borrowed from the ensembl core schema).

-- The first 3 tables are from the ensembl core schema: meta, analysis and analysis_description.
-- We create them with the 'IF NOT EXISTS' option in case they already exist in the DB.

-- ----------------------------------------------------------------------------------------------
--
-- Table structure for table 'meta' (FROM THE CORE SCHEMA)
--

-- Generic key/value storage, one pair per row, optionally scoped by species.
CREATE TABLE IF NOT EXISTS meta (
    meta_id    INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
    species_id INT UNSIGNED DEFAULT 1,
    meta_key   VARCHAR(40) NOT NULL,
    meta_value TEXT NOT NULL
);

-- Forbid duplicate (species, key, value) triples; the second index
-- supports lookups by value within a species.
CREATE UNIQUE INDEX IF NOT EXISTS species_key_value_idx ON meta (species_id, meta_key, meta_value);
CREATE INDEX IF NOT EXISTS species_value_idx ON meta (species_id, meta_value);



-- ----------------------------------------------------------------------------------------------
--
-- Table structure for table 'analysis' (FROM THE CORE SCHEMA)
--
-- semantics:
--
-- analysis_id - internal id
-- created
--   - date to distinguish newer and older versions off the same analysis. Not
--     well maintained so far.
-- logic_name - string to identify the analysis. Used mainly inside pipeline.
-- db, db_version, db_file
--  - db should be a database name, db version the version of that db
--    db_file the file system location of that database,
--    probably wiser to generate from just db and configurations
-- program, program_version,program_file
--  - The binary used to create a feature. Similar semantic to above
-- module, module_version
--  - Perl module names (RunnableDBS usually) executing this analysis.
-- parameters - a paramter string which is processed by the perl module
-- gff_source, gff_feature
--  - how to make a gff dump from features with this analysis


-- One row per analysis; see the semantics block above for per-column meaning.
CREATE TABLE IF NOT EXISTS analysis (
    analysis_id     INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
    created         datetime DEFAULT '0000-00-00 00:00:00' NOT NULL, -- MySQL zero-date sentinel, stored as text in SQLite
    logic_name      VARCHAR(40) NOT NULL,
    db              VARCHAR(120),
    db_version      VARCHAR(40),
    db_file         VARCHAR(255),
    program         VARCHAR(255),
    program_version VARCHAR(40),
    program_file    VARCHAR(255),
    parameters      TEXT,
    module          VARCHAR(255),
    module_version  VARCHAR(40),
    gff_source      VARCHAR(40),
    gff_feature     VARCHAR(40)
);

-- logic_name identifies an analysis uniquely across the pipeline.
CREATE UNIQUE INDEX IF NOT EXISTS logic_name_idx ON analysis (logic_name);


-- ----------------------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_description' (FROM THE CORE SCHEMA)
--

-- One optional descriptive row per analysis (display text for web front-ends).
CREATE TABLE IF NOT EXISTS analysis_description (
  analysis_id                 INTEGER NOT NULL PRIMARY KEY,
  description                 TEXT,
  display_label               VARCHAR(255),
  displayable                 BOOLEAN NOT NULL DEFAULT 1,
  web_data                    TEXT
);
-- Index names share a single namespace per SQLite database: the original name
-- 'analysis_idx' collided with the identically-named index on analysis_stats,
-- so (because of IF NOT EXISTS) only one of the two was ever created.
-- Renamed with a table prefix to make both indexes real.
-- NOTE(review): this index is redundant anyway, since analysis_id is already
-- the PRIMARY KEY; kept for compatibility with the MySQL schema.
CREATE UNIQUE INDEX IF NOT EXISTS analysis_description_analysis_idx ON analysis_description (analysis_id);



-- ------------------- now, to the 'proper' Hive tables: ----------------------------------------



-- ----------------------------------------------------------------------------------------------
--
-- Table structure for table 'worker'
--
-- overview:
--   Table which tracks the workers of a hive as they exist out in the world.
--   Workers are created by inserting into this table so that there is only every
--   one instance of a worker object in the world.  As workers live and do work,
--   they update this table, and when they die they update.
--
-- semantics:
--

-- One row per living (or dead) worker process; see overview comment above.
-- (Stray line-number residue from a web scrape has been removed from this
-- statement -- it made the CREATE TABLE invalid SQL.)
CREATE TABLE worker (
  worker_id        INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
  analysis_id      INTEGER NOT NULL,
  meadow_type      TEXT NOT NULL, /* enum('LSF', 'LOCAL') NOT NULL, */
  meadow_name      varchar(40) DEFAULT NULL,
  host             varchar(40) DEFAULT NULL,
  process_id       varchar(40) DEFAULT NULL,
  work_done        int(11) DEFAULT 0 NOT NULL,    -- was DEFAULT '0' (string literal on an integer column)
  status           TEXT DEFAULT 'READY' NOT NULL, /* enum('READY','COMPILATION','GET_INPUT','RUN','WRITE_OUTPUT','DEAD') DEFAULT 'READY' NOT NULL, */
  born             timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  last_check_in    datetime NOT NULL,
  died             datetime DEFAULT NULL,
  cause_of_death   TEXT DEFAULT '' NOT NULL  /* enum('', 'NO_WORK', 'JOB_LIMIT', 'HIVE_OVERLOAD', 'LIFESPAN', 'CONTAMINATED', 'KILLED_BY_USER', 'MEMLIMIT', 'RUNLIMIT', 'FATALITY') DEFAULT '' NOT NULL */
);
CREATE        INDEX IF NOT EXISTS analysis_id_status_idx ON worker (analysis_id, status);


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'dataflow_rule'
--
-- overview:
--   Extension of simple_rule design except that goal(to) is now in extended URL format e.g.
--   mysql://ensadmin:<pass>@ecs2:3361/compara_hive_test?analysis.logic_name='blast_NCBI34'
--   (full network address of an analysis).  The only requirement is that there are rows in 
--   the job, analysis, dataflow_rule, and worker tables so that the following join
--   works on the same database 
--   WHERE analysis.analysis_id = dataflow_rule.from_analysis_id 
--   AND   analysis.analysis_id = job.analysis_id
--   AND   analysis.analysis_id = worker.analysis_id
--
--   These are the rules used to create entries in the job table where the
--   input_id (control data) is passed from one analysis to the next to define work.
--  
--   The analysis table will be extended so that it can specify different read and write
--   databases, with the default being the database the analysis is on
--
-- semantics:
--   dataflow_rule_id     - internal ID
--   from_analysis_id     - foreign key to analysis table analysis_id
--   branch_code          - branch_code of the fan
--   funnel_dataflow_rule_id - dataflow_rule_id of the semaphored funnel (is NULL by default, which means dataflow is not semaphored)
--   to_analysis_url      - foreign key to net distributed analysis logic_name reference
--   input_id_template    - a template for generating a new input_id (not necessarily a hashref) in this dataflow; if undefined is kept original

-- Rules describing where a job's dataflow output is sent; see overview above.
-- (Stray line-number residue from a web scrape has been removed from this
-- statement -- it made the CREATE TABLE invalid SQL.)
CREATE TABLE dataflow_rule (
  dataflow_rule_id        INTEGER PRIMARY KEY AUTOINCREMENT,
  from_analysis_id        INTEGER NOT NULL,
  branch_code             int(10) DEFAULT 1 NOT NULL,
  funnel_dataflow_rule_id INTEGER DEFAULT NULL,
  to_analysis_url         varchar(255) DEFAULT '' NOT NULL,
  input_id_template       TEXT DEFAULT NULL
);
-- NOTE(review): NULLs are considered distinct in SQLite unique indexes, so rows
-- with a NULL funnel_dataflow_rule_id or input_id_template are not deduplicated
-- by this constraint -- this matches the original schema's behavior.
CREATE UNIQUE INDEX IF NOT EXISTS from_to_branch_template_idx ON dataflow_rule (from_analysis_id, branch_code, funnel_dataflow_rule_id, to_analysis_url, input_id_template);


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_ctrl_rule'
--
-- overview:
--   These rules define a higher level of control.  These rules are used to turn
--   whole anlysis nodes on/off (READY/BLOCKED).
--   If any of the condition_analyses are not 'DONE' the ctrled_analysis is set BLOCKED
--   When all conditions become 'DONE' then ctrled_analysis is set to READY
--   The workers switch the analysis.status to 'WORKING' and 'DONE'.
--   But any moment if a condition goes false, the analysis is reset to BLOCKED.
--
--   This process of watching conditions and flipping the ctrled_analysis state
--   will be accomplished by another automous agent (CtrlWatcher.pm)
--
-- semantics:
--   condition_analysis_url  - foreign key to net distributed analysis reference
--   ctrled_analysis_id      - foreign key to analysis table analysis_id

-- Higher-level blocking rules: a controlled analysis stays BLOCKED until all of
-- its condition analyses are 'DONE' (see overview comment above).
CREATE TABLE analysis_ctrl_rule (
    condition_analysis_url VARCHAR(255) DEFAULT '' NOT NULL,
    ctrled_analysis_id     UNSIGNED INTEGER NOT NULL
);

-- Each (condition, controlled-analysis) pair may appear only once.
CREATE UNIQUE INDEX IF NOT EXISTS condition_ctrled_idx ON analysis_ctrl_rule (condition_analysis_url, ctrled_analysis_id);


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'job'
--
-- overview:
--   The job is the heart of this system.  It is the kiosk or blackboard
--   where workers find things to do and then post work for other works to do.
--   These jobs are created prior to work being done, are claimed by workers,
--   are updated as the work is done, with a final update on completion.
--
-- semantics:
--   job_id                  - autoincrement id
--   prev_job_id             - previous job which created this one (and passed input_id)
--   analysis_id             - the analysis_id needed to accomplish this job.
--   input_id                - input data passed into Analysis:RunnableDB to control the work
--   worker_id               - link to worker table to define which worker claimed this job
--   status                  - state the job is in
--   retry_count             - number times job had to be reset when worker failed to run it
--   completed               - datetime when job was completed
--
--   semaphore_count         - if this count is >0, the job is conditionally blocked (until this count drops to 0 or below).
--                              Default=0 means "nothing is blocking me by default".
--   semaphored_job_id       - the job_id of job S that is waiting for this job to decrease S's semaphore_count.
--                              Default=NULL means "I'm not blocking anything by default".

-- The central blackboard table: one row per unit of work (see overview above).
CREATE TABLE job (
  job_id                    INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
  prev_job_id               INTEGER DEFAULT NULL,  /* the job that created this one using a dataflow rule */
  analysis_id               INTEGER NOT NULL,
  input_id                  char(255) NOT NULL,
  worker_id                 INTEGER DEFAULT NULL,
  status                    TEXT DEFAULT 'READY' NOT NULL, /* enum('READY','BLOCKED','CLAIMED','COMPILATION','GET_INPUT','RUN','WRITE_OUTPUT','DONE','FAILED','PASSED_ON') DEFAULT 'READY' NOT NULL, */
  retry_count               int(10) DEFAULT 0 NOT NULL,
  completed                 datetime DEFAULT NULL,
  runtime_msec              int(10) DEFAULT NULL,
  query_count               int(10) DEFAULT NULL,

  semaphore_count           INTEGER DEFAULT 0 NOT NULL,
  semaphored_job_id         INTEGER DEFAULT NULL
);
CREATE UNIQUE INDEX IF NOT EXISTS input_id_analysis_id_idx ON job (input_id, analysis_id);
CREATE        INDEX IF NOT EXISTS analysis_status_sema_retry_idx ON job (analysis_id, status, semaphore_count, retry_count);
-- Renamed from 'worker_idx': index names share a single namespace per SQLite
-- database, and 'worker_idx' was also used on job_message and job_file, so
-- (because of IF NOT EXISTS) only the first of the three was ever created.
CREATE        INDEX IF NOT EXISTS job_worker_idx ON job (worker_id);


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'job_message'
--
-- overview:
--      In case a job throws a message (via die/throw), this message is registered in this table.
--      It may or may not indicate that the job was unsuccessful via is_error flag.
--
-- semantics:
--       job_message_id     - an autoincremented primary id of the message
--               job_id     - the id of the job that threw the message
--            worker_id     - the worker in charge of the job at the moment
--                 time     - when the message was thrown
--                retry     - retry_count of the job when the message was thrown
--               status     - of the job when the message was thrown
--                  msg     - string that contains the message
--             is_error     - binary flag

-- Messages (die/throw output) registered by jobs; see overview above.
-- (Stray line-number residue from a web scrape has been removed from this
-- statement -- it made the CREATE TABLE invalid SQL.)
CREATE TABLE job_message (
  job_message_id            INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
  job_id                    INTEGER NOT NULL,
  worker_id                 INTEGER NOT NULL,
  time                      timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  retry                     int(10) DEFAULT 0 NOT NULL,
  status                    TEXT DEFAULT 'UNKNOWN', /* enum('UNKNOWN', 'COMPILATION', 'GET_INPUT', 'RUN', 'WRITE_OUTPUT', 'PASSED_ON') DEFAULT 'UNKNOWN', */
  msg                       TEXT,
  is_error                  BOOLEAN
);
-- Table-prefixed names: index names share a single namespace per SQLite
-- database, and the original 'worker_idx'/'job_idx' collided with indexes on
-- other tables, so (because of IF NOT EXISTS) they were silently skipped.
CREATE        INDEX IF NOT EXISTS job_message_worker_idx ON job_message (worker_id);
CREATE        INDEX IF NOT EXISTS job_message_job_idx ON job_message (job_id);


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'job_file'
--
-- overview:
--   Table which holds paths to files created by jobs
--   e.g. STDOUT STDERR, temp directory
--   or output data files created by the RunnableDB
--   There can only be one entry of a certain type for a given job
--
-- semantics:
--   job_id             - foreign key
--   worker_id          - link to worker table to define which worker claimed this job
--   retry              - copy of retry_count of job as it was run
--   stdout_file        - path to the job's STDOUT log
--   stderr_file        - path to the job's STDERR log

-- Paths to files (STDOUT/STDERR, output data) created by jobs; see overview above.
-- (Stray line-number residue from a web scrape has been removed from this
-- statement -- it made the CREATE TABLE invalid SQL.)
CREATE TABLE job_file (
  job_id                INTEGER NOT NULL,
  retry                 int(10) NOT NULL,
  worker_id             INTEGER NOT NULL,
  stdout_file           varchar(255),
  stderr_file           varchar(255)
);
-- Only one entry per (job, retry); worker index renamed from 'worker_idx'
-- because index names share a single namespace per SQLite database and the
-- same name was used on the job and job_message tables.
CREATE UNIQUE INDEX IF NOT EXISTS job_retry ON job_file (job_id, retry);
CREATE        INDEX IF NOT EXISTS job_file_worker_idx ON job_file (worker_id);


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_data'
--
-- overview:
--   Table which holds LONGTEXT data for use by the analysis system.
--   This data is general purpose and it's up to each analysis to
--   determine how to use it
--
-- semantics:
--   analysis_data_id   - primary id
--   data               - text blob which holds the data

-- General-purpose LONGTEXT storage; each analysis decides how to use the blob.
CREATE TABLE analysis_data (
    analysis_data_id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
    data             longtext
);

-- NOTE(review): indexing a whole longtext column can be expensive for large
-- blobs (the MySQL original indexed a prefix); kept for schema compatibility.
CREATE INDEX IF NOT EXISTS data_idx ON analysis_data (data);


-- Per-meadow parameters and description for each resource class (rc_id).
CREATE TABLE resource_description (
    rc_id       INTEGER NOT NULL,
    meadow_type TEXT, /* enum('LSF', 'LOCAL') DEFAULT 'LSF' NOT NULL, */
    parameters  varchar(255) DEFAULT '' NOT NULL,
    description varchar(255),
    PRIMARY KEY(rc_id, meadow_type)
);


-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_stats'
--
-- overview:
--   Parallel table to analysis which provides high level statistics on the
--   state of an analysis and it's jobs.  Used to provide a fast overview, and to
--   provide final approval of 'DONE' which is used by the blocking rules to determine
--   when to unblock other analyses.  Also provides
--
-- semantics:
--   analysis_id          - foreign key to analysis table
--   status               - overview status of the jobs (cached state)
--   failed_job_tolerance - % of tolerated failed jobs
--   rc_id                - resource class id (analyses are grouped into disjoint classes)

-- Cached, per-analysis aggregate statistics; see overview comment above.
-- (Stray line-number residue from a web scrape has been removed from this
-- statement -- it made the CREATE TABLE invalid SQL.)
CREATE TABLE analysis_stats (
  analysis_id               INTEGER NOT NULL,
  status                    TEXT DEFAULT 'READY' NOT NULL, /* enum('BLOCKED', 'LOADING', 'SYNCHING', 'READY', 'WORKING', 'ALL_CLAIMED', 'DONE', 'FAILED') DEFAULT 'READY' NOT NULL, */
  batch_size                int(10) DEFAULT 1 NOT NULL,
  avg_msec_per_job          int(10) DEFAULT 0 NOT NULL,
  avg_input_msec_per_job    int(10) DEFAULT 0 NOT NULL,
  avg_run_msec_per_job      int(10) DEFAULT 0 NOT NULL,
  avg_output_msec_per_job   int(10) DEFAULT 0 NOT NULL,
  hive_capacity             int(10) DEFAULT 1 NOT NULL,
  behaviour                 TEXT DEFAULT 'STATIC' NOT NULL, /* enum('STATIC', 'DYNAMIC') DEFAULT 'STATIC' NOT NULL, */
  input_capacity            int(10) NOT NULL DEFAULT 4,
  output_capacity           int(10) NOT NULL DEFAULT 4,
  total_job_count           int(10) NOT NULL DEFAULT 0,
  unclaimed_job_count       int(10) NOT NULL DEFAULT 0,
  done_job_count            int(10) NOT NULL DEFAULT 0,
  max_retry_count           int(10) NOT NULL DEFAULT 3,
  failed_job_count          int(10) NOT NULL DEFAULT 0,
  failed_job_tolerance      int(10) NOT NULL DEFAULT 0,
  num_running_workers       int(10) NOT NULL DEFAULT 0,
  num_required_workers      int(10) NOT NULL DEFAULT 0,
  last_update               datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
  sync_lock                 int(10) NOT NULL DEFAULT 0,
  rc_id                     INTEGER NOT NULL DEFAULT 0,
  can_be_empty              TINYINT UNSIGNED DEFAULT 0 NOT NULL,
  priority                  TINYINT DEFAULT 0 NOT NULL
);
-- Renamed from 'analysis_idx': index names share a single namespace per SQLite
-- database, and 'analysis_idx' was also used on analysis_description, so
-- (because of IF NOT EXISTS) only one of the two was ever created.
CREATE UNIQUE INDEX IF NOT EXISTS analysis_stats_analysis_idx ON analysis_stats (analysis_id);


-- Time-stamped snapshots of analysis_stats rows (same columns plus 'time').
-- (Stray line-number residue from a web scrape has been removed from this
-- statement -- it made the CREATE TABLE invalid SQL.)
CREATE TABLE analysis_stats_monitor (
  time                      datetime NOT NULL DEFAULT '0000-00-00 00:00:00', -- MySQL zero-date sentinel, stored as text in SQLite
  analysis_id               INTEGER NOT NULL,
  status                    TEXT DEFAULT 'READY' NOT NULL, /* enum('BLOCKED', 'LOADING', 'SYNCHING', 'READY', 'WORKING', 'ALL_CLAIMED', 'DONE', 'FAILED') DEFAULT 'READY' NOT NULL, */
  batch_size                int(10) DEFAULT 1 NOT NULL,
  avg_msec_per_job          int(10) DEFAULT 0 NOT NULL,
  avg_input_msec_per_job    int(10) DEFAULT 0 NOT NULL,
  avg_run_msec_per_job      int(10) DEFAULT 0 NOT NULL,
  avg_output_msec_per_job   int(10) DEFAULT 0 NOT NULL,
  hive_capacity             int(10) DEFAULT 1 NOT NULL,
  behaviour                 TEXT DEFAULT 'STATIC' NOT NULL, /* enum('STATIC', 'DYNAMIC') DEFAULT 'STATIC' NOT NULL, */
  input_capacity            int(10) DEFAULT 4 NOT NULL,
  output_capacity           int(10) DEFAULT 4 NOT NULL,
  total_job_count           int(10) NOT NULL DEFAULT 0,
  unclaimed_job_count       int(10) NOT NULL DEFAULT 0,
  done_job_count            int(10) NOT NULL DEFAULT 0,
  max_retry_count           int(10) DEFAULT 3 NOT NULL,
  failed_job_count          int(10) NOT NULL DEFAULT 0,
  failed_job_tolerance      int(10) DEFAULT 0 NOT NULL,
  num_running_workers       int(10) DEFAULT 0 NOT NULL,
  num_required_workers      int(10) NOT NULL DEFAULT 0,
  last_update               datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
  sync_lock                 int(10) DEFAULT 0 NOT NULL,
  rc_id                     INTEGER DEFAULT 0 NOT NULL,
  can_be_empty              TINYINT UNSIGNED DEFAULT 0 NOT NULL,
  priority                  TINYINT DEFAULT 0 NOT NULL
);

-- ---------------------------------------------------------------------------------
--
-- Table structure for table 'monitor'
--
-- overview:
--   This table stores information about hive performance.
--
-- semantics:
--   time           - datetime
--   workers        - number of running workers
--   throughput     - average numb of completed jobs per sec. of the hive
--                    (this number is calculated using running workers only)
--   per_worker     - average numb of completed jobs per sec. per worker
--                    (this number is calculated using running workers only)
--   analysis       - analysis(es) running at that time

-- Hive performance snapshots; see semantics comment above.
CREATE TABLE monitor (
  time                  datetime NOT NULL DEFAULT '0000-00-00 00:00:00', -- MySQL zero-date sentinel, stored as text in SQLite
  workers               int(10) NOT NULL DEFAULT 0,       -- was DEFAULT '0' (string literal on an integer column)
  throughput            float DEFAULT NULL,
  per_worker            float DEFAULT NULL,
  analysis              varchar(255) DEFAULT NULL  /* not just one, but a list of logic_names */
);


-- Auto add schema version to database (should be overridden by Compara's table.sql).
-- (Commit-author residue from a web scrape has been removed around this statement.)
-- NOTE(review): NULLs are distinct in SQLite unique indexes, so with an explicit
-- NULL species_id the OR IGNORE clause will not deduplicate repeated runs via
-- species_key_value_idx -- this matches the original statement's behavior.
INSERT OR IGNORE INTO meta (species_id, meta_key, meta_value) VALUES (NULL, 'schema_version', '67');