tables.sql 9.29 KB
Newer Older
Jessica Severin's avatar
Jessica Severin committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
------------------------------------------------------------------------------------
--
-- Table structure for table 'hive'
--
-- overview:
--   Table which tracks the workers of a hive as they exist out in the world.
--   Workers are created by inserting into this table so that there is only every
--   one instance of a worker object in the world.  As workers live and do work,
--   they update this table, and when they die they update.
--
-- semantics:
--

CREATE TABLE hive (
  hive_id          int(10) NOT NULL auto_increment,
  analysis_id      int(10) NOT NULL,
Abel Ureta-Vidal's avatar
Abel Ureta-Vidal committed
17
  beekeeper        varchar(80) DEFAULT '' NOT NULL,
Jessica Severin's avatar
Jessica Severin committed
18
  host	           varchar(40) DEFAULT '' NOT NULL,
19
  process_id       varchar(40) DEFAULT '' NOT NULL,
Jessica Severin's avatar
Jessica Severin committed
20
21
22
23
24
  work_done        int(11) DEFAULT '0' NOT NULL,
  born	           datetime NOT NULL,
  last_check_in    datetime NOT NULL,
  died             datetime DEFAULT NULL,
  cause_of_death   enum('', 'NO_WORK', 'JOB_LIMIT', 'LIFESPAN', 'FATALITY') DEFAULT '' NOT NULL,
25
  PRIMARY KEY (hive_id)
Jessica Severin's avatar
Jessica Severin committed
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
);


------------------------------------------------------------------------------------
--
-- Table structure for table 'simple_rule'
--
-- overview:
--   redesign of pipeline rule system.  Basic design is simplifed so that a
--   'rule' is simply a link from one analysis object to another
--     (ie an edge in a flowchart or object interaction diagram where
--      condition_analysis_id => goal_analysis_id)
--   Each analysis object (analysis_id) is a unique node in the
--   graph that describes the pipeline system.
--     (ie each analysis_id is an 'Instance' of the module it points to)
--   Main reason for redesign that by making a single table we can implement
--   a UNIQUE constraint so that the pipeline can modify itself as it runs
--   and avoid race conditions where the same link is created multiple times
--
-- semantics:
--   simple_rule_id           - internal ID
--   condition_analysis_id    - foreign key to analysis table analysis_id
--   goal_analysis_id         - foreign key to analysis table analysis_id
--   branch_code              - joined to analysis_job.branch_code to allow branching

CREATE TABLE simple_rule (
52
  simple_rule_id           int(10) unsigned not null auto_increment,
Jessica Severin's avatar
Jessica Severin committed
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
  condition_analysis_id    int(10) unsigned NOT NULL,
  goal_analysis_id         int(10) unsigned NOT NULL,
  branch_code              int(10) default 1 NOT NULL,

  PRIMARY KEY (simple_rule_id),
  UNIQUE (condition_analysis_id, goal_analysis_id)
);


------------------------------------------------------------------------------------
--
-- Table structure for table 'dataflow_rule'
--
-- overview:
--   Extension of simple_rule design except that goal(to) is now in extended URL format e.g.
--   mysql://ensadmin:<pass>@ecs2:3361/compara_hive_test?analysis.logic_name='blast_NCBI34'
--   (full network address of an analysis).  The only requirement is that there are rows in 
--   the analysis_job, analysis, dataflow_rule, and hive tables so that the following join
--   works on the same database 
--   WHERE analysis.analysis_id = dataflow_rule.from_analysis_id 
--   AND   analysis.analysis_id = analysis_job.analysis_id
--   AND   analysis.analysis_id = hive.analysis_id
--
--   These are the rules used to create entries in the analysis_job table where the
--   input_id (control data) is passed from one analysis to the next to define work.
--  
--   The analysis table will be extended so that it can specify different read and write
--   databases, with the default being the database the analysis is on
--
-- semantics:
83
--   dataflow_rule_id     - internal ID
Jessica Severin's avatar
Jessica Severin committed
84
85
86
87
88
--   from_analysis_id     - foreign key to analysis table analysis_id
--   to_analysis_url      - foreign key to net distributed analysis logic_name reference
--   branch_code          - joined to analysis_job.branch_code to allow branching

CREATE TABLE dataflow_rule (
89
  dataflow_rule_id    int(10) unsigned not null auto_increment,
Jessica Severin's avatar
Jessica Severin committed
90
91
92
93
  from_analysis_id    int(10) unsigned NOT NULL,
  to_analysis_url     varchar(255) default '' NOT NULL,
  branch_code         int(10) default 1 NOT NULL,

94
  PRIMARY KEY (dataflow_rule_id),
Jessica Severin's avatar
Jessica Severin committed
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
  UNIQUE (from_analysis_id, to_analysis_url)
);


------------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_ctrl_rule'
--
-- overview:
--   These rules define a higher level of control.  These rules are used to turn
--   whole anlysis nodes on/off (READY/BLOCKED).
--   If any of the condition_analyses are not 'DONE' the ctrled_analysis is set BLOCKED
--   When all conditions become 'DONE' then ctrled_analysis is set to READY
--   The workers switch the analysis.status to 'WORKING' and 'DONE'.
--   But any moment if a condition goes false, the analysis is reset to BLOCKED.
--
--   This process of watching conditions and flipping the ctrled_analysis state
--   will be accomplished by another automous agent (CtrlWatcher.pm)
--
-- semantics:
--   condition_analysis_url  - foreign key to net distributed analysis reference
--   ctrled_analysis_id      - foreign key to analysis table analysis_id

118
CREATE TABLE analysis_ctrl_rule (
Jessica Severin's avatar
Jessica Severin committed
119
120
121
  condition_analysis_url     varchar(255) default '' NOT NULL,
  ctrled_analysis_id         int(10) unsigned NOT NULL,

122
  UNIQUE (condition_analysis_url, ctrled_analysis_id)
Jessica Severin's avatar
Jessica Severin committed
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
);


------------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_job'
--
-- overview:
--   The analysis_job is the heart of this sytem.  It is the kiosk or blackboard
--   where workers find things to do and then post work for other works to do.
--   The job_claim is a UUID set with an UPDATE LIMIT by worker as they fight
--   over the work.  These jobs are created prior to work being done, are claimed
--   by workers, are updated as the work is done, with a final update on completion.
--
-- semantics:
--   analysis_job_id        - autoincrement id
--   input_analysis_job_id  - previous analysis_job which created this one (and passed input_id)
--   analysis_id            - the analysis_id needed to accomplish this job.
--   input_id               - input data passed into Analysis:RunnableDB to control the work
--   job_claim              - UUID set by workers as the fight over jobs
--   hive_id                - link to hive table to define which worker claimed this job
--   status                 - state the job is in
--   retry_count            - number times job had to be reset when worker failed to run it
--   completed              - timestamp when job was completed
--   branch_code            - switch-like branching control, default=1 (ie true)

CREATE TABLE analysis_job (
  analysis_job_id        int(10) NOT NULL auto_increment,
  input_analysis_job_id  int(10) NOT NULL,  #analysis_job which created this from rules
  analysis_id            int(10) NOT NULL,
  input_id               varchar(100) not null,
  job_claim              varchar(40) NOT NULL default '', #UUID
  hive_id                int(10) NOT NULL,
156
  status                 enum('READY','BLOCKED','CLAIMED','GET_INPUT','RUN','WRITE_OUTPUT','DONE','FAILED') DEFAULT 'READY' NOT NULL,
Jessica Severin's avatar
Jessica Severin committed
157
158
159
160
161
162
  retry_count            int(10) default 0 not NULL,
  completed              datetime NOT NULL,
  branch_code            int(10) default 1 NOT NULL,

  PRIMARY KEY                  (analysis_job_id),
  UNIQUE KEY input_id_analysis (input_id, analysis_id),
163
  INDEX job_claim_analysis     (job_claim, analysis_id),
164
  INDEX job_analysis_status    (analysis_id, status),
165
  INDEX hive_id                (hive_id)
Jessica Severin's avatar
Jessica Severin committed
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
);


------------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_job_file'
--
-- overview:
--   Table which holds paths to files created by an analysis_job
--   e.g. STDOUT STDERR, temp directory
--   or output data files created by the RunnableDB
--   There can only be one entry of a certain type for a given analysis_job
--
-- semantics:
--   analysis_job_id    - foreign key
--   type               - type of file e.g. STDOUT, STDERR, TMPDIR, ...
--   path               - path to file or directory

CREATE TABLE analysis_job_file (
  analysis_job_id         int(10) NOT NULL,
  type                    varchar(16) NOT NULL default '',
  path                    varchar(255) NOT NULL,
  
  UNIQUE KEY   (analysis_job_id, type)
);


------------------------------------------------------------------------------------
--
-- Table structure for table 'analysis_stats'
--
-- overview:
--   Parallel table to analysis which provides high level statistics on the
--   state of an analysis and it's jobs.  Used to provide a fast overview, and to
--   provide final approval of 'DONE' which is used by the blocking rules to determine
--   when to unblock other analyses.  Also provides
--
-- semantics:
--   analysis_id    - foreign key to analysis table
--   status         - overview status of the analysis_jobs (cached state)

CREATE TABLE analysis_stats (
  analysis_id           int(10) NOT NULL,
  status                enum('BLOCKED', 'READY', 'WORKING', 'ALL_CLAIMED', 'DONE')
                          DEFAULT 'READY' NOT NULL,
211
212
  batch_size            int(10) default 1 NOT NULL,
  hive_capacity         int(10) default 1 NOT NULL,
213
214
215
  total_job_count       int(10) NOT NULL,
  unclaimed_job_count   int(10) NOT NULL,
  done_job_count        int(10) NOT NULL,
Jessica Severin's avatar
Jessica Severin committed
216
  num_required_workers  int(10) NOT NULL,
217
  last_update           timestamp NOT NULL,
Jessica Severin's avatar
Jessica Severin committed
218
219
220
221
222
  
  UNIQUE KEY   (analysis_id)
);