table.sql 102 KB
Newer Older
Magali Ruffier's avatar
Magali Ruffier committed
1
-- Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Tiago Grego's avatar
Tiago Grego committed
2
-- Copyright [2016-2019] EMBL-European Bioinformatics Institute
Tiago Grego's avatar
Tiago Grego committed
3
--
4 5 6
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
Tiago Grego's avatar
Tiago Grego committed
7
--
8
--      http://www.apache.org/licenses/LICENSE-2.0
Tiago Grego's avatar
Tiago Grego committed
9
--
10 11 12 13 14 15
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

16
# Ensembl core table definitions
17
#
18

19 20 21 22
# Conventions:
#  - use lower case and underscores
#  - internal ids are integers named tablename_id
#  - same name is given in foreign key relations
23

24 25

/**
26
@header Assembly Tables
27
@colour #C70C09
28 29 30 31

*/


Monika Komorowska's avatar
Monika Komorowska committed
32 33
/**
@table assembly
34
@colour #C70C09
35 36 37 38 39 40 41 42
@desc The assembly table states, which parts of seq_regions are exactly equal. It enables to transform coordinates between seq_regions.
Typically this contains how chromosomes are made of contigs, clones out of contigs, and chromosomes out of supercontigs.
It allows you to artificially chunk chromosome sequence into smaller parts.

The data in this table defines the "static golden path", i.e. the best effort draft full genome sequence as determined by the UCSC or NCBI (depending which assembly you are using).
Each row represents a component, e.g. a contig,  (comp_seq_region_id, FK from seq_region table) at least part of which is present in the golden path.
The part of the component that is in the path is delimited by fields cmp_start and cmp_end (start < end), and the absolute position within the golden path chromosome (or other appropriate assembled structure) (asm_seq_region_id) is given by asm_start and asm_end.

43 44
@column asm_seq_region_id            Assembly sequence region id. Primary key, internal identifier. Foreign key references to the @link seq_region table.
@column cmp_seq_region_id            Component sequence region id. Foreign key references to the @link seq_region table.
45 46 47 48 49 50 51
@column asm_start                    Start absolute position within the golden path chromosome.
@column asm_end                      End absolute position within the golden path chromosome.
@column cmp_start                    Component start position within the golden path chromosome.
@column cmp_end                      Component start position within the golden path chromosome.
@column ori                          Orientation: 1 - sense; -1 - antisense.


Monika Komorowska's avatar
Monika Komorowska committed
52
@see seq_region
53
@see supercontigs
54 55

*/
Monika Komorowska's avatar
Monika Komorowska committed
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74


CREATE TABLE assembly (

  asm_seq_region_id           INT(10) UNSIGNED NOT NULL,
  cmp_seq_region_id           INT(10) UNSIGNED NOT NULL,
  asm_start                   INT(10) NOT NULL,
  asm_end                     INT(10) NOT NULL,
  cmp_start                   INT(10) NOT NULL,
  cmp_end                     INT(10) NOT NULL,
  ori                         TINYINT  NOT NULL,

  KEY cmp_seq_region_idx (cmp_seq_region_id),
  KEY asm_seq_region_idx (asm_seq_region_id, asm_start),
  UNIQUE KEY all_idx (asm_seq_region_id, cmp_seq_region_id, asm_start, asm_end, cmp_start, cmp_end, ori)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;


Monika Komorowska's avatar
Monika Komorowska committed
75 76
/**
@table assembly_exception
77
@colour #C70C09
78 79 80
@desc Allows multiple sequence regions to point to the same sequence, analogous to a symbolic link in a filesystem pointing to the actual file.
This mechanism has been implemented specifically to support haplotypes and PARs, but may be useful for other similar structures in the future.

81
@column assembly_exception_id       Assembly exception sequence region id. Primary key, internal identifier.
82 83 84 85 86
@column seq_region_id               Sequence region id. Foreign key references to the @link seq_region table.
@column seq_region_start            Sequence start position.
@column seq_region_end              Sequence end position.
@column exc_type                    Exception type, e.g. PAR, HAP - haplotype.
@column exc_seq_region_id           Exception sequence region id. Foreign key references to the @link seq_region table.
87 88
@column exc_seq_region_start        Exception sequence start position.
@column exc_seq_region_end          Exception sequence end position.
89 90 91 92 93 94 95
@column ori                         Orientation: 1 - sense; -1 - antisense.


@see assembly

*/

Monika Komorowska's avatar
Monika Komorowska committed
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116

CREATE TABLE assembly_exception (

  assembly_exception_id       INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
  seq_region_id               INT(10) UNSIGNED NOT NULL,
  seq_region_start            INT(10) UNSIGNED NOT NULL,
  seq_region_end              INT(10) UNSIGNED NOT NULL,
  exc_type                    ENUM('HAP', 'PAR',
                                'PATCH_FIX', 'PATCH_NOVEL') NOT NULL,
  exc_seq_region_id           INT(10) UNSIGNED NOT NULL,
  exc_seq_region_start        INT(10) UNSIGNED NOT NULL,
  exc_seq_region_end          INT(10) UNSIGNED NOT NULL,
  ori                         INT NOT NULL,

  PRIMARY KEY (assembly_exception_id),
  KEY sr_idx (seq_region_id, seq_region_start),
  KEY ex_idx (exc_seq_region_id, exc_seq_region_start)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;


Monika Komorowska's avatar
Monika Komorowska committed
117 118
/**
@table coord_system
119
@colour #C70C09
120 121 122 123
@desc Stores information about the available co-ordinate systems for the species identified through the species_id field.
Note that for each species, there must be one co-ordinate system that has the attribute "top_level" and one that has the attribute "sequence_level".

@column coord_system_id      Primary key, internal identifier.
124 125 126 127 128
@column species_id           Indentifies the species for multi-species databases.
@column name                 Co-oridinate system name, e.g. 'chromosome', 'contig', 'scaffold' etc.
@column version              Assembly.
@column rank                 Co-oridinate system rank.
@column attrib               Co-oridinate system attrib (e.g. "top_level", "sequence_level").
129

130
@see seq_region
Nathan Johnson's avatar
Nathan Johnson committed
131
@see meta_coord
Nathan Johnson's avatar
Nathan Johnson committed
132
@see meta
133 134

*/
Monika Komorowska's avatar
Monika Komorowska committed
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153


CREATE TABLE coord_system (

  coord_system_id             INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
  species_id                  INT(10) UNSIGNED NOT NULL DEFAULT 1,
  name                        VARCHAR(40) NOT NULL,
  version                     VARCHAR(255) DEFAULT NULL,
  rank                        INT NOT NULL,
  attrib                      SET('default_version', 'sequence_level'),

  PRIMARY   KEY (coord_system_id),
  UNIQUE    KEY rank_idx (rank, species_id),
  UNIQUE    KEY name_idx (name, version, species_id),
            KEY species_idx (species_id)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;


154 155
/**
@table data_file
156
@colour #C70C09
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176
@desc Allows the storage of flat file locations used to store large quanitities of data currently unsuitable in a traditional database table.

@column data_file_id      Auto-increment surrogate primary key
@column coord_system_id   Coordinate system this file is linked to. Used to decipher the assembly version it was mapped to
@column analysis_id       Analysis this file is linked to
@column name              Name of the file
@column version_lock      Indicates that this file is only compatible with the current Ensembl release version
@column absolute          Flags that the URL given is fully resolved and should be used without question
@column url               Optional path to the file (can be absolute or relative)
@column file_type         Type of file e.g. BAM, BIGBED, BIGWIG and VCF
*/

CREATE TABLE data_file (
  data_file_id      INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
  coord_system_id   INT(10) UNSIGNED NOT NULL,
  analysis_id       SMALLINT UNSIGNED NOT NULL,
  name              VARCHAR(100) NOT NULL,
  version_lock      TINYINT(1) DEFAULT 0 NOT NULL,
  absolute          TINYINT(1) DEFAULT 0 NOT NULL,
  url               TEXT,
177
  file_type         ENUM('BAM','BAMCOV','BIGBED','BIGWIG','VCF'),
178 179

  PRIMARY KEY (data_file_id),
180 181 182
  UNIQUE KEY df_unq_idx (coord_system_id, analysis_id, name, file_type),
  KEY df_name_idx (name),
  KEY df_analysis_idx (analysis_id)
183 184 185
) COLLATE=latin1_swedish_ci ENGINE=MyISAM;


Monika Komorowska's avatar
Monika Komorowska committed
186
/**
Monika Komorowska's avatar
Monika Komorowska committed
187
@table dna
188
@colour #C70C09
189
@desc Contains DNA sequence. This table has a 1:1 relationship with the seq_region table.
190

Monika Komorowska's avatar
Monika Komorowska committed
191 192
@column seq_region_id           Primary key, internal identifier. Foreign key references to the @link seq_region table.
@column sequence                DNA sequence.
193

Monika Komorowska's avatar
Monika Komorowska committed
194 195
@see seq_region
*/
Monika Komorowska's avatar
Monika Komorowska committed
196 197


Monika Komorowska's avatar
Monika Komorowska committed
198
CREATE TABLE dna (
Monika Komorowska's avatar
Monika Komorowska committed
199

Monika Komorowska's avatar
Monika Komorowska committed
200 201
  seq_region_id       INT(10) UNSIGNED NOT NULL,
  sequence            LONGTEXT NOT NULL,
Monika Komorowska's avatar
Monika Komorowska committed
202

Monika Komorowska's avatar
Monika Komorowska committed
203
  PRIMARY KEY (seq_region_id)
Monika Komorowska's avatar
Monika Komorowska committed
204

Monika Komorowska's avatar
Monika Komorowska committed
205
) COLLATE=latin1_swedish_ci ENGINE=MyISAM MAX_ROWS=750000 AVG_ROW_LENGTH=19000;
Monika Komorowska's avatar
Monika Komorowska committed
206

207

Monika Komorowska's avatar
Monika Komorowska committed
208

Magali Ruffier's avatar
Magali Ruffier committed
209 210
/**
@table genome_statistics
211
@colour #C70C09
Magali Ruffier's avatar
Magali Ruffier committed
212 213 214 215 216 217 218 219 220 221 222 223 224
@desc Contains genome and assembly related statistics
      These include but are not limited to: feature counts, sequence lengths

@column genome_statistics_id  Primary key, internal identifier.
@column statistic            Name of the statistics
@column value                 Corresponding value of the statistics (count/length)
@column species_id            Indentifies the species for multi-species databases.
@column attrib_type_id        To distinguish similar statistics for different cases
@column timestamp             Date the statistics was generated

*/


225
CREATE TABLE genome_statistics (
Magali Ruffier's avatar
Magali Ruffier committed
226 227 228

  genome_statistics_id     INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
  statistic                VARCHAR(128) NOT NULL,
229
  value                    BIGINT(11) UNSIGNED DEFAULT '0' NOT NULL,
Magali Ruffier's avatar
Magali Ruffier committed
230 231
  species_id               INT UNSIGNED DEFAULT 1,
  attrib_type_id           INT(10) UNSIGNED DEFAULT NULL,
232
  timestamp                DATETIME DEFAULT NULL,
Magali Ruffier's avatar
Magali Ruffier committed
233 234

  PRIMARY KEY (genome_statistics_id),
235
  UNIQUE KEY stats_uniq (statistic, attrib_type_id, species_id)
Magali Ruffier's avatar
Magali Ruffier committed
236 237 238 239 240 241

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;




Monika Komorowska's avatar
Monika Komorowska committed
242
/**
243
@table karyotype
244
@colour #C70C09
245
@desc Describes bands that can be stained on the chromosome.
246

247 248 249 250 251 252
@column karyotype_id            Primary key, internal identifier.
@column seq_region_id           Foreign key references to the @link seq_region table.
@column seq_region_start        Sequence start position.
@column seq_region_end          Sequence end position.
@column band                    Band.
@column stain                   Stain.
253 254

*/
Monika Komorowska's avatar
Monika Komorowska committed
255 256


257 258
CREATE TABLE karyotype (
  karyotype_id                INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
Monika Komorowska's avatar
Monika Komorowska committed
259 260 261
  seq_region_id               INT(10) UNSIGNED NOT NULL,
  seq_region_start            INT(10) UNSIGNED NOT NULL,
  seq_region_end              INT(10) UNSIGNED NOT NULL,
262 263
  band                        VARCHAR(40) DEFAULT NULL,
  stain                       VARCHAR(40) DEFAULT NULL,
Monika Komorowska's avatar
Monika Komorowska committed
264

265 266
  PRIMARY KEY (karyotype_id),
  KEY region_band_idx (seq_region_id,band)
Monika Komorowska's avatar
Monika Komorowska committed
267

Monika Komorowska's avatar
Monika Komorowska committed
268
) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
Monika Komorowska's avatar
Monika Komorowska committed
269 270


271
/**
272
@table meta
273
@colour #C70C09
274 275
@desc Stores data about the data in the current schema. Taxonomy information, version information and the default value for the type column in the assembly table are stored here.
Unlike other tables, data in the meta table is stored as key-value pairs. Also stores (via assembly.mapping keys) the relationships between co-ordinate systems in the assembly table.
276

277 278 279 280
The species_id field of the meta table is used in multi-species databases and makes it possible to have species-specific meta key-value pairs.
The species-specific meta key-value pairs needs to be repeated for each species_id.
Entries in the meta table that are not specific to any one species, such as the schema_version key and any other schema-related information must have their species_id field set to NULL.
The default species_id, and the only species_id value allowed in single-species databases, is 1.
281 282


283 284 285 286
@column meta_id                    Primary key, internal identifier.
@column species_id                 Indentifies the species for multi-species databases.
@column meta_key                   Name of the meta entry, e.g. "schema_version".
@column meta_value                 Corresponding value of the key, e.g. "61".
287

288 289
@see assembly
@see coord_system
290

291
*/
292 293


294
CREATE TABLE IF NOT EXISTS meta (
295

296 297 298
  meta_id                     INT NOT NULL AUTO_INCREMENT,
  species_id                  INT UNSIGNED DEFAULT 1,
  meta_key                    VARCHAR(40) NOT NULL,
299
  meta_value                  VARCHAR(255) NOT NULL,
300

301 302 303
  PRIMARY   KEY (meta_id),
  UNIQUE    KEY species_key_value_idx (species_id, meta_key, meta_value),
            KEY species_value_idx (species_id, meta_value)
304

305
) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
Monika Komorowska's avatar
Monika Komorowska committed
306

307

308 309
# Add schema type and schema version to the meta table.
INSERT INTO meta (species_id, meta_key, meta_value) VALUES
Tiago Grego's avatar
Tiago Grego committed
310
  (NULL, 'schema_type', 'core'),
Tiago Grego's avatar
Tiago Grego committed
311
  (NULL, 'schema_version', '97');
312 313 314 315 316

# Patches included in this schema file:
# NOTE: At start of release cycle, remove patch entries from last release.
# NOTE: Avoid line-breaks in values.
INSERT INTO meta (species_id, meta_key, meta_value)
Tiago Grego's avatar
Tiago Grego committed
317
  VALUES (NULL, 'patch', 'patch_96_97_a.sql|schema_version');
318

319 320
INSERT INTO meta (species_id, meta_key, meta_value)
  VALUES (NULL, 'patch', 'patch_96_97_b.sql|biotype_so_term');
Tiago Grego's avatar
Tiago Grego committed
321

Marek Szuba's avatar
Marek Szuba committed
322 323 324 325 326 327 328 329 330 331
INSERT INTO meta (species_id, meta_key, meta_value)
  VALUES (NULL, 'patch', 'patch_96_97_c.sql|rnaproduct_tables');

INSERT INTO meta (species_id, meta_key, meta_value)
  VALUES (NULL, 'patch', 'patch_96_97_d.sql|add_object_type_rnaproduct');

INSERT INTO meta (species_id, meta_key, meta_value)
  VALUES (NULL, 'patch', 'patch_96_97_e.sql|add_stable_id_event_type_rnaproduct');


Monika Komorowska's avatar
Monika Komorowska committed
332
/**
333
@table meta_coord
334
@colour #C70C09
335
@desc Describes which co-ordinate systems the different feature tables use.
336

337 338 339
@column table_name              Ensembl database table name.
@column coord_system_id         Foreign key references to the @link coord_system table.
@column max_length              Longest sequence length.
340

341
@see coord_system
342 343

*/
344

345

346
CREATE TABLE meta_coord (
347

348 349 350
  table_name                  VARCHAR(40) NOT NULL,
  coord_system_id             INT(10) UNSIGNED NOT NULL,
  max_length                  INT,
351

352
  UNIQUE KEY cs_table_name_idx (coord_system_id, table_name)
353

354
) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
355

356

Monika Komorowska's avatar
Monika Komorowska committed
357
/**
358
@table seq_region
359
@colour #C70C09
360 361 362
@desc Stores information about sequence regions. The primary key is used as a pointer into the dna table so that actual sequence can be obtained, and the coord_system_id allows sequence regions of multiple types to be stored.
Clones, contigs and chromosomes are all now stored in the seq_region table. Contigs are stored with the co-ordinate system 'contig'.
The relationship between contigs and clones is stored in the assembly table. The relationships between contigs and chromosomes, and between contigs and supercontigs, are stored in the assembly table.
363

364 365 366 367
@column seq_region_id             Primary key, internal identifier.
@column name                      Sequence region name.
@column coord_system_id           Foreign key references to the @link coord_system table.
@column length                    Sequence length.
368

369 370 371

@see dna
@see coord_system
372 373

*/
Monika Komorowska's avatar
Monika Komorowska committed
374 375


376
CREATE TABLE seq_region (
Monika Komorowska's avatar
Monika Komorowska committed
377

378
  seq_region_id               INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
379
  name                        VARCHAR(255) NOT NULL,
380 381
  coord_system_id             INT(10) UNSIGNED NOT NULL,
  length                      INT(10) UNSIGNED NOT NULL,
Monika Komorowska's avatar
Monika Komorowska committed
382

383 384 385
  PRIMARY KEY (seq_region_id),
  UNIQUE KEY name_cs_idx (name, coord_system_id),
  KEY cs_idx (coord_system_id)
Monika Komorowska's avatar
Monika Komorowska committed
386 387

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
Monika Komorowska's avatar
Monika Komorowska committed
388 389


Monika Komorowska's avatar
Monika Komorowska committed
390
/**
391
@table seq_region_synonym
392
@colour #C70C09
393
@desc Allows for storing multiple names for sequence regions.
Monika Komorowska's avatar
Monika Komorowska committed
394

395 396 397 398 399 400 401 402 403 404 405 406
@column seq_region_synonym_id           Primary key, internal identifier.
@column seq_region_id                   Foreign key references to the @link seq_region table.
@column synonym                         Alternative name for sequence region.
@column external_db_id                  Foreign key references to the @link external_db table.

*/


CREATE TABLE seq_region_synonym (

  seq_region_synonym_id       INT UNSIGNED NOT NULL  AUTO_INCREMENT,
  seq_region_id               INT(10) UNSIGNED NOT NULL,
407
  synonym                     VARCHAR(250) NOT NULL,
408
  external_db_id              INT UNSIGNED,
409 410

  PRIMARY KEY (seq_region_synonym_id),
411
  UNIQUE KEY syn_idx (synonym, seq_region_id),
412 413 414 415 416 417 418
  KEY seq_region_idx (seq_region_id)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;


/**
@table seq_region_attrib
419
@colour #C70C09
420 421 422
@desc Allows "attributes" to be defined for certain seq_regions. Provides a way of storing extra information about particular seq_regions without adding extra columns to the seq_region table. e.g.

@column seq_region_id       Foreign key references to the @link seq_region table.
423
@column attrib_type_id      Foreign key references to the @link attrib_type table.
424
@column value               Attribute value.
Monika Komorowska's avatar
Monika Komorowska committed
425

426 427
@see seq_region
@see attrib_type
428 429

*/
430

431

432
CREATE TABLE seq_region_attrib (
Patrick Meidl's avatar
Patrick Meidl committed
433

434
  seq_region_id               INT(10) UNSIGNED NOT NULL DEFAULT '0',
Monika Komorowska's avatar
Monika Komorowska committed
435 436
  attrib_type_id              SMALLINT(5) UNSIGNED NOT NULL DEFAULT '0',
  value                       TEXT NOT NULL,
437

Monika Komorowska's avatar
Monika Komorowska committed
438 439
  KEY type_val_idx (attrib_type_id, value(40)),
  KEY val_only_idx (value(40)),
Magali Ruffier's avatar
Magali Ruffier committed
440
  KEY seq_region_idx (seq_region_id),
441
  UNIQUE KEY region_attribx (seq_region_id, attrib_type_id, value(500))
442

443
) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
444

445

Monika Komorowska's avatar
Monika Komorowska committed
446
/**
447
@header Fundamental Tables
448
@colour   #808000
449
*/
450

Monika Komorowska's avatar
Monika Komorowska committed
451

452 453
/**
@table alt_allele
454
@colour   #808000
455
@desc Stores information about genes on haplotypes that may be orthologous.
456

457 458 459
@column alt_allele_id          Primary key, internal identifier.
@column gene_id                Foreign key references to the @link gene table.
@column alt_allele_group_id    A group ID to show which alleles are related
460

461
*/
462

463
CREATE TABLE alt_allele (
Tiago Grego's avatar
Tiago Grego committed
464 465
        alt_allele_id INT UNSIGNED AUTO_INCREMENT,
        alt_allele_group_id INT UNSIGNED NOT NULL,
466 467 468
        gene_id INT UNSIGNED NOT NULL,

        PRIMARY KEY (alt_allele_id),
469
        UNIQUE KEY gene_idx (gene_id),
470
        KEY (gene_id,alt_allele_group_id)
471

472
) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
473

474 475
/**
@table alt_allele_attrib
476
@colour   #808000
477
@desc Holds all the different attributes assigned to individual alleles.
478

479 480 481
@column alt_allele_id           Primary key, internal identifier.
@column attrib                  Enum of attributes assigned to alternative alleles
*/
Monika Komorowska's avatar
Monika Komorowska committed
482 483


484 485 486 487 488 489 490 491 492 493 494 495 496 497 498
CREATE TABLE alt_allele_attrib (
         alt_allele_id INT UNSIGNED,
         attrib ENUM('IS_REPRESENTATIVE',
                     'IS_MOST_COMMON_ALLELE',
                     'IN_CORRECTED_ASSEMBLY',
                     'HAS_CODING_POTENTIAL',
                     'IN_ARTIFICIALLY_DUPLICATED_ASSEMBLY',
                     'IN_SYNTENIC_REGION',
                     'HAS_SAME_UNDERLYING_DNA_SEQUENCE',
                     'IN_BROKEN_ASSEMBLY_REGION',
                     'IS_VALID_ALTERNATE',
                     'SAME_AS_REPRESENTATIVE',
                     'SAME_AS_ANOTHER_ALLELE',
                     'MANUALLY_ASSIGNED',
                     'AUTOMATICALLY_ASSIGNED'),
499

500
        KEY aa_idx (alt_allele_id,attrib)
501

502
) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
503 504


505
/**
Magali Ruffier's avatar
Magali Ruffier committed
506
@table alt_allele_group
507
@colour   #808000
508
@desc A minimal table used for tracking unique alt_allele_group_id's. MySQL does not allow multiple autoincrement fields. Further information about a group could be added here at a later date.
Monika Komorowska's avatar
Monika Komorowska committed
509

510 511
@column alt_allele_group_id     Primary key and only column.
*/
512

513
CREATE TABLE alt_allele_group (
514

515
         alt_allele_group_id INT UNSIGNED AUTO_INCREMENT,
Monika Komorowska's avatar
Monika Komorowska committed
516

517
         PRIMARY KEY (alt_allele_group_id)
518

Monika Komorowska's avatar
Monika Komorowska committed
519
) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
520

521

522 523
/**
@table analysis
524
@colour   #808000
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551
@desc Usually describes a program and some database that together are used to create a feature on a piece of sequence.
Each feature is marked with an analysis_id. The most important column is logic_name, which is used by the webteam to render a feature correctly on contigview (or even retrieve the right feature).
Logic_name is also used in the pipeline to identify the analysis which has to run in a given status of the pipeline.
The module column tells the pipeline which Perl module does the whole analysis, typically a RunnableDB module.

@column analysis_id                 Primary key, internal identifier.
@column created                     Date to distinguish newer and older versions off the same analysis.
@column logic_name                  String to identify the analysis. Used mainly inside pipeline.
@column db                          Database name.
@column db_version                  Database version.
@column db_file                     File system location of the database.
@column program                     The binary used to create a feature.
@column program_version             The binary version.
@column program_file                File system location of the binary.
@column parameters                  A parameter string which is processed by the perl module.
@column module                      Perl module names (RunnableDBS usually) executing this analysis.
@column module_version              Perl module version.
@column gff_source                  How to make a gff dump from features with this analysis.
@column gff_feature                 How to make a gff dump from features with this analysis.

@see analysis_description

*/

CREATE TABLE IF NOT EXISTS analysis (

  analysis_id                 SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT,
552
  created                     datetime DEFAULT NULL,
553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569
  logic_name                  VARCHAR(128) NOT NULL,
  db                          VARCHAR(120),
  db_version                  VARCHAR(40),
  db_file                     VARCHAR(120),
  program                     VARCHAR(80),
  program_version             VARCHAR(40),
  program_file                VARCHAR(80),
  parameters                  TEXT,
  module                      VARCHAR(80),
  module_version              VARCHAR(40),
  gff_source                  VARCHAR(40),
  gff_feature                 VARCHAR(40),

  PRIMARY KEY (analysis_id),
  UNIQUE KEY logic_name_idx (logic_name)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;
Monika Komorowska's avatar
Monika Komorowska committed
570 571


Monika Komorowska's avatar
Monika Komorowska committed
572
/**
573
@table analysis_description
574
@colour   #808000
575
@desc Allows the storage of a textual description of the analysis, as well as a "display label", primarily for the EnsEMBL web site.
576

577 578 579 580 581
@column analysis_id            Primary key, internal identifier. Foreign key references to the @link analysis table.
@column description            Textual description of the analysis.
@column display_label          Display label for the EnsEMBL web site.
@column displayable            Flag indicating if the analysis description is to be displayed on the EnsEMBL web site.
@column web_data               Other data used by the EnsEMBL web site.
Monika Komorowska's avatar
Monika Komorowska committed
582

583
@see analysis
Monika Komorowska's avatar
Monika Komorowska committed
584 585 586 587

*/


588
CREATE TABLE IF NOT EXISTS analysis_description (
Monika Komorowska's avatar
Monika Komorowska committed
589

590 591 592
  analysis_id                  SMALLINT UNSIGNED NOT NULL,
  description                  TEXT,
  display_label                VARCHAR(255) NOT NULL,
593
  displayable                  TINYINT(1) NOT NULL DEFAULT 1,
594
  web_data                     TEXT,
Monika Komorowska's avatar
Monika Komorowska committed
595

596
  UNIQUE KEY analysis_idx (analysis_id)
Monika Komorowska's avatar
Monika Komorowska committed
597 598 599 600

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;


601
/**
602
@table attrib_type
603
@colour   #808000
604
@desc Provides codes, names and desctriptions of attribute types.
605

606 607 608 609 610 611
@column attrib_type_id       Primary key, internal identifier.
@column code                 Attribute code, e.g. 'GapExons'.
@column name                 Attribute name, e.g. 'gap exons'.
@column description          Attribute description, e.g. 'number of gap exons'.

@see seq_region_attrib
612 613 614

*/

615 616 617 618

CREATE TABLE attrib_type (

  attrib_type_id              SMALLINT(5) UNSIGNED NOT NULL AUTO_INCREMENT,
619
  code                        VARCHAR(20) NOT NULL DEFAULT '',
620 621 622 623 624 625
  name                        VARCHAR(255) NOT NULL DEFAULT '',
  description                 TEXT,

  PRIMARY KEY (attrib_type_id),
  UNIQUE KEY code_idx (code)

626 627 628 629
) COLLATE=latin1_swedish_ci ENGINE=MyISAM;


/**
630
@table dna_align_feature
631
@colour   #808000
632
@desc Stores DNA sequence alignments generated from Blast (or Blast-like) comparisons.
633

634 635 636 637 638 639 640 641 642
@column dna_align_feature_id        Primary key, internal identifier.
@column seq_region_id               Foreign key references to the @link seq_region table.
@column seq_region_start            Sequence start position.
@column seq_region_end              Sequence end position.
@column seq_region_strand           Sequence region strand: 1 - forward; -1 - reverse.
@column hit_start                   Alignment hit start position.
@column hit_end                     Alignment hit end position.
@column hit_strand                  Alignment hit strand: 1 - forward; -1 - reverse.
@column hit_name                    Alignment hit name.
643
@column analysis_id                 Foreign key references to the @link analysis table.
644 645 646 647 648 649
@column score                       Alignment score.
@column evalue                      Alignment e-value.
@column perc_ident                  Alignment percentage identity.
@column cigar_line                  Used to encode gapped alignments.
@column external_db_id              Foreign key references to the @link external_db table.
@column hcoverage                   Hit coverage.
650
@column align_type                  Alignment string type used
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671

@see cigar_line

*/

CREATE TABLE dna_align_feature (

  dna_align_feature_id        INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
  seq_region_id               INT(10) UNSIGNED NOT NULL,
  seq_region_start            INT(10) UNSIGNED NOT NULL,
  seq_region_end              INT(10) UNSIGNED NOT NULL,
  seq_region_strand           TINYINT(1) NOT NULL,
  hit_start                   INT NOT NULL,
  hit_end                     INT NOT NULL,
  hit_strand                  TINYINT(1) NOT NULL,
  hit_name                    VARCHAR(40) NOT NULL,
  analysis_id                 SMALLINT UNSIGNED NOT NULL,
  score                       DOUBLE,
  evalue                      DOUBLE,
  perc_ident                  FLOAT,
  cigar_line                  TEXT,
672
  external_db_id              INT UNSIGNED,
673
  hcoverage                   DOUBLE,
674
  align_type                  ENUM('ensembl', 'cigar', 'vulgar', 'mdtag') DEFAULT 'ensembl',
675 676 677 678 679 680

  PRIMARY KEY (dna_align_feature_id),
  KEY seq_region_idx (seq_region_id, analysis_id, seq_region_start, score),
  KEY seq_region_idx_2 (seq_region_id, seq_region_start),
  KEY hit_idx (hit_name),
  KEY analysis_idx (analysis_id),
Magali Ruffier's avatar
Magali Ruffier committed
681
  KEY external_db_idx (external_db_id)
682 683 684 685

) COLLATE=latin1_swedish_ci ENGINE=MyISAM MAX_ROWS=100000000 AVG_ROW_LENGTH=80;


686 687
/**
@table dna_align_feature_attrib
688
@colour   #808000
689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711
@desc Enables storage of attributes that relate to DNA sequence alignments.

@column dna_align_feature_id        Foreign key references to the @link dna_align_feature table.
@column attrib_type_id              Foreign key references to the @link attrib_type table.
@column value                       Attribute value.

@see dna_align_feature
*/

CREATE TABLE dna_align_feature_attrib (

  dna_align_feature_id        INT(10) UNSIGNED NOT NULL,
  attrib_type_id              SMALLINT(5) UNSIGNED NOT NULL,
  value                       TEXT NOT NULL,

  UNIQUE KEY dna_align_feature_attribx (dna_align_feature_id, attrib_type_id, value(500)),
  KEY dna_align_feature_idx (dna_align_feature_id),
  KEY type_val_idx (attrib_type_id, value(40)),
  KEY val_only_idx (value(40))

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;


712 713
/**
@table exon
714
@colour   #808000
715 716 717 718
@desc Stores data about exons. Associated with transcripts via exon_transcript. Allows access to contigs seq_regions.
Note seq_region_start is always less that seq_region_end, i.e. when the exon is on the other strand the seq_region_start is specifying the 3prime end of the exon.

@column exon_id                     Primary key, internal identifier.
719
@column seq_region_id               Foreign key references to the @link seq_region table.
720
@column seq_region_start            Sequence start position.
721
@column seq_region_end              Sequence end position.
722
@column seq_region_strand           Sequence region strand: 1 - forward; -1 - reverse.
723 724 725 726 727 728 729 730 731 732
@column phase                       The place where the intron lands inside the codon - 0 between codons, 1 between the 1st and second base, 2 between the second and 3rd base. Exons therefore have a start phase anda end phase, but introns have just one phase.
@column end_phase                   Usually, end_phase = (phase + exon_length)%3 but end_phase could be -1 if the exon is half-coding and its 3 prime end is UTR.
@column is_current                  1 - exon is current. Always set to 1 in ensembl dbs, but needed for otterlace dbs
@column is_constitutive             1 - exon is constitutive.
@column stable_id                   Release-independent stable identifier.
@column version                     Stable identifier version number.
@column created_date                Date created.
@column modified_date               Date modified.

@see exon_transcript
733 734 735

*/

736 737 738 739 740 741 742 743 744 745 746 747

CREATE TABLE exon (

  exon_id                     INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
  seq_region_id               INT(10) UNSIGNED NOT NULL,
  seq_region_start            INT(10) UNSIGNED NOT NULL,
  seq_region_end              INT(10) UNSIGNED NOT NULL,
  seq_region_strand           TINYINT(2) NOT NULL,

  phase                       TINYINT(2) NOT NULL,
  end_phase                   TINYINT(2) NOT NULL,

748 749
  is_current                  TINYINT(1) NOT NULL DEFAULT 1,
  is_constitutive             TINYINT(1) NOT NULL DEFAULT 0,
750 751

  stable_id                   VARCHAR(128) DEFAULT NULL,
752
  version                     SMALLINT UNSIGNED DEFAULT NULL,
753 754
  created_date                DATETIME DEFAULT NULL,
  modified_date               DATETIME DEFAULT NULL,
755 756

  PRIMARY KEY (exon_id),
757
  KEY seq_region_idx (seq_region_id, seq_region_start),
758
  KEY stable_id_idx (stable_id, version)
759

760 761 762 763
) COLLATE=latin1_swedish_ci ENGINE=MyISAM;


/**
764
@table exon_transcript
765
@colour   #808000
766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792
@desc Relationship table linking exons with transcripts. The rank column indicates the 5' to 3' position of the exon within the transcript, i.e. a rank of 1 means the exon is the 5' most within this transcript.

@column exon_id                Composite key. Foreign key references to the @link exon table.
@column transcript_id          Composite key. Foreign key references to the @link transcript table.
@column rank                   Composite key.

@see exon
@see transcript

*/

CREATE TABLE exon_transcript (

  exon_id                     INT(10) UNSIGNED NOT NULL,
  transcript_id               INT(10) UNSIGNED NOT NULL,
  rank                        INT(10) NOT NULL,

  PRIMARY KEY (exon_id,transcript_id,rank),
  KEY transcript (transcript_id),
  KEY exon (exon_id)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;



/**
@table gene
793
@colour   #808000
794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827
@desc Allows transcripts to be related to genes.

@column gene_id                     Primary key, internal identifier.
@column biotype                     Biotype, e.g. protein_coding.
@column analysis_id                 Foreign key references to the @link analysis table.
@column seq_region_id               Foreign key references to the @link seq_region table.
@column seq_region_start            Sequence start position.
@column seq_region_end              Sequence end position.
@column seq_region_strand           Sequence region strand: 1 - forward; -1 - reverse.
@column display_xref_id             External reference for EnsEMBL web site. Foreign key references to the @link xref table.
@column source                      e.g ensembl, havana etc.
@column description                 Gene description
@column is_current                  1 - gene is current. Always set to 1 in ensembl dbs, but needed for otterlace dbs
@column canonical_transcript_id     Foreign key references to the @link transcript table.
@column stable_id                   Release-independent stable identifier.
@column version                     Stable identifier version number.
@column created_date                Date created.
@column modified_date               Date modified.

@see transcript

*/


CREATE TABLE gene (

  gene_id                     INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
  biotype                     VARCHAR(40) NOT NULL,
  analysis_id                 SMALLINT UNSIGNED NOT NULL,
  seq_region_id               INT(10) UNSIGNED NOT NULL,
  seq_region_start            INT(10) UNSIGNED NOT NULL,
  seq_region_end              INT(10) UNSIGNED NOT NULL,
  seq_region_strand           TINYINT(2) NOT NULL,
  display_xref_id             INT(10) UNSIGNED,
828
  source                      VARCHAR(40) NOT NULL,
829
  description                 TEXT,
830
  is_current                  TINYINT(1) NOT NULL DEFAULT 1,
831 832
  canonical_transcript_id     INT(10) UNSIGNED NOT NULL,
  stable_id                   VARCHAR(128) DEFAULT NULL,
833
  version                     SMALLINT UNSIGNED DEFAULT NULL,
834 835
  created_date                DATETIME DEFAULT NULL,
  modified_date               DATETIME DEFAULT NULL,
836 837 838 839 840 841 842 843 844 845 846 847 848

  PRIMARY KEY (gene_id),
  KEY seq_region_idx (seq_region_id, seq_region_start),
  KEY xref_id_index (display_xref_id),
  KEY analysis_idx (analysis_id),
  KEY stable_id_idx (stable_id, version),
  KEY canonical_transcript_id_idx (canonical_transcript_id)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM;


/**
@table gene_attrib
849
@colour   #808000
850 851 852 853 854
@desc Enables storage of attributes that relate to genes.

@column gene_id             Foreign key references to the @link gene table.
@column attrib_type_id      Foreign key references to the @link attrib_type table.
@column value               Attribute value.
855 856 857 858 859


@see gene
*/

860 861 862 863 864 865 866 867 868

CREATE TABLE gene_attrib (

  gene_id                     INT(10) UNSIGNED NOT NULL DEFAULT '0',
  attrib_type_id              SMALLINT(5) UNSIGNED NOT NULL DEFAULT '0',
  value                       TEXT NOT NULL,

  KEY type_val_idx (attrib_type_id, value(40)),
  KEY val_only_idx (value(40)),
Magali Ruffier's avatar
Magali Ruffier committed
869
  KEY gene_idx (gene_id),
870
  UNIQUE KEY gene_attribx (gene_id, attrib_type_id, value(500))
871

872 873 874
) COLLATE=latin1_swedish_ci ENGINE=MyISAM;


Monika Komorowska's avatar
Monika Komorowska committed
875
/**
876
@table protein_align_feature
877
@colour   #808000
878
@desc Stores translation alignments generated from Blast (or Blast-like) comparisons.
Monika Komorowska's avatar
Monika Komorowska committed
879

880 881 882 883 884 885 886 887 888 889 890 891 892 893
@column protein_align_feature_id    Primary key, internal identifier.
@column seq_region_id               Foreign key references to the @link seq_region table.
@column seq_region_start            Sequence start position.
@column seq_region_end              Sequence end position.
@column seq_region_strand           Sequence region strand: 1 - forward; -1 - reverse.
@column hit_start                   Alignment hit start position.
@column hit_end                     Alignment hit end position.
@column hit_name                    Alignment hit name.
@column analysis_id                 Foreign key references to the @link analysis table.
@column score                       Alignment score.
@column evalue                      Alignment e-value.
@column perc_ident                  Alignment percentage identity.
@column cigar_line                  Used to encode gapped alignments.
@column external_db_id              Foreign key references to the @link external_db table.
894 895 896
@column hcoverage                   Alignment hit coverage.
@column align_type                  Alignment string type used

897

898
@see cigar_line
899 900 901 902


*/

903

904
CREATE TABLE protein_align_feature (
905

906 907 908 909 910 911 912 913 914 915 916 917 918
  protein_align_feature_id    INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
  seq_region_id               INT(10) UNSIGNED NOT NULL,
  seq_region_start            INT(10) UNSIGNED NOT NULL,
  seq_region_end              INT(10) UNSIGNED NOT NULL,
  seq_region_strand           TINYINT(1) DEFAULT '1' NOT NULL,
  hit_start                   INT(10) NOT NULL,
  hit_end                     INT(10) NOT NULL,
  hit_name                    VARCHAR(40) NOT NULL,
  analysis_id                 SMALLINT UNSIGNED NOT NULL,
  score                       DOUBLE,
  evalue                      DOUBLE,
  perc_ident                  FLOAT,
  cigar_line                  TEXT,
919
  external_db_id              INT UNSIGNED,
920
  hcoverage                   DOUBLE,
921
  align_type                  ENUM('ensembl', 'cigar', 'vulgar', 'mdtag') DEFAULT 'ensembl',
Monika Komorowska's avatar
Monika Komorowska committed
922

923 924 925 926 927 928 929 930 931 932 933 934
  PRIMARY KEY (protein_align_feature_id),
  KEY seq_region_idx (seq_region_id, analysis_id, seq_region_start, score),
  KEY seq_region_idx_2 (seq_region_id, seq_region_start),
  KEY hit_idx (hit_name),
  KEY analysis_idx (analysis_id),
  KEY external_db_idx (external_db_id)

) COLLATE=latin1_swedish_ci ENGINE=MyISAM MAX_ROWS=100000000 AVG_ROW_LENGTH=80;


/**
@table protein_feature
935
@colour   #808000
936 937 938 939 940 941 942 943 944 945 946 947 948 949 950
@desc Describes features on the translations (as opposed to the DNA sequence itself), i.e. parts of the peptide. In peptide co-ordinates rather than contig co-ordinates.

@column protein_feature_id          Primary key, internal identifier.
@column translation_id              Foreign key references to the @link translation table.
@column seq_start                   Sequence start position.
@column seq_end                     Sequence end position.
@column hit_start                   Alignment hit start position.
@column hit_end                     Alignment hit end position.
@column hit_name                    Alignment hit name.
@column analysis_id                 Foreign key references to the @link analysis table.
@column score                       Alignment score.
@column evalue                      Alignment E-value.
@column perc_ident                  Alignment percentage identity.
@column external_data               External data for protein feature.
@column hit_description             Optional description of the hit. This can be a human readable name
951 952
@column cigar_line                  Used to encode gapped alignments.
@column align_type                  Alignment string type used
953 954 955 956 957 958 959 960 961 962 963 964 965 966

@see analysis

*/


CREATE TABLE protein_feature (

  protein_feature_id          INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
  translation_id              INT(10) UNSIGNED NOT NULL,
  seq_start                   INT(10) NOT NULL,
  seq_end                     INT(10) NOT NULL,
  hit_start                   INT(10) NOT NULL,
  hit_end                     INT(10) NOT NULL,
967
  hit_name                    VARCHAR(40) CHARACTER SET latin1 COLLATE latin1_bin NOT NULL,
968 969 970 971