attrib_type.txt 15.3 KB
Newer Older
1 2 3
# this file is intended to hold all valid attrib_type
# table entries for all ensembl databases that we release
#
ml6's avatar
ml6 committed
4
# If you use the provided upload script, comment lines and
5 6 7 8 9
# empty lines should be automatically removed, all
# other lines should contain tab delimited database entries
# for the attrib_type table

# each attribute type should be preceded with a comment that
10 11
# describes its uses, unless its description field is deemed to be
# expressive enough
12

13
# need to document and find out about each attrib_type
14 15


16
1	embl_acc	EMBL accession
17

18 19 20 21
2	status	Status

3	synonym	Synonym

22
4	name	Name	Alternative/long name
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46

5	type	Type of feature

# A seq_region that is not represented in a more global coordinate system
# should get the toplevel attribute and value 1
# If you have more than one assembly in your database, this feature will
# not work as expected. You should then explicitly request features in a specific
# coordinate system
6	toplevel	Top Level	Top Level Non-Redundant Sequence Region

# The number of genes on each seq_region is counted and stored under this
# seq_region_attribute to be displayed on mapview. Mainly web code uses this.
7	GeneCount	Gene Count	Total Number of Genes

# Same as above for known genes
8	KnownGeneCount	Known Gene Count	Total Number of Known Genes

# same as above for pseudogenes. The criterion for a pseudogene is
# that the gene.type field matches /pseudogene/
9	PseudoGeneCount	PseudoGene Count	Total Number of PseudoGenes

# Snps on a seq_region. See above.
10	SNPCount	SNP Count	Total Number of SNPs

ml6's avatar
ml6 committed
47
# another seq_region attribute. When a seq_region should be used with a
48
# different codon table, this attribute's value should contain its number.
ml6's avatar
ml6 committed
49
# This is a bioperl codon table, find out from there which number to use
50 51 52 53 54 55 56 57 58 59 60 61 62
# for your seq_region
# Useful for mitochondria and bacteria with non-standard codon tables
11	codon_table	Codon Table	Alternate codon table

# This is an attribute for a translation. Values describe start and end
# position of a selenocysteine in a Translation (Amino Acid coordinates)
# Example: "123 123 U". This is the general sequence edit format.
# Other attributes with sequence edits for different reasons will come
# up in the future
12	_selenocysteine	Selenocysteine

13	bacend	bacend

ml6's avatar
ml6 committed
63
# Contains the htg phase for clones.
64 65 66 67 68
14	htg	htg	High Throughput phase attribute

15	miRNA	Micro RNA	Coordinates of the mature miRNA

# A sequence region that you consider not part of the reference genome should
69
# be tagged as non_ref in seq_region_attrib. Chromosome 6 haplotypes in human
ml6's avatar
ml6 committed
70
# are examples of that.
71 72 73 74
16	non_ref	Non Reference	Non Reference Sequence Region

17	sanger_project	Sanger Project name

ml6's avatar
ml6 committed
75
18	clone_name	Clone name
76

ml6's avatar
ml6 committed
77
19	fish	FISH location
78

ml6's avatar
ml6 committed
79
21	org	Sequencing centre
80

ml6's avatar
ml6 committed
81
22	method	Method
82 83 84 85 86 87 88

23	superctg	Super contig id

24	inner_start	Max start value

25	inner_end	Min end value

ml6's avatar
ml6 committed
89
26	state	Current state of clone
90 91 92

27	organisation	Organisation sequencing clone

ml6's avatar
ml6 committed
93
28	seq_len	Accession length
94

ml6's avatar
ml6 committed
95
29	fp_size	FP size
96 97

30	BACend_flag	BAC end flags
Steve Trevanion's avatar
Steve Trevanion committed
98

99
# used by Vega web code to link WebFPC
Steve Trevanion's avatar
Steve Trevanion committed
100
31	fpc_clone_id	fpc clone
101 102

# additional gene counts for Vega (see GeneCount for general description)
Steve Trevanion's avatar
Steve Trevanion committed
103 104 105
32	KnwnPCCount	protein_coding_KNOWN	Number of Known Protein Coding
33	NovPCCount	protein_coding_NOVEL	Number of Novel Protein Coding
34	NovPTCount	processed_transcript_NOVEL	Number of Novel Processed Transcripts
106 107
35	PutPTCount	processed_transcript_PUTATIVE	Number of Putative Processed Transcripts
36	PredPCCount	protein_coding_PREDICTED	Number of Predicted Protein Coding
Steve Trevanion's avatar
Steve Trevanion committed
108 109
37	IGGeneCount	IG_gene	Number of IG Genes
38	IGPsGenCount	IG_pseudogene	Number of IG Pseudogenes
Steve Trevanion's avatar
Steve Trevanion committed
110
39	TotPsCount	total_pseudogene	Total Number of Pseudogenes
Steve Trevanion's avatar
Steve Trevanion committed
111 112
#40	KnwnProcPsCount	processed_pseudogene	Number of Known Processed Pseudogenes
#41	KnwnUnPsCount	unprocessed_pseudogene	Number of Known Unprocessed Pseudogenes
Steve Trevanion's avatar
Steve Trevanion committed
113 114
42	KnwnPCProgCount	protein_coding_in_progress_KNOWN	Number of Known Protein Coding in progress
43	NovPCProgCount	protein_coding_in_progress_NOVEL	Number of Novel Protein Coding in progress
115 116

# Vega annotation stats
Steve Trevanion's avatar
Steve Trevanion committed
117 118 119
44	AnnotSeqLength	Annotated sequence length	Annotated Sequence
45	TotCloneNum	Total number of clones	Total Number of Clones
46	NumAnnotClone	Fully annotated clones	Number of Fully Annotated Clones
120 121

# Acknowledgements for manual annotation of this seq_region
122
47	ack	Acknowledgement	Acknowledgement for manual annotation
123 124

# old clone attribute
125
48	htg_phase	High throughput phase	High throughput genomic sequencing phase
126
49	description	Description	A general descriptive text attribute
127
50	chromosome	Chromosome	Chromosomal location for supercontigs that are not assembled
128 129
51	nonsense	Nonsense Mutation	Strain specific nonsense mutation

130 131 132 133
# misc Vega attribs
52	author	Author	Group responsible for Vega annotation
53	author_email	Author email address	Author email address
54	remark	Remark	Annotation remark
Steve Trevanion's avatar
typo  
Steve Trevanion committed
134
55	transcr_class	Transcript class	Transcript class
Steve Trevanion's avatar
Steve Trevanion committed
135
56	KnwnPTCount	processed_transcript_KNOWN	Number of Known Processed Transcripts
Steve Trevanion's avatar
Steve Trevanion committed
136
57	ccds	CCDS	CCDS identifier
137

Glenn Proctor's avatar
Glenn Proctor committed
138
# label frameshifts modelled as short (1,2,4,5 bp) introns
139
59	Frameshift	Frameshift	Frameshift modelled as intron
Steve Trevanion's avatar
Steve Trevanion committed
140 141

#more gene counts for Vega
Steve Trevanion's avatar
Steve Trevanion committed
142
60	PTCount	processed_transcript	Number of Processed Transcripts
Steve Trevanion's avatar
Steve Trevanion committed
143
61	PredPTCount	processed_transcript_PREDICTED	Number of Predicted Processed Transcripts
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161


62	ncRNA	Structure	RNA secondary structure line

63	skip_clone	skip clone	Skip clone in align_by_clone_identity.pl

# Gene counts for seq_region_stats.pl script
64	GeneNo_knwCod	known protein_coding Gene Count	Number of known protein_coding Genes
65	GeneNo_novCod	novel protein_coding Gene Count	Number of novel protein_coding Genes
66	GeneNo_rRNA	rRNA Gene Count	Number of rRNA Genes
67	GeneNo_pseudo	pseudogene Gene Count	Number of pseudogene Genes
68	GeneNo_snRNA	snRNA Gene Count	Number of snRNA Genes
69	GeneNo_snoRNA	snoRNA Gene Count	Number of snoRNA Genes
70	GeneNo_miRNA	miRNA Gene Count	Number of miRNA Genes
71	GeneNo_mscRNA	misc_RNA Gene Count	Number of misc_RNA Genes
72	GeneNo_scRNA	scRNA Gene Count	Number of scRNA Genes
73	GeneNo_MTrRNA	Mt_rRNA Gene Count	Number of Mt_rRNA Genes
74	GeneNo_MTtRNA	Mt_tRNA Gene Count	Number of Mt_tRNA Genes
Andreas Kusalananda Kähäri's avatar
Andreas Kusalananda Kähäri committed
162
75	GeneNo_RNA_pseu	scRNA_pseudogene Gene Count	Number of scRNA_pseudogene Genes
Glenn Proctor's avatar
Glenn Proctor committed
163
76	GeneNo_tRNA	tRNA Gene Count	Number of tRNA Genes
Bronwen Aken's avatar
Bronwen Aken committed
164
77	GeneNo_rettran	retrotransposed Gene Count	Number of retrotransposed Genes
165
78	GeneNo_snlRNA	snlRNA Gene Count	Number of snlRNA Genes
166
79	GeneNo_proc_tr	processed_transcript Gene Count	Number of processed transcript Genes
Glenn Proctor's avatar
Glenn Proctor committed
167 168
80	supercontig	SuperContig name	NULL
81	well_name	Well plate name	NULL
Ensembl Account's avatar
Ensembl Account committed
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183

# Added by fc1 26/11/06
82	bacterial	Bacterial
83	NovelCDSCount	Novel CDS Count
84	NovelTransCount	Novel Transcript Count
85	PutTransCount	Putative Transcript Count
86	PredTransCount	Predicted Transcript Count
87	UnclassPsCount	Unclass Ps count
88	KnwnprogCount	Known prog Count
89	NovCDSprogCount	Novel CDS prog count
90	bacend_well_nam	BACend well name
91	alt_well_name	Alt well name
92	TranscriptEdge	Transcript Edge
93	alt_embl_acc	Alt EMBL acc
94	alt_org	Alt org
ml6's avatar
ml6 committed
184

Damian Smedley's avatar
Damian Smedley committed
185
# anacode attribs added by ml6 29/11/06 - seen in yeast but not others
ml6's avatar
ml6 committed
186 187
95	intl_clone_name	International Clone Name
96	embl_version	EMBL Version
ml6's avatar
ml6 committed
188
97	chr	Chromosome Name	Chromosome Name Contained in the Assembly
189
98	equiv_asm	Equivalent EnsEMBL assembly	For full chromosomes made from NCBI AGPs
Damian Smedley's avatar
Damian Smedley committed
190 191 192
99	GeneNo_ncRNA	ncRNA Gene Count	Number of ncRNA Genes

# Ig segment gene counts for seq regions stats script ds5 2/2/07
193
100	GeneNo_Ig	Ig Gene Count	Number of Ig Genes
Damian Smedley's avatar
Damian Smedley committed
194 195 196 197 198

# cat missing atts
109	HitSimilarity	hit similarity	percentage id to parent transcripts
110	HitCoverage	hit coverage	coverage of parent transcripts
111	PropNonGap	proportion non gap	proportion non gap
ml6's avatar
ml6 committed
199
112	NumStops	number of stops
Damian Smedley's avatar
Damian Smedley committed
200
113	GapExons	gap exons	number of gap exons
ml6's avatar
ml6 committed
201
114	SourceTran	source transcript	source transcript
Damian Smedley's avatar
Damian Smedley committed
202 203 204
115	EndNotFound	end not found	end not found
116	StartNotFound	start not found	start not found

Steve Trevanion's avatar
Steve Trevanion committed
205 206 207 208 209 210
117	Frameshift Fra	Frameshift modelled as intron

# Other Vega attribs
118	ensembl_name	Ensembl name	Name of equivalent Ensembl chromosome
119	NoAnnotation	NoAnnotation	Clones without manual annotation
120	hap_contig	Haplotype contig	Contig present on a haplotype
ml6's avatar
ml6 committed
211 212 213 214 215 216 217

# loutre attribs added by ml6
121	annotated	Clone Annotation Status
122	keyword	Clone Keyword
123	hidden_remark	Hidden Remark
124	mRNA_start_NF	mRNA start not found
125	mRNA_end_NF	mRNA end not found
Steve Trevanion's avatar
Steve Trevanion committed
218 219
126	cds_start_NF	CDS start not found
127	cds_end_NF	CDS end not found
ml6's avatar
ml6 committed
220 221
128	write_access	Write access for Sequence Set	1 for writable , 0 for read-only
129	hidden	Hidden Sequence Set
Steve Trevanion's avatar
Steve Trevanion committed
222 223

# loutre attribs for vega production (st3)
Steve Trevanion's avatar
Steve Trevanion committed
224
130	vega_name	Vega name	Vega seq_region.name
225
131	vega_export_mod	Export mode	E (External), I (Internal) etc
Steve Trevanion's avatar
Steve Trevanion committed
226
132	vega_release	Vega release	Vega release number
Chao-Kung Chen's avatar
Chao-Kung Chen committed
227 228

# loutre attribs for assembly_tags (ck1)
Chao-Kung Chen's avatar
Chao-Kung Chen committed
229 230 231 232
133	atag_CLE	Clone_left_end	Clone_left_end feature marked in GAP database
134	atag_CRE	Clone_right_end	Clone_right_end feature marked in GAP database
135	atag_Misc	Misc	miscellaneous feature marked in GAP database
136	atag_Unsure	Unsure	region of uncertain DNA sequence marked in GAP database
233
137	MultAssem	Multiple Assembled seq region	Part of Seq Region is part of more than one assembly
Ian Sealy's avatar
Ian Sealy committed
234 235 236 237


140	wgs	WGS contig	WGS contig integrated into the map
141	bac	AGP clones	tiling path of clones
Glenn Proctor's avatar
Glenn Proctor committed
238 239 240 241

# Attribute for per-gene GC percentage

142	GeneGC	Gene GC	Percentage GC content for this gene
Stephen Rice's avatar
Stephen Rice committed
242 243

# vega
244
143	TotAssemblyLeng	Finished sequence length	Length of the assembly not counting sequence gaps
245 246

# Drosophila, only where the translation provided by flybase differs from that in our database by ONE amino acid
Bronwen Aken's avatar
Bronwen Aken committed
247
144	amino_acid_sub	Amino acid substitution	Some translations have been manually curated for amino acid substitutions. For example a stop codon may be changed to an amino acid in order to prevent premature truncation, or one amino acid can be substituted for another.
248 249
# Drosophila. Sometimes sequences have been manually altered to remove one base, and this alters the whole translation
145	_rna_edit	rna_edit	RNA edit
Sarah Dyer's avatar
 
Sarah Dyer committed
250 251 252 253

#genebuild - databases of removed transcripts
146	kill_reason	Kill Reason	Reason why a transcript has been killed
147	strip_UTR	Strip UTR	Transcript needs bad UTR removing
Steve Trevanion's avatar
Steve Trevanion committed
254 255 256

# vega
148	TotAssLength	Finished sequence length	Finished Sequence
Steve Trevanion's avatar
Steve Trevanion committed
257
149	PsCount	pseudogene	Number of Pseudogenes
Steve Trevanion's avatar
Steve Trevanion committed
258 259
#150	KnwnPsCount	known_pseudogene	Number of Known Pseudogenes
#151	KnwnTPsCount	known_transcribed_pseudogene	Number of Known Transcribed Pseudogenes
Steve Trevanion's avatar
Steve Trevanion committed
260 261 262
152	TotPTCount	total_processed_transcript	Total Number of Processed Transcripts
153	TotPCCount	total_protein_coding	Total Number of Protein Coding
154	NovNcCount	novel_non_coding	Number of Novel Non Coding
Steve Trevanion's avatar
Steve Trevanion committed
263
155	KnwnPolyPsCount	known_polymorphic	Number of Known Polymorphic Pseudogenes
Steve Trevanion's avatar
Steve Trevanion committed
264
156	PolyPsCount	polymorphic_pseudogene	Number of Polymorphic Pseudogenes
Steve Trevanion's avatar
Steve Trevanion committed
265
157	TotIGGeneCount	total_IG_gene	Total Number of IG Genes
Steve Trevanion's avatar
Steve Trevanion committed
266 267 268 269
158	ProcPsCount	proc_pseudogene	Number of Processed Pseudogenes
159	UnPsCount	unproc_pseudogene	Number of Unprocessed Pseudogenes
160	TPsCount	transcribed_pseudogene	Number of Transcribed Pseudogenes
161	TECCount	TEC	Number of TEC Genes
Steve Trevanion's avatar
Steve Trevanion committed
270 271
162	KnwnIGGeneCount	IG_gene_KNOWN	Number of Known IG Genes
163	KnwnIGPsGeCount	IG_pseudogene_KNOWN	Number of Known IG Pseudogenes
272 273 274

#pepstats attributes: will be calculated by release coordinator for the protview page

275 276 277 278 279
164	IsoPoint	Isoelectric point	Pepstats attributes
165	Charge	Charge	Pepstats attributes
166	MolecularWeight	Molecular weight	Pepstats attributes
167	NumResidues	Number of residues	Pepstats attributes
168	AvgResWeight	Ave. residue weight	Pepstats attributes
280

281 282
#old attribute used by the API to translate the sequence
170	initial_met	Initial methionine	Set first amino acid to methionine
283 284
#added for procavia capensis in release 51
171	NonGapHCov	NonGapHCov
285

286
# attribute that shows if a piece of supporting evidence was also used to build a Vega gene model
287 288 289 290 291
172	otter_support	otter support	Evidence ID that was used as supporting feature for building a gene in Vega

# Temporary attrib used during the merge of the havana and ensembl gene sets to establish a relation between a
# havana transcript that shares its CDS with an Ensembl transcript when they have different UTR structures
173	enst_link	enst link	Code to link a OTTT with an ENST when they both share the CDS of ENST
Julio Fernandez Banet's avatar
Julio Fernandez Banet committed
292 293 294 295 296

# Attribute that shows the genomic start position of an alternative ATG found for a transcript. Checks are made
# up to 200 bases upstream for genes with no UTR; for genes with a UTR, checks are made up to 200 bases into the
# UTR sequence, or to the full extent of the UTR if it is shorter than 200 bases.
174	upstream_ATG	upstream ATG	Alternative ATG found upstream of the defined as start ATG for the transcript
Steve Trevanion's avatar
Steve Trevanion committed
297 298

#more vega gene types
Steve Trevanion's avatar
Steve Trevanion committed
299 300 301
175	TPPsCount	transcribed_processed_pseudogene	Number of Transcribed Processed Pseudogenes
176	TUPsCount	transcribed_unprocessed_pseudogene	Number of Transcribed Unprocessed Pseudogenes
177	UniPsCount	unitary_pseudogene	Number of Unitary Pseudogenes
Steve Trevanion's avatar
Steve Trevanion committed
302 303
178	KnwnTECCount	TEC_KNOWN	Number of Known TEC genes
179	TotTECGeneCount	TEC_all	Total number of TEC genes
Steve Trevanion's avatar
Steve Trevanion committed
304 305
180	TUyPsCount	transcribed_unitary_pseudogene	Number of Transcribed Unitary Pseudogenes
181	PolyCount	polymorphic	Number of Polymorphic Genes
Steve Trevanion's avatar
Steve Trevanion committed
306 307 308 309 310
182	KnwnPolyCount	polymorphic	Number of Known Polymorphic Genes
183	KnwnTRCount	TR_gene_known	Number of Known TR Genes
184	TRGeneCount	TR_gene	Number of TR Genes
185	TRPsCount	TR_pseudo	Number of TR Pseudogenes

311 312
# attribute that shows if a supporting evidence was also used to build a
# Vega gene model
313 314 315 316
186	tp_ott_support	otter protein transcript support	Evidence ID that was used as supporting feature for building a gene in Vega
187	td_ott_support	otter dna transcript support	Evidence ID that was used as supporting feature for building a gene in Vega
188	ep_ott_support	otter protein exon support	Evidence ID that was used as supporting feature for building a gene in Vega
189	ed_ott_support	otter dna exon support	Evidence ID that was used as supporting feature for building a gene in Vega
317

318 319 320 321 322
# Attributes like split_tscript, incons_strands, incons_phases,
# is_folded, unwanted_evidence, exon_too_long, contains_stops,
# borked_coords, low_complex, and evi_coverage are added by
# BlastMiniGenewise in the build process.  They can be deleted out of
# transcript_attrib and attrib_type table
323 324

190	GeneNo_lincRNA	lincRNA Gene Count	Number of lincRNA Genes
325

326
# StopGained and StopLost are transcript attributes.
327 328 329
191	StopGained	SNP causes stop codon to be gained	This transcript has a variant that causes a stop codon to be gained in at least 10 percent of a HapMap population
192	StopLost	SNP causes stop codon to be lost	This transcript has a variant that causes a stop codon to be lost in at least 10 percent of a HapMap population

330 331 332 333 334 335 336
# For Ensembl Genomes dictyBase (2010-02-16).
# (code 193 should have a trailing '_' as it is auto-generated)
193	GeneNo_class_I_	class_I_RNA Gene Count	Number of class_I_RNA Genes
194	GeneNo_SRP_RNA	SRP_RNA Gene Count	Number of SRP_RNA Genes
195	GeneNo_class_II	class_II_RNA Gene Count	Number of class_II_RNA Genes
196	GeneNo_P_RNA	RNase_P_RNA Gene Count	Number of RNase_P_RNA Genes
197	GeneNo_RNase_MR	RNase_MRP_RNA Gene Count	Number of RNase_MRP_RNA Genes
337 338 339 340

# Intron loss due to a frameshift on the query genome
198	lost_frameshift	lost_frameshift	Frameshift on the query sequence is lost in the target sequence

341 342 343
# Attributes added as gene_attribs to Ensembl genes that fall within a genomic region where an LRG record exists
216	GeneInLRG	Gene in LRG	This gene is contained within an LRG region
217	GeneOverlapLRG	Gene overlaps LRG	This gene is partially overlapped by a LRG region (start or end outside LRG)