Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ensembl
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Jira
Code
Merge requests
1
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ensembl-gh-mirror
ensembl
Commits
21659b89
Commit
21659b89
authored
17 years ago
by
Bronwen Aken
Browse files
Options
Downloads
Patches
Plain Diff
Now parses out ALL Dbxrefs from the gff file.
parent
6ebcec1f
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm
+547
-67
547 additions, 67 deletions
...ipts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm
with
547 additions
and
67 deletions
misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm
+
547
−
67
View file @
21659b89
...
@@ -5,6 +5,8 @@
...
@@ -5,6 +5,8 @@
#2L gene [...] ID=CG11023;Dbxref=FlyBase:FBan0011023,FlyBase:FBgn0031208;gbunit=AE003590;synonym=CG11023
#2L gene [...] ID=CG11023;Dbxref=FlyBase:FBan0011023,FlyBase:FBgn0031208;gbunit=AE003590;synonym=CG11023
#2L mRNA [...] ID=CG11023-RA;Dbxref=FlyBase:FBtr008,FlyBase:FBgn003;dbxref_2nd=Gadfly:CG11023-RA;synonym=CG23-RA
#2L mRNA [...] ID=CG11023-RA;Dbxref=FlyBase:FBtr008,FlyBase:FBgn003;dbxref_2nd=Gadfly:CG11023-RA;synonym=CG23-RA
#3R FlyBase gene 8084471 8128509 . + .
#ID=FBgn0003651;Name=svp;Alias=FBgn0011337,FBgn0011492,FBgn0011510,FBgn0038010,FBgn0063263;Ontology_term=SO:0000010,SO:0000087,GO:0004872,GO:0007270,GO:0042331,GO:0005515,GO:0007419,GO:0007503,GO:0045449,GO:0004879,GO:0003700,GO:0005634,GO:0007465,GO:0007462,GO:0007464,GO:0007510,GO:0005737,GO:0007507,GO:0007417,GO:0001700,GO:0006357,GO:0007165,GO:0043565,GO:0003707,GO:0008270,GO:0048749,GO:0001752;Dbxref=FlyBase:FBan0011502,FlyBase_Annotation_IDs:CG11502,INTERPRO:IPR013088,GB:AC007724,GB:AE003695,GB_protein:AAF54773,GB_protein:AAN13541,GB_protein:AAF54774,GB:AI108883,GB:AI402121,GB:AY075272,GB_protein:AAL68139,GB:AY119490,GB_protein:AAM50144,GB:AY129452,GB_protein:AAM76194,GB:BG633933,GB:BI167911,GB:CZ468719,GB:CZ472606,GB:CZ475640,GB:CZ475641,GB:CZ477001,GB:CZ482253,GB:CZ485541,GB:CZ485543,GB:G00472,GB:M28863,GB_protein:AAA62770,GB:M28864,GB_protein:AAA03014,UniProt/Swiss-Prot:P16375,UniProt/Swiss-Prot:P16376,UniProt/TrEMBL:Q8MRP3,INTERPRO:IPR000536,INTERPRO:IPR001628,INTERPRO:IPR001723,INTERPRO:IPR003068,INTERPRO:IPR008946,INTERPRO:IPR013629,dedb:9161,flygrid:66603,hybrigenics:521960,if:/newgene/sevenup.htm,orthologs:ensAG:ENSANGG00000002454,orthologs:ensAM:ENSAPMG00000000116,orthologs:ensCF:ENSCAFG00000008076,orthologs:ensDM:CG12744,orthologs:ensDR:ENSDARG00000017168,orthologs:ensFR:SINFRUG00000127451,orthologs:ensGG:ENSGALG00000007000,orthologs:ensHS:ENSG00000185551,orthologs:ensMM:ENSMUSG00000030551,orthologs:ensPT:ENSPTRG00000007484,orthologs:ensRN:ENSRNOG00000010308,orthologs:ensTN:GSTENG00006911001,orthologs:modCB:WBGene00030075;cyto_range=87B4-87B5;gbunit=AE014297;
package
XrefParser::
Flybase_dmel_GFFv3_Parser
;
package
XrefParser::
Flybase_dmel_GFFv3_Parser
;
...
@@ -44,7 +46,8 @@ sub new {
...
@@ -44,7 +46,8 @@ sub new {
$self
->
external_source_db_name
('
flybase_gff
');
$self
->
external_source_db_name
('
flybase_gff
');
# my @gff_obj =qw( CDS exon gene mRNA ncRNA pseudogene rRNA snRNA snoRNA tRNA );
# my @gff_obj =qw( CDS exon gene mRNA ncRNA pseudogene rRNA snRNA snoRNA tRNA );
my
@gff_obj
=
qw( CDS gene mRNA)
;
# this array may need to change between releases so check that it's updated
my
@gff_obj
=
qw( CDS gene mRNA ncRNA snRNA tRNA rRNA pseudogene snoRNA miRNA)
;
$self
->
gff_object_types
(
\
@gff_obj
);
$self
->
gff_object_types
(
\
@gff_obj
);
#
#
...
@@ -52,32 +55,73 @@ sub new {
...
@@ -52,32 +55,73 @@ sub new {
#
#
$self
->
gff_name
("
Name=
");
$self
->
gff_name
("
Name=
");
$self
->
gff_synonym
("
Alias=
");
$self
->
gff_dbxref
("
Dbxref=
");
$self
->
gff_dbxref
("
Dbxref=
");
$self
->
gff_2nd_dbxref
("
dbxref_2nd=
");
$self
->
gff_synonym
("
synonym=
");
$self
->
gff_2nd_synonym
("
synonym_2nd=
");
#
#
# hard-coded source-names for different objects out of ./sql/populate_metadata.sql
# hard-coded source-names for different objects out of ./sql/populate_metadata.sql
#
#
# For Alias
$self
->
source_name_fbgn
('
flybase_gene_id
');
# source-name for FBgn
$self
->
source_name_synonym
('
flybase_synonym
');
# source for any Alias
$self
->
source_name_fbtr
('
flybase_transcript_id
');
# source-name for FBtr
# For Name
$self
->
source_name_fbpp
('
flybase_polypeptide_id
');
# source-name for FBpp
$self
->
source_name_name_prefix
('
FlyBaseName_
');
# source for any Name
$self
->
source_name_fban
('
flybase_annotation_id
');
# source-name for FBan
# For Dbxref
$self
->
source_name_symbol
('
flybase_synonym
');
$self
->
source_name_fbgn
('
flybase_gene_id
');
# source-name for ID=FBgn
$self
->
source_name_name
('
flybase_name
');
$self
->
source_name_fbtr
('
flybase_transcript_id
');
# source-name for ID=FBtr
$self
->
source_name_name_prefix
('
FlyBaseName_
');
$self
->
source_name_fbpp
('
flybase_polypeptide_id
');
# source-name for ID=FBpp
$self
->
source_name_fban
('
flybase_annotation_id
');
# source-name for ID=FBan
# gadfly-CG-ids
$self
->
source_name_gadfly_gene
('
gadfly_gene_cgid
');
# For Dbxref=FlyBase_Annotation_IDs
$self
->
source_name_gadfly_gene
('
gadfly_gene_cgid
');
# cg-id from genome annotation drosphila CG0123
$self
->
source_name_gadfly_transcript
('
gadfly_transcript_cgid
');
# For Dbxref=FlyBase_Annotation_IDs
$self
->
source_name_gadfly_transcript
('
gadfly_transcript_cgid
');
# cg-id from genome annotation drosphila CG0123-RA
$self
->
source_name_gadfly_translation
('
gadfly_translation_cgid
');
# For Dbxref=FlyBase_Annotation_IDs
$self
->
source_name_gadfly_translation
('
gadfly_translation_cgid
');
# cg-id from genome annotation drosphila CG0123-PA
$self
->
source_name_affymetrix
('
AFFY_DrosGenome1
');
# For Dbxref=Affymetrix
$self
->
source_name_dgrc1
('
DGRC-1
');
# For Dbxref=DGRC-1
$self
->
source_name_dgrc2
('
DGRC-2
');
# For Dbxref=DGRC-2
$self
->
source_name_drsc
('
DRSC
');
# For Dbxref=DRSC
$self
->
source_name_epd
('
EPD
');
# For Dbxref=EPD
$self
->
source_name_flyreg
('
FlyReg
');
# For Dbxref=FlyReg
$self
->
source_name_gb
('
EMBL
');
# For Dbxref=GB
$self
->
source_name_gbprotein
('
protein_id
');
# For Dbxref=GB_protein
$self
->
source_name_gcr
('
GPCR
');
# For Dbxref=GCR
$self
->
source_name_gi
('
GI
');
# For Dbxref=GI
$self
->
source_name_go
('
GO
');
# For Dbxref=GO
$self
->
source_name_genomeRNAi
('
GenomeRNAi
');
# For Dbxref=GenomeRNAi
$self
->
source_name_interpro
('
Interpro
');
# For Dbxref=INTERPRO
$self
->
source_name_merops
('
MEROPS
');
# For Dbxref=MEROPS
$self
->
source_name_miRBase
('
miRBase
');
# For Dbxref=miRBase
$self
->
source_name_mitodrome
('
MitoDrome
');
# For Dbxref=MitoDrome
$self
->
source_name_nrl3d
('
PDB
');
# For Dbxref=NRL_3D
$self
->
source_name_pdb
('
PDB
');
# For Dbxref=PDB
$self
->
source_name_rfam
('
RFAM
');
# For Dbxref=Rfam
$self
->
source_name_tf
('
TransFac
');
# For Dbxref=TF
$self
->
source_name_uniprotsp
('
Uniprot/SWISSPROT
');
# For Dbxref=UniProt/Swiss-Prot
$self
->
source_name_uniprottr
('
Uniprot/SPTREMBL
');
# For Dbxref=UniProt/TrEMBL
$self
->
source_name_bdgpinsituexpr
('
BDGP_insitu_expr
');
# For Dbxref=bdgpinsituexpr
$self
->
source_name_dedb
('
DEDb
');
# For Dbxref=dedb
$self
->
source_name_drosdel
('
DrosDel
');
# For Dbxref=drosdel
$self
->
source_name_flygrid
('
FlyGrid
');
# For Dbxref=flygrid
$self
->
source_name_hybrigenics
('
hybrigenics
');
# For Dbxref=hybrigenics
$self
->
source_name_if
('
InteractiveFly
');
# For Dbxref=if
$self
->
source_name_prefix_ensAGgene
('
Ens_Ag_gene
');
# For Dbxref=ensAG
$self
->
source_name_prefix_ensAMgene
('
Ens_Am_gene
');
# For Dbxref=ensAM
$self
->
source_name_prefix_ensCEgene
('
Ens_Ce_gene
');
# For Dbxref=ensCE
$self
->
source_name_prefix_ensCFgene
('
Ens_Cf_gene
');
# For Dbxref=ensCF
$self
->
source_name_prefix_ensDMgene
('
Ens_Dm_gene
');
# For Dbxref=ensDM
$self
->
source_name_prefix_ensDRgene
('
Ens_Dr_gene
');
# For Dbxref=ensDR
$self
->
source_name_prefix_ensFRgene
('
Ens_Fr_gene
');
# For Dbxref=ensFR
$self
->
source_name_prefix_ensGGgene
('
Ens_Gg_gene
');
# For Dbxref=ensGG
$self
->
source_name_prefix_ensHSgene
('
Ens_Hs_gene
');
# For Dbxref=ensHS
$self
->
source_name_prefix_ensMMgene
('
Ens_Mm_gene
');
# For Dbxref=ensMM
$self
->
source_name_prefix_ensPTgene
('
Ens_Pt_gene
');
# For Dbxref=ensPT
$self
->
source_name_prefix_ensRNgene
('
Ens_Rn_gene
');
# For Dbxref=ensRN
$self
->
source_name_prefix_ensTNgene
('
Ens_Tn_gene
');
# For Dbxref=ensTN
$self
->
source_name_prefix_modCBgene
('
modCB_gene
');
# For Dbxref=modCB
$self
->
source_name_prefix_modCEgene
('
modCE_gene
');
# For Dbxref=modCE
$self
->
source_name_prefix_modDDgene
('
modDD_gene
');
# For Dbxref=modDD
my
@gene_types
=
qw (gene)
;
my
@gene_types
=
qw (gene)
;
my
@translation_types
=
qw (CDS);
my
@translation_types
=
qw (CDS
protein
);
my
@transcript_types
=
qw (mRNA
ncRNA
snRNA
tRNA
rRNA
pseudogene
);
# The transcript_types may change from release to release so check that this list is up-to-date
my
@transcript_types
=
qw (mRNA
ncRNA
snRNA
tRNA
rRNA
pseudogene
snoRNA
miRNA
);
$self
->
gene_types
(
\
@gene_types
)
;
$self
->
gene_types
(
\
@gene_types
)
;
$self
->
translation_types
(
\
@translation_types
)
;
$self
->
translation_types
(
\
@translation_types
)
;
...
@@ -123,7 +167,7 @@ sub run {
...
@@ -123,7 +167,7 @@ sub run {
my
$flybase_source_id
=
$self
->
get_source
(
$external_source_db_name
);
my
$flybase_source_id
=
$self
->
get_source
(
$external_source_db_name
);
if
(
!
$self
->
create_xrefs
(
$flybase_source_id
,
$file
)){
if
(
!
$self
->
create_xrefs
(
$flybase_source_id
,
$file
)){
return
1
;
return
1
;
}
}
my
@xrefs
=
@
{
$self
->
xrefs
};
my
@xrefs
=
@
{
$self
->
xrefs
};
...
@@ -172,37 +216,41 @@ sub create_xrefs {
...
@@ -172,37 +216,41 @@ sub create_xrefs {
while
(
$_
=
$gff_io
->
getline
()
)
{
while
(
$_
=
$gff_io
->
getline
()
)
{
chomp
;
chomp
;
my
@col
=
split
/\
s+
/
;
my
@col
=
split
/\
t
/
;
if
(
$col
[
3
]){
if
(
$col
[
3
]){
# test if line contains information for object wanted (CDS,mRNA,gene,..)
# test if line contains information for object wanted (CDS,mRNA,gene,..)
if
(
$self
->
line_contains_object_to_process
(
$col
[
2
]
)
){
if
(
$self
->
line_contains_object_to_process
(
$col
[
2
]
)
){
# work out if we have a gene, transcript or translation
my
$type
=
$self
->
set_ensembl_object_type
(
$col
[
2
]);
my
$type
=
$self
->
set_ensembl_object_type
(
$col
[
2
]);
# the 9th column contains all the attributes
my
@desc
=
split
/\;/
,
$col
[
8
];
my
@desc
=
split
/\;/
,
$col
[
8
];
my
$cgid
=
shift
@desc
;
# the ID= is always the first element of this array
if
(
!
$cgid
=~
m/ID=/
){
my
$unique_id
=
shift
@desc
;
print
"
parse-error: There seems to be no Identifier:
$cgid
. Suspicous!
";
if
(
!
$unique_id
=~
m/ID=/
){
return
0
;
throw
("
parse-error: There seems to be no Identifier:
$unique_id
. Suspicous!
");
# print "parse-error: There seems to be no Identifier: $unique_id. Suspicous!";
# return 0;
}
}
# throw("parse-error: There seems to be no Identifier: $cgid. Suspicous!") unless ($cgid=~m/ID=/);
# for a gene, this will be FBgn, for a transcript this will be FBtr, etc
$
cg
id
=~
s/ID=//g
;
$
unique_
id
=~
s/ID=//g
;
# set up xref-entry for EVERY single item
# set up xref-entry for EVERY single item
foreach
my
$item
(
@desc
)
{
foreach
my
$item
(
@desc
)
{
# make all xrefs for type "Name=" in desc-field
# make all xrefs for type "Name=" in desc-field
$self
->
make_name_xref
(
$item
,
$cgid
,
$type
);
# these are FlyBaseName_gene for genes, FlyBaseName_transcript for transcripts, etc
$self
->
make_name_xref
(
$item
,
$unique_id
,
$type
);
$self
->
set_flybase_synonyms
(
$item
,
$
cg
id
);
$self
->
set_flybase_synonyms
(
$item
,
$
unique_
id
);
# make all xrefs for type "Name=" in desc-field
# make all xrefs for type "Name=" in desc-field
$self
->
make_dbxref_xref
(
$item
,
$
cg
id
,
$type
);
$self
->
make_dbxref_xref
(
$item
,
$
unique_
id
,
$type
);
}
}
}
}
}
}
# we don't want to read the line otherwise
}
}
# while ( $_ = $gff_io->getline() ) {
$gff_io
->
close
();
$gff_io
->
close
();
...
@@ -231,17 +279,18 @@ sub set_ensembl_object_type{
...
@@ -231,17 +279,18 @@ sub set_ensembl_object_type{
sub
make_dbxref_xref
{
sub
make_dbxref_xref
{
my
(
$self
,
$item
,
$cgid
,
$type
)
=
@_
;
my
(
$self
,
$item
,
$unique_id
,
$type
)
=
@_
;
# item = attribute
# unique_id = ID
# type = gene, transcript, translation
my
(
$xref
);
my
(
$xref
);
my
$tg1
=
$self
->
gff_dbxref
;
my
$tg1
=
$self
->
gff_dbxref
;
my
$tg2
=
$self
->
gff_2nd_dbxref
;
if
(
$item
=~
/$tg1/
||
$item
=~
/$tg2/
){
if
(
$item
=~
/$tg1/
){
# Dbxref=
# split the xrefs up into a list
my
$dbx1
=
get_fields
(
$item
,
$tg1
);
my
$dbx1
=
get_fields
(
$item
,
$tg1
);
my
$dbx2
=
get_fields
(
$item
,
$tg2
);
my
@dbx
;
my
@dbx
;
push
@dbx
,
@
{
$dbx1
}
if
$dbx1
;
push
@dbx
,
@
{
$dbx1
}
if
$dbx1
;
push
@dbx
,
@
{
$dbx2
}
if
$dbx2
;
foreach
my
$dbx
(
@dbx
)
{
foreach
my
$dbx
(
@dbx
)
{
my
$src_id
=
undef
;
my
$src_id
=
undef
;
...
@@ -259,8 +308,8 @@ sub make_dbxref_xref{
...
@@ -259,8 +308,8 @@ sub make_dbxref_xref{
}
elsif
(
$dbx
=~
m/FBan/
){
}
elsif
(
$dbx
=~
m/FBan/
){
$src_id
=
$self
->
get_source
(
$self
->
source_name_fban
);
$src_id
=
$self
->
get_source
(
$self
->
source_name_fban
);
}
}
}
elsif
(
$dbx
=~
m/
Gadfly
:/
){
}
elsif
(
$dbx
=~
m/
FlyBase_Annotation_IDs
:/
){
$dbx
=~
s/
Gadfly
://g
;
$dbx
=~
s/
FlyBase_Annotation_IDs
://g
;
if
(
$type
eq
"
gene
"){
if
(
$type
eq
"
gene
"){
$src_id
=
$self
->
get_source
(
$self
->
source_name_gadfly_gene
)
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_gadfly_gene
)
;
}
}
...
@@ -270,6 +319,140 @@ sub make_dbxref_xref{
...
@@ -270,6 +319,140 @@ sub make_dbxref_xref{
elsif
(
$type
eq
"
transcript
"){
elsif
(
$type
eq
"
transcript
"){
$src_id
=
$self
->
get_source
(
$self
->
source_name_gadfly_transcript
);
$src_id
=
$self
->
get_source
(
$self
->
source_name_gadfly_transcript
);
}
}
}
elsif
(
$dbx
=~
m/Affymetrix:/
)
{
$dbx
=~
s/Affymetrix://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_affymetrix
)
;
}
elsif
(
$dbx
=~
m/DGRC-1:/
)
{
$dbx
=~
s/DGRC-1://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_dgrc1
)
;
}
elsif
(
$dbx
=~
m/DGRC-2:/
)
{
$dbx
=~
s/DGRC-2://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_dgrc2
);
}
elsif
(
$dbx
=~
m/DRSC:/
)
{
$dbx
=~
s/DRSC://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_drsc
);
}
elsif
(
$dbx
=~
m/EPD:/
)
{
$dbx
=~
s/EPD://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_epd
);
}
elsif
(
$dbx
=~
m/FlyReg:/
)
{
$dbx
=~
s/FlyReg://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_flyreg
);
}
elsif
(
$dbx
=~
m/GB:/
)
{
$dbx
=~
s/GB://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_gb
);
}
elsif
(
$dbx
=~
m/GB_protein:/
)
{
$dbx
=~
s/GB_protein://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_gbprotein
);
}
elsif
(
$dbx
=~
m/GCR:/
)
{
$dbx
=~
s/GCR://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_gcr
);
}
elsif
(
$dbx
=~
m/GI:/
)
{
$dbx
=~
s/GI://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_gi
);
}
elsif
(
$dbx
=~
m/GO:/
)
{
$dbx
=~
s/GO://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_go
);
}
elsif
(
$dbx
=~
m/GenomeRNAi:/
)
{
$dbx
=~
s/GenomeRNAi://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_genomeRNAi
);
}
elsif
(
$dbx
=~
m/INTERPRO:/
)
{
$dbx
=~
s/INTERPRO://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_interpro
);
}
elsif
(
$dbx
=~
m/MEROPS:/
)
{
$dbx
=~
s/MEROPS://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_merops
);
}
elsif
(
$dbx
=~
m/MIR:/
)
{
$dbx
=~
s/MIR://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_miRBase
);
}
elsif
(
$dbx
=~
m/MITODROME:/
)
{
$dbx
=~
s/MITODROME://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_mitodrome
);
}
elsif
(
$dbx
=~
m/NRL_3D:/
)
{
$dbx
=~
s/NRL_3D://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_nrl3d
);
}
elsif
(
$dbx
=~
m/PDB:/
)
{
$dbx
=~
s/PDB://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_pdb
);
}
elsif
(
$dbx
=~
m/Rfam:/
)
{
$dbx
=~
s/Rfam://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_rfam
);
}
elsif
(
$dbx
=~
m/TF:/
)
{
$dbx
=~
s/TF://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_tf
);
}
elsif
(
$dbx
=~
m/UniProt\/Swiss-Prot:/
)
{
$dbx
=~
s/UniProt\/Swiss-Prot://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_uniprotsp
);
}
elsif
(
$dbx
=~
m/UniProt\/TrEMBL:/
)
{
$dbx
=~
s/UniProt\/TrEMBL://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_uniprottr
);
}
elsif
(
$dbx
=~
m/bdgpinsituexpr:/
)
{
$dbx
=~
s/bdgpinsituexpr://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_bdgpinsituexpr
);
}
elsif
(
$dbx
=~
m/dedb:/
)
{
$dbx
=~
s/dedb://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_dedb
);
}
elsif
(
$dbx
=~
m/drosdel:/
)
{
$dbx
=~
s/drosdel://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_drosdel
);
}
elsif
(
$dbx
=~
m/flygrid:/
)
{
$dbx
=~
s/flygrid://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_flygrid
);
}
elsif
(
$dbx
=~
m/hybrigenics:/
)
{
$dbx
=~
s/hybrigenics://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_hybrigenics
);
}
elsif
(
$dbx
=~
m/if:/
)
{
$dbx
=~
s/if://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_if
);
}
elsif
(
$dbx
=~
m/orthologs:ensAG:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensAGgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensAM:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensAMgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensCE:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensCEgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensCF:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensCFgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensDM:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensDMgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensDR:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensDRgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensFR:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensFRgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensGG:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensGGgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensHS:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensHSgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensMM:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensMMgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensPT:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensPTgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensRN:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensRNgene
);
}
elsif
(
$dbx
=~
m/orthologs:ensTN:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_ensTNgene
);
}
elsif
(
$dbx
=~
m/orthologs:modCB:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_modCBgene
);
}
elsif
(
$dbx
=~
m/orthologs:modCE:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_modCEgene
);
}
elsif
(
$dbx
=~
m/orthologs:modDD:/
)
{
$dbx
=~
s/orthologs://g
;
$src_id
=
$self
->
get_source
(
$self
->
source_name_prefix_modDDgene
);
}
else
{
warning
("
Dbxref type not recognised :
$dbx
");
}
}
if
(
$src_id
){
# only add xref entry for FBgn FBtr...
if
(
$src_id
){
# only add xref entry for FBgn FBtr...
...
@@ -278,13 +461,13 @@ sub make_dbxref_xref{
...
@@ -278,13 +461,13 @@ sub make_dbxref_xref{
$xref
->
{
LABEL
}
=
$dbx
;
$xref
->
{
LABEL
}
=
$dbx
;
$xref
->
{
SOURCE_ID
}
=
$src_id
;
$xref
->
{
SOURCE_ID
}
=
$src_id
;
$xref
->
{
SPECIES_ID
}
=
$self
->
species_id
();
$xref
->
{
SPECIES_ID
}
=
$self
->
species_id
();
$xref
->
{
SYNONYMS
}
=
$self
->
get_synonyms
(
$
cg
id
);
$xref
->
{
SYNONYMS
}
=
$self
->
get_synonyms
(
$
unique_
id
);
$self
->
add_xref
(
$xref
);
$self
->
add_xref
(
$xref
);
if
(
$type
){
if
(
$type
){
my
$direct_xref
;
my
$direct_xref
;
$direct_xref
=
$xref
;
$direct_xref
=
$xref
;
$direct_xref
->
{
ENSEMBL_STABLE_ID
}
=
$
cg
id
;
$direct_xref
->
{
ENSEMBL_STABLE_ID
}
=
$
unique_
id
;
$direct_xref
->
{
ENSEMBL_TYPE
}
=
$type
;
$direct_xref
->
{
ENSEMBL_TYPE
}
=
$type
;
#$direct_xref->{LINKAGE_XREF}=undef;
#$direct_xref->{LINKAGE_XREF}=undef;
$self
->
add_direct_xref
(
$direct_xref
)
if
$type
;
$self
->
add_direct_xref
(
$direct_xref
)
if
$type
;
...
@@ -296,30 +479,26 @@ sub make_dbxref_xref{
...
@@ -296,30 +479,26 @@ sub make_dbxref_xref{
}
}
sub
set_flybase_synonyms
{
sub
set_flybase_synonyms
{
my
(
$self
,
$item
,
$cgid
)
=
@_
;
my
(
$self
,
$item
,
$unique_id
)
=
@_
;
my
@syns
;
my
$syn1
=
$self
->
gff_synonym
;
my
$syn1
=
$self
->
gff_synonym
;
my
$syn2
=
$self
->
gff_2nd_synonym
;
if
(
$item
=~
/$syn1/
||
$item
=~
/$syn2/
){
if
(
$item
=~
/$syn1/
){
my
$s1
=
get_fields
(
$item
,
$syn1
);
my
$s1
=
get_fields
(
$item
,
$syn1
);
my
$s2
=
get_fields
(
$item
,
$syn2
);
my
@syns
;
my
@syns
;
push
@syns
,
@
{
$s1
}
if
$s1
;
push
@syns
,
@
{
$s1
}
if
$s1
;
push
@syns
,
@
{
$s2
}
if
$s2
;
$self
->
add_synonym
(
$unique_id
,
\
@syns
);
$self
->
add_synonym
(
$cgid
,
\
@syns
);
return
\
@syns
;
return
\
@syns
;
}
}
return
undef
;
return
undef
;
}
}
sub
make_name_xref
{
sub
make_name_xref
{
my
(
$self
,
$item
,
$
cg
id
,
$type
)
=
@_
;
my
(
$self
,
$item
,
$
unique_
id
,
$type
)
=
@_
;
my
$xref
=
undef
;
my
$xref
=
undef
;
my
$target
=
$self
->
gff_name
;
my
$target
=
$self
->
gff_name
;
if
(
$item
=~
m/$target/
){
##Name=
if
(
$item
=~
m/$target/
){
##Name=
#print "having $$gff_gene_name[0]\n" ;
#print "having $$gff_gene_name[0]\n" ;
# remove the Name= bit and split the names on a ','
my
$gff_gene_name
=
get_fields
(
$item
,
$target
)
;
my
$gff_gene_name
=
get_fields
(
$item
,
$target
)
;
throw
("
there is more than one id for item
$item
\n
")
if
$$gff_gene_name
[
1
];
throw
("
there is more than one id for item
$item
\n
")
if
$$gff_gene_name
[
1
];
$xref
->
{
ACCESSION
}
=
$$gff_gene_name
[
0
];
$xref
->
{
ACCESSION
}
=
$$gff_gene_name
[
0
];
...
@@ -336,7 +515,7 @@ sub make_name_xref{
...
@@ -336,7 +515,7 @@ sub make_name_xref{
if
(
defined
(
$xref
)
and
$type
){
if
(
defined
(
$xref
)
and
$type
){
my
$direct_xref
;
my
$direct_xref
;
$direct_xref
=
$xref
;
$direct_xref
=
$xref
;
$direct_xref
->
{
ENSEMBL_STABLE_ID
}
=
$
cg
id
;
$direct_xref
->
{
ENSEMBL_STABLE_ID
}
=
$
unique_
id
;
$direct_xref
->
{
ENSEMBL_TYPE
}
=
$type
;
$direct_xref
->
{
ENSEMBL_TYPE
}
=
$type
;
$direct_xref
->
{
LINKAGE_TYPE
}
=
'
bla
';
$direct_xref
->
{
LINKAGE_TYPE
}
=
'
bla
';
$self
->
add_direct_xref
(
$direct_xref
);
$self
->
add_direct_xref
(
$direct_xref
);
...
@@ -381,11 +560,11 @@ sub source_name_name_prefix{
...
@@ -381,11 +560,11 @@ sub source_name_name_prefix{
}
}
sub
source_name_sy
mbol
{
sub
source_name_sy
nonym
{
my
$self
=
shift
;
my
$self
=
shift
;
$self
->
{
_source_name_sy
mbol
}
=
shift
if
@
_
;
$self
->
{
_source_name_sy
nonym
}
=
shift
if
@
_
;
return
$self
->
{
_source_name_sy
mbol
};
return
$self
->
{
_source_name_sy
nonym
};
}
}
...
@@ -439,10 +618,267 @@ sub source_name_fban{
...
@@ -439,10 +618,267 @@ sub source_name_fban{
return
$self
->
{
_sn_fban
};
return
$self
->
{
_sn_fban
};
}
}
sub
source_name_affymetrix
{
my
$self
=
shift
;
$self
->
{
_sn_affymetrix
}
=
shift
if
@
_
;
return
$self
->
{
_sn_affymetrix
};
}
sub
source_name_dgrc1
{
my
$self
=
shift
;
$self
->
{
_sn_dgrc1
}
=
shift
if
@
_
;
return
$self
->
{
_sn_dgrc1
};
}
sub
source_name_dgrc2
{
my
$self
=
shift
;
$self
->
{
_sn_dgrc2
}
=
shift
if
@
_
;
return
$self
->
{
_sn_dgrc2
};
}
sub
source_name_drsc
{
my
$self
=
shift
;
$self
->
{
_sn_drsc
}
=
shift
if
@
_
;
return
$self
->
{
_sn_drsc
};
}
sub
source_name_epd
{
my
$self
=
shift
;
$self
->
{
_sn_epd
}
=
shift
if
@
_
;
return
$self
->
{
_sn_epd
};
}
sub
source_name_flyreg
{
my
$self
=
shift
;
$self
->
{
_sn_flyreg
}
=
shift
if
@
_
;
return
$self
->
{
_sn_flyreg
};
}
sub
source_name_gb
{
my
$self
=
shift
;
$self
->
{
_sn_gb
}
=
shift
if
@
_
;
return
$self
->
{
_sn_gb
};
}
sub
source_name_gbprotein
{
my
$self
=
shift
;
$self
->
{
_sn_gbprotein
}
=
shift
if
@
_
;
return
$self
->
{
_sn_gbprotein
};
}
sub
source_name_gcr
{
my
$self
=
shift
;
$self
->
{
_sn_gcr
}
=
shift
if
@
_
;
return
$self
->
{
_sn_gcr
};
}
sub
source_name_gi
{
my
$self
=
shift
;
$self
->
{
_sn_gi
}
=
shift
if
@
_
;
return
$self
->
{
_sn_gi
};
}
sub
source_name_go
{
my
$self
=
shift
;
$self
->
{
_sn_go
}
=
shift
if
@
_
;
return
$self
->
{
_sn_go
};
}
sub
source_name_genomeRNAi
{
my
$self
=
shift
;
$self
->
{
_sn_genomeRNAi
}
=
shift
if
@
_
;
return
$self
->
{
_sn_genomeRNAi
};
}
sub
source_name_interpro
{
my
$self
=
shift
;
$self
->
{
_sn_interpro
}
=
shift
if
@
_
;
return
$self
->
{
_sn_interpro
};
}
sub
source_name_merops
{
my
$self
=
shift
;
$self
->
{
_sn_merops
}
=
shift
if
@
_
;
return
$self
->
{
_sn_merops
};
}
sub
source_name_miRBase
{
my
$self
=
shift
;
$self
->
{
_sn_miRBase
}
=
shift
if
@
_
;
return
$self
->
{
_sn_miRBase
};
}
sub
source_name_mitodrome
{
my
$self
=
shift
;
$self
->
{
_sn_mitodrome
}
=
shift
if
@
_
;
return
$self
->
{
_sn_mitodrome
};
}
sub
source_name_nrl3d
{
my
$self
=
shift
;
$self
->
{
_sn_nrl3d
}
=
shift
if
@
_
;
return
$self
->
{
_sn_nrl3d
};
}
sub
source_name_pdb
{
my
$self
=
shift
;
$self
->
{
_sn_pdb
}
=
shift
if
@
_
;
return
$self
->
{
_sn_pdb
};
}
sub
source_name_rfam
{
my
$self
=
shift
;
$self
->
{
_sn_rfam
}
=
shift
if
@
_
;
return
$self
->
{
_sn_rfam
};
}
sub
source_name_tf
{
my
$self
=
shift
;
$self
->
{
_sn_tf
}
=
shift
if
@
_
;
return
$self
->
{
_sn_tf
};
}
sub
source_name_uniprotsp
{
my
$self
=
shift
;
$self
->
{
_sn_uniprotsp
}
=
shift
if
@
_
;
return
$self
->
{
_sn_uniprotsp
};
}
sub
source_name_uniprottr
{
my
$self
=
shift
;
$self
->
{
_sn_uniprottr
}
=
shift
if
@
_
;
return
$self
->
{
_sn_uniprottr
};
}
sub
source_name_bdgpinsituexpr
{
my
$self
=
shift
;
$self
->
{
_sn_bdgpinsituexpr
}
=
shift
if
@
_
;
return
$self
->
{
_sn_bdgpinsituexpr
};
}
sub
source_name_dedb
{
my
$self
=
shift
;
$self
->
{
_sn_dedb
}
=
shift
if
@
_
;
return
$self
->
{
_sn_dedb
};
}
sub
source_name_drosdel
{
my
$self
=
shift
;
$self
->
{
_sn_drosdel
}
=
shift
if
@
_
;
return
$self
->
{
_sn_drosdel
};
}
sub
source_name_flygrid
{
my
$self
=
shift
;
$self
->
{
_sn_flygrid
}
=
shift
if
@
_
;
return
$self
->
{
_sn_flygrid
};
}
sub
source_name_hybrigenics
{
my
$self
=
shift
;
$self
->
{
_sn_hybrigenics
}
=
shift
if
@
_
;
return
$self
->
{
_sn_hybrigenics
};
}
sub
source_name_if
{
my
$self
=
shift
;
$self
->
{
_sn_if
}
=
shift
if
@
_
;
return
$self
->
{
_sn_if
};
}
sub
source_name_prefix_ensAGgene
{
my
$self
=
shift
;
$self
->
{
_sn_prefix_ensAG
}
=
shift
if
@
_
;
return
$self
->
{
_sn_prefix_ensAG
};
}
sub
source_name_prefix_ensAMgene
{
my
$self
=
shift
;
$self
->
{
_sn_prefix_ensAM
}
=
shift
if
@
_
;
return
$self
->
{
_sn_prefix_ensAM
};
}
sub
source_name_prefix_ensCEgene
{
my
$self
=
shift
;
$self
->
{
_sn_prefix_ensCE
}
=
shift
if
@
_
;
return
$self
->
{
_sn_prefix_ensCE
};
}
sub
source_name_prefix_ensCFgene
{
my
$self
=
shift
;
$self
->
{
_sn_prefix_ensCF
}
=
shift
if
@
_
;
return
$self
->
{
_sn_prefix_ensCF
};
}
sub
source_name_prefix_ensDMgene
{
my
$self
=
shift
;
$self
->
{
_sn_prefix_ensDM
}
=
shift
if
@
_
;
return
$self
->
{
_sn_prefix_ensDM
};
}
sub
source_name_prefix_ensDRgene
{
my
$self
=
shift
;
$self
->
{
_sn_prefix_ensDR
}
=
shift
if
@
_
;
return
$self
->
{
_sn_prefix_ensDR
};
}
sub
source_name_prefix_ensFRgene
{
my
$self
=
shift
;
$self
->
{
_sn_prefix_ensFR
}
=
shift
if
@
_
;
return
$self
->
{
_sn_prefix_ensFR
};
}
sub
source_name_prefix_ensGGgene
{
my
$self
=
shift
;
$self
->
{
_sn_prefix_ensGG
}
=
shift
if
@
_
;
return
$self
->
{
_sn_prefix_ensGG
};
}
sub
source_name_prefix_ensHSgene
{
my
$self
=
shift
;
$self
->
{
_sn_prefix_ensHS
}
=
shift
if
@
_
;
return
$self
->
{
_sn_prefix_ensHS
};
}
sub
source_name_prefix_ensMMgene
{
my
$self
=
shift
;
$self
->
{
_sn_prefix_ensMM
}
=
shift
if
@
_
;
return
$self
->
{
_sn_prefix_ensMM
};
}
sub
source_name_prefix_ensPTgene
{
my
$self
=
shift
;
$self
->
{
_sn_prefix_ensPT
}
=
shift
if
@
_
;
return
$self
->
{
_sn_prefix_ensPT
};
}
sub
source_name_prefix_ensRNgene
{
my
$self
=
shift
;
$self
->
{
_sn_ensRN
}
=
shift
if
@
_
;
return
$self
->
{
_sn_ensRN
};
}
sub
source_name_prefix_ensTNgene
{
my
$self
=
shift
;
$self
->
{
_sn_ensTN
}
=
shift
if
@
_
;
return
$self
->
{
_sn_ensTN
};
}
sub
source_name_prefix_modCBgene
{
my
$self
=
shift
;
$self
->
{
_sn_modCB
}
=
shift
if
@
_
;
return
$self
->
{
_sn_modCB
};
}
sub
source_name_prefix_modCEgene
{
my
$self
=
shift
;
$self
->
{
_sn_modCE
}
=
shift
if
@
_
;
return
$self
->
{
_sn_modCE
};
}
sub
source_name_prefix_modDDgene
{
my
$self
=
shift
;
$self
->
{
_sn_modDD
}
=
shift
if
@
_
;
return
$self
->
{
_sn_modDD
};
}
sub
gff_name
{
sub
gff_name
{
my
$self
=
shift
;
my
$self
=
shift
;
...
@@ -592,17 +1028,17 @@ sub get_species {
...
@@ -592,17 +1028,17 @@ sub get_species {
}
}
sub
add_synonym
{
sub
add_synonym
{
my
(
$self
,
$
cg
id
,
$synref
)
=
@_
;
my
(
$self
,
$
unique_
id
,
$synref
)
=
@_
;
#print "adding synonym for -$
cg
id-:".join(" " , @$synref)."\n" ; ;
#print "adding synonym for -$
unique_
id-:".join(" " , @$synref)."\n" ; ;
$
{
$self
->
synonyms
}{
$
cg
id
}
=
$synref
if
(
$synref
);
$
{
$self
->
synonyms
}{
$
unique_
id
}
=
$synref
if
(
$synref
);
return
;
return
;
}
}
sub
get_synonyms
{
sub
get_synonyms
{
my
(
$self
,
$
cg
id
)
=
@_
;
my
(
$self
,
$
unique_
id
)
=
@_
;
return
$
{
$self
->
synonyms
}{
$
cg
id
};
return
$
{
$self
->
synonyms
}{
$
unique_
id
};
}
}
...
@@ -636,10 +1072,54 @@ sub translation_types{
...
@@ -636,10 +1072,54 @@ sub translation_types{
return
$self
->
{
_tl_types
};
return
$self
->
{
_tl_types
};
}
}
1
;
1
;
# Drosophila v5.3 : xrefs
# Gff_file external_db_id db_name
# ==
# Affymetrix 3120 AFFY_DrosGenome1
# DGRC-1 830 DGRC-1
# DGRC-2 831 DGRC-2
# DRSC 840 DRSC
# EPD 10100 EPD
# FlyBase 800 flybase_gene_id
# FlyBase_Annotation_IDs 804 flybase_annotation_id
# FlyReg 850 FlyReg
# GB 700 EMBL
# GB_protein 1700 protein_id
# GCR 10200 GPCR
# GI 10900 GI
# GO 1000 GO
# GenomeRNAi 860 GenomeRNAi
# INTERPRO 1200 Interpro
# MEROPS 10300 MEROPS
# MIR 10400 miRBase
# MITODROME 870 MitoDrome
# NRL_3D 1600 PDB
# PDB 1600 PDB
# Rfam 4200 RFAM
# TF 10500 TransFac
# UniProt/Swiss-Prot 2200 Uniprot/SWISSPROT
# UniProt/TrEMBL 2000 Uniprot/SPTREMBL
# bdgpinsituexpr 880 BDGP_insitu_expr
# dedb 890 DEDb
# drosdel 881 DrosDel
# flygrid 882 FlyGrid
# hybrigenics 883 hybrigenics
# if 884 InteractiveFly
# ensAG 6600 Ens_Ag_gene # Anopheles gambiae
# ensAM 6630 Ens_Am_gene # apis mellifera?
# ensCE 6660 Ens_Ce_gene # C Elegans
# ensCF 5700 Ens_Cf_gene # Canis familiaris
# ensDM 6690 Ens_Dm_gene #
# ensDR 5800 Ens_Dr_gene # Danio rerio
# ensFR 6720 Ens_Fr_gene # Takifugu rubripes
# ensGG 6400 Ens_Gg_gene # Gallus gallus
# ensHS 2700 Ens_Hs_gene # Homo sapiens
# ensMM 5000 Ens_Mm_gene # mus musculus
# ensPT 6750 Ens_Pt_gene # Pan troglodytes
# ensRN 6200 Ens_Rn_gene # Rattus norvegicus
# ensTN 6810 Ens_Tn_gene # Tetraodon nigroviridis
# modCB 10600 modCB # InParanoid Model organism database, Caenorhabditis briggsae
# modCE 10700 modCE # Caenorhabditis elegans
# modDD 10800 modDD # Dictyostelium discoideum
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment