Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Open sidebar
ensembl-gh-mirror
ensembl
Commits
fe90ff9e
Commit
fe90ff9e
authored
Jan 29, 2008
by
Ian Longden
Browse files
HUGO -> HGNC
parent
ee0c8cb9
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
418 additions
and
0 deletions
+418
-0
misc-scripts/xref_mapping/XrefParser/HGNCParser.pm
misc-scripts/xref_mapping/XrefParser/HGNCParser.pm
+210
-0
misc-scripts/xref_mapping/XrefParser/HGNC_CCDSParser.pm
misc-scripts/xref_mapping/XrefParser/HGNC_CCDSParser.pm
+105
-0
misc-scripts/xref_mapping/XrefParser/HGNC_ENSGParser.pm
misc-scripts/xref_mapping/XrefParser/HGNC_ENSGParser.pm
+103
-0
No files found.
misc-scripts/xref_mapping/XrefParser/HGNCParser.pm
0 → 100644
View file @
fe90ff9e
package
XrefParser::
HGNCParser
;
use
strict
;
use
File::
Basename
;
use
base
qw( XrefParser::BaseParser )
;
my
$xref_sth
;
my
$dep_sth
;
my
$syn_sth
;
# --------------------------------------------------------------------------------
# Parse command line and run if being run directly
#
# The command line gives the file first, optionally followed by a source id
# and a species id, whereas run() takes its arguments in the order
# (source_id, species_id, file).  The arguments are therefore reordered here
# instead of passing @ARGV straight through.  Ids that are not supplied are
# passed as undef.

if ( !defined( caller() ) ) {

  if ( scalar(@ARGV) < 1 or scalar(@ARGV) > 3 ) {
    print "\nUsage: HGNCParser.pm file <source_id> <species_id>\n\n";
    exit(1);
  }

  my ( $file, $source_id, $species_id ) = @ARGV;

  run( $source_id, $species_id, $file );
}
# Parse an HGNC download file and store its records as xrefs.
#
# The first line of the file is treated as a header and skipped.  For every
# other line, each of the columns 6, 8 (RefSeq) and 5, 7 (EntrezGene) whose
# value is found in the corresponding lookup table is passed to add_to_xrefs
# together with the HGNC accession, symbol and name; previous symbols and
# aliases are passed to add_to_syn.  A record that matches none of the four
# columns is stored with add_xref under $source_id so that its label and
# description are kept, and is counted as a mismatch.
#
# Arguments (after the invocant when called as a method):
#   $source_id  - source id for unmatched records; looked up from $file
#                 when undefined
#   $species_id - species id; looked up from $file when undefined
#   $file       - name of the HGNC file to read
#
# Returns 0 on success and 1 if the file could not be opened.  Dies if one
# of the four HGNC priority sources cannot be found.

sub run {

  # "my $x = shift if COND" has undefined behaviour (see perlsyn), so the
  # declaration and the conditional assignment are kept apart.
  my $self;
  $self = shift if ( defined( caller(1) ) );

  # Without an invocant the method calls below would be made on undef.
  # NOTE(review): falling back to the class name assumes get_filehandle and
  # add_xref can be called as class methods - confirm against BaseParser.
  $self = __PACKAGE__ if ( !defined($self) );

  my $source_id  = shift;
  my $species_id = shift;
  my $file       = shift;

  print STDERR "source = $source_id\tspecies = $species_id\n";

  if ( !defined($source_id) ) {
    $source_id = XrefParser::BaseParser->get_source_id_for_filename($file);
  }
  if ( !defined($species_id) ) {
    $species_id = XrefParser::BaseParser->get_species_id_for_filename($file);
  }

  # One HGNC source per kind of evidence, selected by priority description.
  my %hgnc_source_for;
  foreach my $priority (qw(refseq_manual refseq_mapped entrezgene_manual entrezgene_mapped)) {
    $hgnc_source_for{$priority} =
      XrefParser::BaseParser->get_source_id_for_source_name( "HGNC", $priority );
    if ( !defined( $hgnc_source_for{$priority} ) ) {
      die "Could not get source id for HGNC with priority description of $priority\n";
    }
  }

  # Lookup tables keyed on the RefSeq / EntrezGene value found in the file.
  my (%refseq) = %{ XrefParser::BaseParser->get_valid_codes( "refseq", $species_id ) };

  my @list = ( "refseq_peptide", "refseq_dna" );
  my (%entrezgene) =
    %{ XrefParser::BaseParser->get_valid_xrefs_for_dependencies( "EntrezGene", @list ) };

  my $refseq_count     = 0;
  my $entrezgene_count = 0;
  my $mismatch         = 0;

  my $hugo_io = $self->get_filehandle($file);

  if ( !defined $hugo_io ) {
    print "ERROR: Can't open HGNC file $file\n";
    return 1;
  }

  # Store one HGNC record against the id found in a lookup table, followed
  # by its synonyms.  $fields is a reference to the split input line.
  my $add_linked_xref = sub {
    my ( $linked_id, $hgnc_source_id, $fields ) = @_;

    XrefParser::BaseParser->add_to_xrefs( $linked_id, $fields->[0], '', $fields->[1],
                                          $fields->[2], "", $hgnc_source_id, $species_id );

    # Column 3 holds previous ("dead") symbols and column 4 aliases; both
    # are comma separated lists.
    foreach my $column ( 3, 4 ) {
      next if ( !defined( $fields->[$column] ) );
      foreach my $synonym ( split( /,\s*/, $fields->[$column] ) ) {
        XrefParser::BaseParser->add_to_syn( $fields->[0], $hgnc_source_id, $synonym );
      }
    }
    return;
  };

  # Skip the header line.
  $hugo_io->getline();

  while ( defined( my $line = $hugo_io->getline() ) ) {

    chomp($line);

    # 0 HGNC ID            # primary accession
    # 1 Approved Symbol    # label
    # 2 Approved Name      # description
    # 3 Previous Symbols   # synonyms
    # 4 Aliases            # aliases
    # 5 entrezgene ID   manually curated
    # 6 RefSeq ID       manually curated
    # 7 entrezgene ID   mapped
    # 8 RefSeq ID       mapped

    my @array = split( /\t/, $line );

    my $seen = 0;

    # RefSeq, manually curated
    if ( $array[6] and defined( $refseq{ $array[6] } ) ) {
      $add_linked_xref->( $refseq{ $array[6] }, $hgnc_source_for{refseq_manual}, \@array );
      $refseq_count++;
      $seen = 1;
    }

    # RefSeq, mapped
    if ( $array[8] and defined( $refseq{ $array[8] } ) ) {
      $add_linked_xref->( $refseq{ $array[8] }, $hgnc_source_for{refseq_mapped}, \@array );
      $refseq_count++;
      $seen = 1;
    }

    # EntrezGene, manually curated
    if ( defined( $array[5] ) and defined( $entrezgene{ $array[5] } ) ) {
      $add_linked_xref->( $entrezgene{ $array[5] }, $hgnc_source_for{entrezgene_manual}, \@array );
      $entrezgene_count++;
      $seen = 1;
    }

    # EntrezGene, mapped
    if ( defined( $array[7] ) and defined( $entrezgene{ $array[7] } ) ) {
      $add_linked_xref->( $entrezgene{ $array[7] }, $hgnc_source_for{entrezgene_mapped}, \@array );
      $entrezgene_count++;
      $seen = 1;
    }

    if ( !$seen ) {
      # Store to keep descriptions etc
      $self->add_xref( $array[0], "", $array[1], $array[2], $source_id, $species_id );
      $mismatch++;
    }

  } # while HGNC

  $hugo_io->close();

  print "Loaded a total of "
    . ( $refseq_count + $entrezgene_count )
    . " HGNC xrefs, $refseq_count from RefSeq curated mappings and $entrezgene_count from EntrezGene mappings\n";

  print "$mismatch xrefs could not be associated via RefSeq or EntrezGene\n";

  return 0; # successful
}
# Returns the fixed file name "hugo.txt".  Takes no arguments.
sub rename_url_file {
  my $local_name = "hugo.txt";
  return $local_name;
}
1
;
misc-scripts/xref_mapping/XrefParser/HGNC_CCDSParser.pm
0 → 100644
View file @
fe90ff9e
package
XrefParser::
HGNC_CCDSParser
;
use
strict
;
use
DBI
;
use
base
qw( XrefParser::BaseParser )
;
# Parse a file of CCDS / HGNC identifier pairs and assign direct xrefs.
#
# Each line is split on whitespace into a CCDS identifier followed by an HGNC
# identifier.  An HGNC identifier is stored once (with add_xref and
# add_direct_xref) when a label for it is already in the xref table and the
# key "CCDS" . <ccds identifier> has a direct xref; the stable id and type
# are taken from that CCDS direct xref.
#
# Arguments: $source_id, $species_id, $file.
# Returns 0 on success, 1 if $file could not be opened.

sub run {

  my ( $self, $source_id, $species_id, $file ) = @_;

  my $hugo_io = $self->get_filehandle($file);

  unless ( defined $hugo_io ) {
    print "Could not open $file\n";
    return 1;
  }

  # Because the direct mappings have no descriptions etc. we have to take
  # these from the xrefs stored by the previous HGNC parser, which is why
  # this parser has to run after that one.
  # NOTE(review): the source ids 1091, 1092 and 1094 are hard coded; the
  # original author already wondered whether a lookup on name = HGNC and
  # priority would be more appropriate.

  my ( %label, %version, %description );

  my $dbi = $self->dbi();

  my $sql = "select accession, label, version, description from xref where source_id in (1091, 1092, 1094)";
  my $sth = $dbi->prepare($sql);
  $sth->execute();
  while ( my ( $acc, $lab, $ver, $desc ) = $sth->fetchrow_array() ) {
    $label{$acc}       = $lab;
    $version{$acc}     = $ver;
    $description{$acc} = $desc;
  }
  $sth->finish;

  # Stable id and object type of every CCDS direct xref, keyed on the CCDS
  # accession.
  $sql = '
select x.accession, d.ensembl_stable_id, d.type
from xref x, direct_xref d, source s
where s.source_id = x.source_id and
x.xref_id = d.general_xref_id and s.name like "CCDS"
';
  $sth = $dbi->prepare($sql);
  $sth->execute();

  my ( %ensembl_stable_id, %ensembl_type );
  while ( my ( $access, $stable_id, $type ) = $sth->fetchrow_array() ) {
    $ensembl_stable_id{$access} = $stable_id;
    $ensembl_type{$access}      = $type;
  }
  $sth->finish;

  my $line_count      = 0;
  my $xref_count      = 0;
  my $ignore_count    = 0;
  my $ignore_examples = "";
  my %seen;

  while ( my $record = $hugo_io->getline() ) {

    my ( $ccds, $hgnc ) = split( ' ', $record );
    $line_count++;

    # No label means the HGNC parser did not store this identifier; keep the
    # first few as examples for the summary.
    unless ( defined( $label{$hgnc} ) ) {
      $ignore_count++;
      $ignore_examples .= " " . $hgnc if ( $ignore_count < 10 );
      next;
    }

    # Only the first line seen for an HGNC identifier is considered.
    next if ( defined( $seen{$hgnc} ) );
    $seen{$hgnc} = 1;

    my $key = "CCDS" . $ccds;
    next unless ( defined( $ensembl_stable_id{$key} ) );

    my $xref_id = $self->add_xref( $hgnc,
                                   $version{$hgnc},
                                   $label{$hgnc} || $hgnc,
                                   $description{$hgnc},
                                   $source_id,
                                   $species_id );
    $self->add_direct_xref( $xref_id, $ensembl_stable_id{$key}, $ensembl_type{$key}, "" );
    $xref_count++;
  }

  print "Parsed $line_count HGNC identifiers from $file, added $xref_count xrefs and $xref_count direct_xrefs from $line_count lines.\n";

  if ($ignore_count) {
    print $ignore_count . " ignoreed due to numbers no identifiers being no longer valid :- $ignore_examples\n";
  }

  $hugo_io->close();

  return 0;
}
1
;
misc-scripts/xref_mapping/XrefParser/HGNC_ENSGParser.pm
0 → 100644
View file @
fe90ff9e
package
XrefParser::
HGNC_ENSGParser
;
use
strict
;
use
DBI
;
use
base
qw( XrefParser::BaseParser )
;
# Parse a file of HGNC / Ensembl gene stable id pairs and assign direct xrefs.
# All are assumed to be linked to genes.
#
# Each line is split on whitespace into an HGNC identifier followed by a
# stable id.  An HGNC identifier is stored once (add_xref followed by
# add_direct_xref with type "gene") provided a label for it is already in
# the xref table under one of the HGNC sources selected below.
#
# Arguments: $source_id, $species_id, $file.
# Returns 0 on success, 1 if $file could not be opened or no HGNC source
# with a recognised priority description exists.

sub run {

  my ( $self, $source_id, $species_id, $file ) = @_;

  my $hugo_io = $self->get_filehandle($file);

  if ( !defined $hugo_io ) {
    print "Could not open $file\n";
    return 1;
  }

  my $line_count = 0;
  my $xref_count = 0;

  # Because the direct mappings have no descriptions etc. we have to take
  # these from the xrefs stored by the previous HGNC parser, which is why
  # this parser has to run after that one.

  my %label;
  my %version;
  my %description;

  my $dbi = $self->dbi();

  # Get the source ids of the HGNC sources the labels are taken from.  The
  # "_manual"/"_mapped" descriptions are the ones XrefParser::HGNCParser
  # stores its xrefs under; the plain ones are kept for compatibility.
  my %wanted_priority = map { $_ => 1 } qw(
    refseq            uniprot           entrezgene
    refseq_manual     refseq_mapped
    entrezgene_manual entrezgene_mapped
  );

  my $sql = 'select source_id, priority_description from source where name like "HGNC"';
  my $sth = $dbi->prepare($sql);

  $sth->execute();
  my ( $hgnc_source_id, $desc );
  $sth->bind_columns( \$hgnc_source_id, \$desc );
  my @arr;
  while ( $sth->fetch() ) {
    if ( defined($desc) and $wanted_priority{ lc($desc) } ) {
      push @arr, $hgnc_source_id;
    }
  }
  $sth->finish;

  # An empty list would produce the invalid SQL "... in ()".
  if ( !@arr ) {
    print "Could not find any HGNC source to take labels and descriptions from\n";
    $hugo_io->close();
    return 1;
  }

  $sql = "select accession, label, version, description from xref where source_id in (" . join( ",", @arr ) . ")";
  $sth = $dbi->prepare($sql);
  $sth->execute();
  my ( $acc, $lab, $ver );
  $sth->bind_columns( \$acc, \$lab, \$ver, \$desc );
  while ( $sth->fetch() ) {
    $label{$acc}       = $lab;
    $version{$acc}     = $ver;
    $description{$acc} = $desc;
  }
  $sth->finish;

  my $ignore_count    = 0;
  my $ignore_examples = "";
  my %acc;    # HGNC identifiers already stored

  while ( defined( my $line = $hugo_io->getline() ) ) {

    my ( $hgnc, $stable_id ) = split( ' ', $line );

    # Blank lines carry no identifier at all.
    next if ( !defined($hgnc) );

    # No label means the HGNC parser did not store this identifier; keep the
    # first few as examples for the summary.
    if ( !defined( $label{$hgnc} ) ) {
      $ignore_count++;
      if ( $ignore_count < 10 ) {
        $ignore_examples .= " " . $hgnc;
      }
      next;
    }

    # Only the first line seen for an HGNC identifier is stored.
    next if ( defined( $acc{$hgnc} ) );
    $acc{$hgnc} = 1;

    $line_count++;

    my $xref_id = $self->add_xref( $hgnc,
                                   $version{$hgnc},
                                   $label{$hgnc} || $hgnc,
                                   $description{$hgnc},
                                   $source_id,
                                   $species_id );
    $xref_count++;

    $self->add_direct_xref( $xref_id, $stable_id, "gene", "" );
  }

  print "Parsed $line_count HGNC identifiers from $file, added $xref_count xrefs and $line_count direct_xrefs\n";

  if ($ignore_count) {
    print $ignore_count . " ignoreed due to numbers no identifiers being no longer valid :- $ignore_examples\n";
  }

  $hugo_io->close();

  return 0;
}
1
;
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment