Skip to content
Snippets Groups Projects
Commit fe90ff9e authored by Ian Longden
Browse files

HUGO -> HGNC

parent ee0c8cb9
No related branches found
No related tags found
No related merge requests found
package XrefParser::HGNCParser;
use strict;
use File::Basename;
use base qw( XrefParser::BaseParser );
my $xref_sth ;
my $dep_sth;
my $syn_sth;
# --------------------------------------------------------------------------------
# Script entry point: when this file is executed directly (there is no calling
# frame), validate the command line and pass the arguments on to run().
#
# NOTE(review): the check demands exactly one argument although the usage text
# lists three, and run() unpacks its arguments as (source_id, species_id, file).
# Confirm the intended command-line contract.
unless (defined caller()) {
    if (@ARGV != 1) {
        print "\nUsage: HGNCParser.pm file <source_id> <species_id>\n\n";
        exit(1);
    }

    run(@ARGV);
}
# Parse the HGNC download file and store HGNC xrefs.
#
# Each record is attached as a dependent xref to every RefSeq / EntrezGene
# master xref it references (manually curated and mapped columns are kept
# under separate HGNC priority sources).  Records that reference no known
# master are stored as plain xrefs under $source_id so that their label and
# description are kept.
#
# Arguments (after the invocant, which is only consumed when run() is called
# from inside another subroutine):
#   $source_id  - source id for unlinked HGNC records; looked up from the
#                 file name when undefined
#   $species_id - species id; looked up from the file name when undefined
#   $file       - HGNC file: tab separated, first line is a header
#
# Returns 0 on success and 1 when the file cannot be opened.  Dies when one
# of the four HGNC priority sources cannot be found.
sub run {
    # Declare first, assign conditionally: "my $x = ... if COND" has
    # undefined behaviour in Perl.
    my $self;
    $self = shift if defined(caller(1));
    my $source_id  = shift;
    my $species_id = shift;
    my $file       = shift;

    print STDERR "source = $source_id\tspecies = $species_id\n";

    if (!defined($source_id)) {
        $source_id = XrefParser::BaseParser->get_source_id_for_filename($file);
    }
    if (!defined($species_id)) {
        $species_id = XrefParser::BaseParser->get_species_id_for_filename($file);
    }

    # One HGNC source per (master database, curation level) combination.
    my %hgnc_source;
    foreach my $priority (qw(refseq_manual refseq_mapped entrezgene_manual entrezgene_mapped)) {
        $hgnc_source{$priority} =
            XrefParser::BaseParser->get_source_id_for_source_name("HGNC", $priority);
        if (!defined($hgnc_source{$priority})) {
            die "Could not get source id for HGNC with priority description of $priority\n";
        }
    }

    # Accession -> master xref id lookups for the databases HGNC links to.
    my %refseq = %{ XrefParser::BaseParser->get_valid_codes("refseq", $species_id) };
    my %entrezgene = %{ XrefParser::BaseParser->get_valid_xrefs_for_dependencies(
        "EntrezGene", "refseq_peptide", "refseq_dna") };

    my $refseq_count     = 0;
    my $entrezgene_count = 0;
    my $mismatch         = 0;

    # File columns:
    #   0 HGNC ID                          primary accession
    #   1 Approved Symbol                  label
    #   2 Approved Name                    description
    #   3 Previous Symbols                 synonyms
    #   4 Aliases                          synonyms
    #   5 EntrezGene ID, manually curated
    #   6 RefSeq ID, manually curated
    #   7 EntrezGene ID, mapped
    #   8 RefSeq ID, mapped
    #
    # Each entry: [ column, master lookup, HGNC source id, counter ].
    # The order is the order in which the dependent xrefs are stored.
    my @links = (
        [ 6, \%refseq,     $hgnc_source{refseq_manual},     \$refseq_count ],
        [ 8, \%refseq,     $hgnc_source{refseq_mapped},     \$refseq_count ],
        [ 5, \%entrezgene, $hgnc_source{entrezgene_manual}, \$entrezgene_count ],
        [ 7, \%entrezgene, $hgnc_source{entrezgene_mapped}, \$entrezgene_count ],
    );

    my $hugo_io = $self->get_filehandle($file);
    if (!defined $hugo_io) {
        print "ERROR: Can't open HGNC file $file\n";
        return 1;
    }

    $hugo_io->getline();    # discard the header line

    # Test for definedness so that a final line such as "0" without a
    # newline does not end the loop early.
    while (defined(my $line = $hugo_io->getline())) {
        chomp $line;

        # split drops trailing empty fields, so later columns may be undef.
        my @array = split(/\t/, $line);

        # Blank lines carry no accession; nothing can be stored for them.
        next if !defined($array[0]) || $array[0] eq '';

        # Previous symbols and aliases are both stored as synonyms.
        my @synonyms;
        foreach my $column (3, 4) {
            next if !defined($array[$column]);
            push @synonyms, split(/,\s*/, $array[$column]);
        }

        my $seen = 0;
        foreach my $link (@links) {
            my ($column, $master_for, $link_source_id, $counter) = @{$link};

            my $master_acc = $array[$column];
            next if !defined($master_acc) || $master_acc eq '';

            my $master_xref_id = $master_for->{$master_acc};
            next if !defined($master_xref_id);

            $seen = 1;
            ${$counter}++;

            XrefParser::BaseParser->add_to_xrefs($master_xref_id, $array[0], '',
                $array[1], $array[2], "", $link_source_id, $species_id);

            foreach my $synonym (@synonyms) {
                XrefParser::BaseParser->add_to_syn($array[0], $link_source_id, $synonym);
            }
        }

        if (!$seen) {
            # No master found: store anyway to keep the description etc.
            $mismatch++;
            $self->add_xref($array[0], "", $array[1], $array[2], $source_id, $species_id);
        }
    }

    $hugo_io->close();

    print "Loaded a total of " . ($refseq_count + $entrezgene_count) . " HGNC xrefs, $refseq_count from RefSeq curated mappings and $entrezgene_count from EntrezGene mappings\n";
    print "$mismatch xrefs could not be associated via RefSeq or EntrezGene\n";

    return 0;    # successful
}
# Returns the fixed file name "hugo.txt".
# NOTE(review): presumably the local name the downloaded HGNC file is saved
# under - confirm against the caller.
sub rename_url_file {
    my $local_name = "hugo.txt";
    return $local_name;
}
1;
package XrefParser::HGNC_CCDSParser;
use strict;
use DBI;
use base qw( XrefParser::BaseParser );
# Read a file of "CCDS-number HGNC-accession" pairs (whitespace separated) and
# store each HGNC accession as a direct xref on the Ensembl object that the
# matching CCDS direct xref already points to.
#
#   $source_id  - source id for the xrefs created here
#   $species_id - species id for the xrefs created here
#   $file       - file to read
#
# Returns 0 on success, 1 when the file cannot be opened.
sub run {
    my ($self, $source_id, $species_id, $file) = @_;

    my $hugo_io = $self->get_filehandle($file);
    unless (defined $hugo_io) {
        print "Could not open $file\n";
        return 1;
    }

    # The direct mappings have no label, version or description of their own,
    # so these are copied from the xrefs stored by the earlier HGNC parser.
    # That is why this parser has to be ordered after it.
    # NOTE(review): the source ids 1091, 1092 and 1094 are hard-coded; looking
    # the sources up by name (HGNC) and priority may be more appropriate.
    my (%label, %version, %description);

    my $dbi = $self->dbi();

    my $label_sth = $dbi->prepare(
        "select accession, label, version, description from xref where source_id in (1091, 1092, 1094)");
    $label_sth->execute();
    while (my ($acc, $lab, $ver, $desc) = $label_sth->fetchrow_array()) {
        $label{$acc}       = $lab;
        $version{$acc}     = $ver;
        $description{$acc} = $desc;
    }
    $label_sth->finish;

    # Ensembl stable id and object type for every CCDS direct xref.
    my (%ensembl_stable_id, %ensembl_type);

    my $ccds_sth = $dbi->prepare('select x.accession, d.ensembl_stable_id, d.type
from xref x, direct_xref d, source s
where s.source_id = x.source_id and
x.xref_id = d.general_xref_id and s.name like "CCDS"');
    $ccds_sth->execute();
    while (my ($access, $stable_id, $type) = $ccds_sth->fetchrow_array()) {
        $ensembl_stable_id{$access} = $stable_id;
        $ensembl_type{$access}      = $type;
    }
    $ccds_sth->finish;

    my $line_count      = 0;
    my $xref_count      = 0;
    my $ignore_count    = 0;
    my $ignore_examples = "";
    my %handled;    # HGNC accessions already processed

    while (my $line = $hugo_io->getline()) {
        chomp $line;
        my ($ccds, $hgnc) = split ' ', $line;
        $line_count++;

        # Accessions the earlier parser did not store are skipped; the first
        # nine are remembered for the summary message.
        unless (defined $label{$hgnc}) {
            $ignore_count++;
            $ignore_examples .= " " . $hgnc if $ignore_count < 10;
            next;
        }

        # Only the first line for each HGNC accession is considered.
        # NOTE(review): the accession is marked as handled even when its CCDS
        # has no Ensembl mapping, so later lines for it are skipped too -
        # confirm this is intended.
        next if defined $handled{$hgnc};
        $handled{$hgnc} = 1;

        my $key = "CCDS" . $ccds;
        next unless defined $ensembl_stable_id{$key};

        my $xref_id = $self->add_xref(
            $hgnc,
            $version{$hgnc},
            $label{$hgnc} || $hgnc,
            $description{$hgnc},
            $source_id,
            $species_id
        );
        $self->add_direct_xref($xref_id, $ensembl_stable_id{$key}, $ensembl_type{$key}, "");
        $xref_count++;
    }

    print "Parsed $line_count HGNC identifiers from $file, added $xref_count xrefs and $xref_count direct_xrefs from $line_count lines.\n";
    if ($ignore_count) {
        print $ignore_count." ignoreed due to numbers no identifiers being no longer valid :- $ignore_examples\n";
    }

    $hugo_io->close();
    return 0;
}
1;
package XrefParser::HGNC_ENSGParser;
use strict;
use DBI;
use base qw( XrefParser::BaseParser );
# Read a file of "HGNC-accession Ensembl-gene-stable-id" pairs (whitespace
# separated) and store each HGNC accession as a direct xref on that gene.
#
#   $source_id  - source id for the xrefs created here
#   $species_id - species id for the xrefs created here
#   $file       - file to read
#
# Returns 0 on success, 1 when the file cannot be opened or when no HGNC
# source can be found to take labels and descriptions from.
sub run {
    my ($self, $source_id, $species_id, $file) = @_;

    my $hugo_io = $self->get_filehandle($file);
    if (!defined $hugo_io) {
        print "Could not open $file\n";
        return 1;
    }

    my $line_count = 0;
    my $xref_count = 0;

    # The direct mappings have no label, version or description of their own,
    # so these are copied from the xrefs stored by the earlier HGNC parser.
    # That is why this parser has to be ordered after it.
    my %label;
    my %version;
    my %description;

    my $dbi = $self->dbi();

    # Collect the source ids of the HGNC refseq, entrezgene and uniprot
    # sources.  The "_manual" / "_mapped" variants are accepted as well,
    # because those are the priority descriptions HGNCParser stores under.
    my $sql = 'select source_id, priority_description from source where name like "HGNC"';
    my $sth = $dbi->prepare($sql);
    $sth->execute();
    my ($hgnc_source_id, $priority);
    $sth->bind_columns(\$hgnc_source_id, \$priority);
    my @label_source_ids;
    while ($sth->fetch()) {
        next if !defined $priority;
        if ($priority =~ /\A(?:refseq|uniprot|entrezgene)(?:_(?:manual|mapped))?\z/i) {
            push @label_source_ids, $hgnc_source_id;
        }
    }
    $sth->finish;

    # With no source ids the query below would read "in ()", which is not
    # valid SQL; report the problem instead.
    if (!@label_source_ids) {
        print "Could not find any HGNC sources with a refseq, uniprot or entrezgene priority description\n";
        $hugo_io->close();
        return 1;
    }

    $sql = "select accession, label, version, description from xref where source_id in ("
         . join(", ", @label_source_ids) . ")";
    $sth = $dbi->prepare($sql);
    $sth->execute();
    my ($acc, $lab, $ver, $desc);
    $sth->bind_columns(\$acc, \$lab, \$ver, \$desc);
    while ($sth->fetch()) {
        $label{$acc}       = $lab;
        $version{$acc}     = $ver;
        $description{$acc} = $desc;
    }
    $sth->finish;

    my $ignore_count    = 0;
    my $ignore_examples = "";
    my %acc;    # HGNC accessions already stored

    while (defined(my $line = $hugo_io->getline())) {
        my ($hgnc, $stable_id) = split ' ', $line;

        # Blank or incomplete lines cannot yield a usable direct xref.
        next if !defined $hgnc || !defined $stable_id;

        # Accessions the earlier parser did not store are skipped; the first
        # nine are remembered for the summary message.
        if (!defined($label{$hgnc})) {
            $ignore_count++;
            if ($ignore_count < 10) {
                $ignore_examples .= " ".$hgnc;
            }
            next;
        }

        # Only the first line for each HGNC accession is used.
        next if defined($acc{$hgnc});
        $acc{$hgnc} = 1;

        $line_count++;
        my $xref_id = $self->add_xref($hgnc, $version{$hgnc}, $label{$hgnc} || $hgnc,
                                      $description{$hgnc}, $source_id, $species_id);
        $xref_count++;
        $self->add_direct_xref($xref_id, $stable_id, "gene", "");
    }

    print "Parsed $line_count HGNC identifiers from $file, added $xref_count xrefs and $line_count direct_xrefs\n";
    if ($ignore_count) {
        print $ignore_count." ignoreed due to numbers no identifiers being no longer valid :- $ignore_examples\n";
    }

    $hugo_io->close();
    return 0;
}
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment