diff --git a/misc-scripts/xref_mapping/XrefParser/HGNC_ENSGParser.pm b/misc-scripts/xref_mapping/XrefParser/HGNC_ENSGParser.pm deleted file mode 100644 index 32d0fe9ca5f8334ddb8d4dbf4844570b203ed97a..0000000000000000000000000000000000000000 --- a/misc-scripts/xref_mapping/XrefParser/HGNC_ENSGParser.pm +++ /dev/null @@ -1,108 +0,0 @@ -package XrefParser::HGNC_ENSGParser; - -use strict; - -use DBI; - -use base qw( XrefParser::BaseParser ); - -# Parse file of HGNC records and assign direct xrefs -# All assumed to be linked to genes - -sub run { - - my ($self, $source_id, $species_id, $file) = @_; - - my $hugo_io = $self->get_filehandle($file); - - if ( !defined $hugo_io ) { - print "Could not open $file\n"; - return 1; - } - - my $line_count = 0; - my $xref_count = 0; - - # becouse the direct mapping have no descriptions etc - # we have to steal these fromt he previous HGNC parser. - # This is why the order states this is after the other one. - # maybe 1091,1092 is not right maybe should use name = HGNC and priority = 30r4 ?? - - my %label; - my %version; - my %description; - - my $dbi = $self->dbi(); - - #get the source ids for HGNC refseq, entrezgene and unitprot - my $sql = 'select source_id, priority_description from source where name like "HGNC"'; - my $sth = $dbi->prepare($sql); - - $sth->execute(); - my ($hgnc_source_id, $desc); - $sth->bind_columns(\$hgnc_source_id, \$desc); - my @arr; - while($sth->fetch()){ - push @arr, $hgnc_source_id; - } - $sth->finish; - - $sql = "select accession, label, version, description from xref where source_id in (".join(", ",@arr).")"; - print "$sql\n";; - $sth = $dbi->prepare($sql); - $sth->execute(); - my ($acc, $lab, $ver); - my $hgnc_loaded_count = 0; - $sth->bind_columns(\$acc, \$lab, \$ver, \$desc); - while (my @row = $sth->fetchrow_array()) { - $label{$acc} = $lab; - $version{$acc} = $ver; - $description{$acc} = $desc; - $hgnc_loaded_count++; - } - $sth->finish; - if($hgnc_loaded_count == 0){ - die "No point continuing no hgncs there\n"; - } - - print "$hgnc_loaded_count HGNC's to be used as labels\n"; - - my $ignore_count = 0; - my $ignore_examples =""; - my %acc; - - while ( $_ = $hugo_io->getline() ) { - - my ($hgnc, $stable_id) = split; - - if(!defined($label{$hgnc})){ - $ignore_count++; - if($ignore_count < 10){ - $ignore_examples .= " ".$hgnc; - } - next; - } - if(!defined($acc{$hgnc})){ - $acc{$hgnc} = 1; - my $version =""; - $line_count++; - - my $xref_id = $self->add_xref($hgnc, $version{$hgnc} , $label{$hgnc}||$hgnc , $description{$hgnc}, $source_id, $species_id, "DIRECT"); - $xref_count++; - - - $self->add_direct_xref($xref_id, $stable_id, "gene", ""); - } - } - - print "Parsed $line_count HGNC identifiers from $file, added $xref_count xrefs and $line_count direct_xrefs\n"; - if($ignore_count){ - print $ignore_count." ignoreed due to numbers no identifiers being no longer valid :- $ignore_examples\n"; - } - - $hugo_io->close(); - - return 0; -} - -1;