From 07a96f2abb300a26a95b7415ea7d954303302095 Mon Sep 17 00:00:00 2001
From: Monika Komorowska <mk8@sanger.ac.uk>
Date: Wed, 15 Feb 2012 14:30:48 +0000
Subject: [PATCH] new parser for Orphanet xrefs dependent on HGNC identifiers

---
 .../xref_mapping/XrefParser/OrphanetParser.pm | 131 ++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 misc-scripts/xref_mapping/XrefParser/OrphanetParser.pm

diff --git a/misc-scripts/xref_mapping/XrefParser/OrphanetParser.pm b/misc-scripts/xref_mapping/XrefParser/OrphanetParser.pm
new file mode 100644
index 0000000000..520961be38
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefParser/OrphanetParser.pm
@@ -0,0 +1,131 @@
+package XrefParser::OrphanetParser;
+
+use strict;
+use warnings;
+use Carp;
+use File::Basename;
+
+use XML::LibXML;
+
+use base qw( XrefParser::BaseParser );
+
+sub run {
+  # Parse an Orphanet XML dump and store each disorder as a dependent xref
+  # of the HGNC xref(s) of its associated genes.
+  #
+  # Arguments: one hashref with the keys
+  #   source_id  - id of the Orphanet source (required)
+  #   species_id - id of the species (required)
+  #   files      - arrayref; only the first file name is parsed (required)
+  #   verbose    - optional flag, enables the progress message on STDERR
+  # Returns 0 if at least one dependent xref was added, 1 otherwise.
+  # Croaks on missing arguments or when the XML has no JDBOR root element.
+
+  my ($self, $ref_arg) = @_;
+  my $source_id  = $ref_arg->{source_id};
+  my $species_id = $ref_arg->{species_id};
+  my $files      = $ref_arg->{files};
+  my $verbose    = $ref_arg->{verbose};
+
+  if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){
+    croak "Need to pass source_id, species_id and file as pairs";
+  }
+  $verbose |=0;
+
+  my $xml_file = $files->[0];
+  croak "No Orphanet file name given" if (!defined $xml_file);
+
+  print STDERR "Orphanet file to parse, $xml_file\n" if($verbose);
+
+  my $xml_parser   = XML::LibXML->new();
+  my $orphanet_doc = $xml_parser->parse_file($xml_file);
+
+  # The release is the 'version' attribute of the JDBOR root element; a
+  # document without that root cannot be an Orphanet dump.
+  my ($jdbor_node) = $orphanet_doc->findnodes('JDBOR');
+  croak "No JDBOR root element found in $xml_file" if (!defined $jdbor_node);
+  my $release = $jdbor_node->getAttribute('version');
+  $self->set_release( $source_id,$release );
+
+  # HGNC reference => { OrphaNumber => disorder name }
+  my %gene_disorders;
+  my $gene_disorder_count = 0;
+  my $disorder_count = 0;
+
+  DISORDER:
+  foreach my $disorder ($orphanet_doc->findnodes('JDBOR/DisorderList/Disorder')) {
+    my ($orpha_number_node) = $disorder->findnodes('./OrphaNumber');
+    my ($name_node)         = $disorder->findnodes('./Name');
+
+    # Without both an accession and a name no xref can be built; skip the
+    # record instead of dying on an undefined node.
+    if (!defined $orpha_number_node or !defined $name_node) {
+      print STDERR "Skipping Orphanet disorder without OrphaNumber or Name\n";
+      next DISORDER;
+    }
+    my $orpha_number = $orpha_number_node->to_literal;
+    my $name         = $name_node->to_literal;
+
+    my @genes = $disorder->findnodes('./GeneList/Gene');
+    $disorder_count++ if (@genes);
+
+    foreach my $gene (@genes) {
+      # Find the HGNC reference of this gene; if several are listed the
+      # last one wins.
+      my $ref;
+      foreach my $external_reference_node ($gene->findnodes('./ExternalReferenceList/ExternalReference')) {
+        my ($source_node) = $external_reference_node->findnodes('./Source');
+        next if (!defined $source_node or $source_node->to_literal !~ /HGNC/);
+        my ($ref_node) = $external_reference_node->findnodes('./Reference');
+        $ref = $ref_node->to_literal if (defined $ref_node);
+      }
+      next if (!defined $ref);
+
+      $gene_disorders{$ref}{$orpha_number} = $name;
+      $gene_disorder_count++;
+    }
+  }
+
+  print  "Parsed $disorder_count disorders\n";
+  print  "Found $gene_disorder_count genes associated with disorders\n";
+
+  #get the mapping that are already there so that we don't get lots of duplicates.
+  # stored in the global hash xref_dependent_mapped.
+  $self->get_dependent_mappings($source_id);
+
+  # Looked up by HGNC reference; each value is used as a list of master xref ids.
+  my (%hgnc) = %{$self->get_valid_codes("HGNC", $species_id)};
+
+  print "got " . scalar(keys (%hgnc)) . " HGNC entries\n";
+  print "species_id, source_id: $species_id, $source_id\n";
+
+  my $added = 0;
+  my %Orphanet_xrefs;
+
+  foreach my $hgnc_acc (keys (%gene_disorders)) {
+    # Get the master_xref_id(s); references with no HGNC entry are only reported.
+    if(!defined($hgnc{$hgnc_acc})){
+      print STDERR "failed to get the master_xref_id for HGNC, $hgnc_acc!\n";
+      next;
+    }
+
+    foreach my $master_xref_id (@{$hgnc{$hgnc_acc}}){
+      foreach my $orpha_number (keys %{$gene_disorders{$hgnc_acc}}) {
+        $self->add_dependent_xref({ master_xref_id => $master_xref_id,
+                                    acc            => $orpha_number,
+                                    source_id      => $source_id,
+                                    species_id     => $species_id,
+                                    desc           => $gene_disorders{$hgnc_acc}{$orpha_number} });
+        $Orphanet_xrefs{$orpha_number}++;
+        $added++;
+      }
+    }
+  }
+
+  print "Added " . scalar(keys %Orphanet_xrefs) . " Orphanet xrefs and " . $added . " dependent xrefs\n";
+
+  # 0 signals that xrefs were added, 1 that nothing was added.
+  return ($added > 0) ? 0 : 1;
+}
+
+1;
-- 
GitLab