From 39f0a770ac136e6c9f6e935739844f5cb604021d Mon Sep 17 00:00:00 2001
From: Magali Ruffier <mr6@ebi.ac.uk>
Date: Thu, 2 Oct 2014 16:01:30 +0100
Subject: [PATCH] first pass CGNCParser for chicken names recovered from cvs
 commits in 2012, not tested

---
 .../xref_mapping/XrefMapper/gallus_gallus.pm  | 66 +++++++++++++++++++
 .../xref_mapping/XrefParser/CGNCParser.pm     | 65 ++++++++++++++++++
 misc-scripts/xref_mapping/xref_config.ini     | 13 ++++
 3 files changed, 144 insertions(+)
 create mode 100644 misc-scripts/xref_mapping/XrefMapper/gallus_gallus.pm
 create mode 100644 misc-scripts/xref_mapping/XrefParser/CGNCParser.pm

diff --git a/misc-scripts/xref_mapping/XrefMapper/gallus_gallus.pm b/misc-scripts/xref_mapping/XrefMapper/gallus_gallus.pm
new file mode 100644
index 0000000000..b0ca583cd3
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefMapper/gallus_gallus.pm
@@ -0,0 +1,66 @@
+package XrefMapper::gallus_gallus;
+
+use  XrefMapper::BasicMapper;
+
+use vars '@ISA';
+
+@ISA = qw{ XrefMapper::BasicMapper };
+use strict;
+
+
+# Gene display-xref configuration for chicken.
+#
+# Returns an array ref holding (1) a ref to the list of source names, in the
+# order written below, and (2) a ref to a hash of labelled SQL statements,
+# each selecting object_xref_ids whose labels should not be used.
+sub gene_display_xref_sources {
+  my $self = shift;
+
+  my @sources = (
+    'CGNC',
+    'RFAM',
+    'miRBase',
+    'Uniprot_gn',
+    'EntrezGene',
+  );
+
+  # EntrezGene: don't use EntrezGene labels dependent on predicted RefSeqs.
+  # LOC_prefix: don't use labels starting with LOC.
+  my %skip = ( 'EntrezGene' => <<IEG, 'LOC_prefix' => <<LOCP );
+SELECT DISTINCT ox.object_xref_id
+  FROM object_xref ox, dependent_xref dx, 
+       xref xmas, xref xdep, 
+       source smas, source sdep
+    WHERE ox.xref_id = dx.dependent_xref_id AND
+          dx.dependent_xref_id = xdep.xref_id AND
+          dx.master_xref_id = xmas.xref_id AND
+          xmas.source_id = smas.source_id AND
+          xdep.source_id = sdep.source_id AND
+          smas.name like "Refseq%predicted" AND
+          sdep.name like "EntrezGene" AND
+          ox.ox_status = "DUMP_OUT"      
+IEG
+SELECT object_xref_id
+  FROM object_xref JOIN xref USING(xref_id) JOIN source USING(source_id)
+   WHERE ox_status = 'DUMP_OUT' AND label REGEXP '^LOC[[:digit:]]+'
+LOCP
+
+  return [ \@sources, \%skip ];
+}
+
+
+# Returns the source names listed below as a flat list, in this order.
+# NOTE(review): presumably this is the preference order applied when a
+# gene description is chosen -- confirm against XrefMapper::BasicMapper.
+# Takes no arguments besides the invocant, which is not used.
+sub gene_description_sources {
+  return qw(
+    miRBase RFAM CGNC
+    Uniprot/SWISSPROT Uniprot/Varsplic
+    RefSeq_peptide RefSeq_mRNA
+    Uniprot/SPTREMBL
+  );
+}
+
+
+1;
diff --git a/misc-scripts/xref_mapping/XrefParser/CGNCParser.pm b/misc-scripts/xref_mapping/XrefParser/CGNCParser.pm
new file mode 100644
index 0000000000..818b28806a
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefParser/CGNCParser.pm
@@ -0,0 +1,65 @@
+package XrefParser::CGNCParser;
+
+use strict;
+use warnings;
+use Carp;
+use DBI;
+
+use base qw(XrefParser::BaseParser);
+
+# Load a CGNC dump: every row whose second column contains ENSGAL becomes a
+# CGNC xref that is linked directly to that gene.
+#
+# $ref_arg is a hashref with source_id, species_id and files (arrayref, only
+# the first entry is read).  Rows are tab-separated: column 2 is the Ensembl
+# id, column 3 the accession/label, column 4 (optional) the description.
+# Returns 0, or 1 if the file cannot be opened; croaks on a missing argument.
+sub run {
+  my ($self, $ref_arg) = @_;
+  my $source_id    = $ref_arg->{source_id};
+  my $species_id   = $ref_arg->{species_id};
+  my $files        = $ref_arg->{files};
+
+  if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){
+    croak "Need to pass source_id, species_id and files as pairs";
+  }
+
+  my $file = $files->[0];
+  my $file_io = $self->get_filehandle($file);
+  if ( !defined $file_io ) {
+    print STDERR "ERROR: Could not open $file\n";
+    return 1;    # 1 is an error
+  }
+
+  # The id passed in is replaced by the one registered under the name "CGNC".
+  $source_id = $self->get_source_id_for_source_name("CGNC");
+  my $count = 0;
+  # A lexical line variable keeps the caller's $_ intact, and the explicit
+  # defined() stops a last line of "0" from ending the loop early.
+  while ( defined( my $line = $file_io->getline() ) ) {
+    # e.g. 48941  ENSGALG00000002652      FZD10
+    chomp $line;
+    my @fields = split /\t/x, $line;
+    my ($ensid, $acc, $desc) = @fields[1 .. 3];
+    # Skip blank/truncated rows instead of storing an xref with no accession.
+    next if !defined $ensid or !defined $acc;
+    if($ensid =~ /ENSGAL/){
+      my $xref_id = $self->add_xref({ acc        => $acc,
+                                      version    => 0,
+                                      label      => $acc,
+                                      desc       => $desc,
+                                      source_id  => $source_id,
+                                      species_id => $species_id,
+                                      info_type  => "DIRECT"} );
+      $self->add_direct_xref( $xref_id, $ensid, "Gene", '');
+      $count++;
+    } else{
+      print STDERR "No match for $acc\n";
+    }
+  }
+  $file_io->close();    # release the handle once the file has been read
+  print "$count direct CGNC xrefs added\n";
+  return 0;
+}
+
+1;
diff --git a/misc-scripts/xref_mapping/xref_config.ini b/misc-scripts/xref_mapping/xref_config.ini
index 623927544d..d1ec7b52b6 100644
--- a/misc-scripts/xref_mapping/xref_config.ini
+++ b/misc-scripts/xref_mapping/xref_config.ini
@@ -1129,6 +1129,18 @@ parser          = HGNCParser
 release_uri     =
 data_uri        =
 
+[source CGNC::gallus_gallus]
+# Used by chicken
+name            = CGNC
+download        = Y
+order           = 30
+priority        = 1
+prio_descr      = cgnc
+parser          = CGNCParser
+release_uri     =
+data_uri        = script:wget=>http://birdgenenames.org/cgnc/downloads.jsp,
+
+
 [source HGNC::homo_sapiens#01]
 # Used by homo_sapiens
 name            = HGNC
@@ -6050,6 +6062,7 @@ source          = UniParc::MULTI
 source          = RFAM::MULTI
 source		= miRBase::MULTI
 source 		= ArrayExpress::MULTI
+source          = CGNC::gallus_gallus
 
 [species gasterosteus_aculeatus]
 taxonomy_id     = 69293
-- 
GitLab