From 39f0a770ac136e6c9f6e935739844f5cb604021d Mon Sep 17 00:00:00 2001
From: Magali Ruffier <mr6@ebi.ac.uk>
Date: Thu, 2 Oct 2014 16:01:30 +0100
Subject: [PATCH] first pass CGNCParser for chicken names recovered from cvs commits in 2012, not tested

---
 Review note: the two new Perl files below were revised after the original
 commit, so the blob ids on their "index" lines describe the earlier content.

 .../xref_mapping/XrefMapper/gallus_gallus.pm | 62 ++++++++++++++
 .../xref_mapping/XrefParser/CGNCParser.pm    | 83 +++++++++++++++++++
 misc-scripts/xref_mapping/xref_config.ini    | 13 +++
 3 files changed, 158 insertions(+)
 create mode 100644 misc-scripts/xref_mapping/XrefMapper/gallus_gallus.pm
 create mode 100644 misc-scripts/xref_mapping/XrefParser/CGNCParser.pm

diff --git a/misc-scripts/xref_mapping/XrefMapper/gallus_gallus.pm b/misc-scripts/xref_mapping/XrefMapper/gallus_gallus.pm
new file mode 100644
index 0000000000..b0ca583cd3
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefMapper/gallus_gallus.pm
@@ -0,0 +1,62 @@
+package XrefMapper::gallus_gallus;
+
+use strict;
+use warnings;
+
+use base qw(XrefMapper::BasicMapper);
+
+# Returns an array reference [ \@sources, \%ignore ] for gene display xrefs.
+# @sources holds the source names in the order written below; %ignore maps
+# a key to an SQL query selecting object_xref_ids whose labels are not used.
+sub gene_display_xref_sources {
+  my $self = shift;
+
+  my @list = qw(CGNC
+                RFAM
+                miRBase
+                Uniprot_gn
+                EntrezGene);
+
+  my %ignore;
+
+  # Don't use EntrezGene labels dependent on predicted RefSeqs.
+  # The quoted heredoc tag keeps the SQL literal (no interpolation).
+  $ignore{'EntrezGene'} = <<'IEG';
+SELECT DISTINCT ox.object_xref_id
+  FROM object_xref ox, dependent_xref dx,
+       xref xmas, xref xdep,
+       source smas, source sdep
+ WHERE ox.xref_id = dx.dependent_xref_id AND
+       dx.dependent_xref_id = xdep.xref_id AND
+       dx.master_xref_id = xmas.xref_id AND
+       xmas.source_id = smas.source_id AND
+       xdep.source_id = sdep.source_id AND
+       smas.name like "Refseq%predicted" AND
+       sdep.name like "EntrezGene" AND
+       ox.ox_status = "DUMP_OUT"
+IEG
+
+  # Don't use labels starting with LOC.
+  $ignore{'LOC_prefix'} = <<'LOCP';
+SELECT object_xref_id
+  FROM object_xref JOIN xref USING(xref_id) JOIN source USING(source_id)
+ WHERE ox_status = 'DUMP_OUT' AND label REGEXP '^LOC[[:digit:]]+'
+LOCP
+
+  return [\@list,\%ignore];
+}
+
+# Returns the list of source names used for gene descriptions, in the
+# order written below.
+sub gene_description_sources {
+  return ("miRBase",
+          "RFAM",
+          "CGNC",
+          "Uniprot/SWISSPROT",
+          "Uniprot/Varsplic",
+          "RefSeq_peptide",
+          "RefSeq_mRNA",
+          "Uniprot/SPTREMBL" );
+}
+
+1;
diff --git a/misc-scripts/xref_mapping/XrefParser/CGNCParser.pm b/misc-scripts/xref_mapping/XrefParser/CGNCParser.pm
new file mode 100644
index 0000000000..818b28806a
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefParser/CGNCParser.pm
@@ -0,0 +1,83 @@
+package XrefParser::CGNCParser;
+
+use strict;
+use warnings;
+use Carp;
+use DBI;
+
+use base qw(XrefParser::BaseParser);
+
+# Reads a tab-separated file of CGNC chicken gene names and stores one
+# direct gene xref for every line whose second column matches ENSGAL.
+#
+# Takes a hash reference with these keys:
+#   source_id  - required; replaced by the id looked up for "CGNC"
+#   species_id - required; stored on every xref
+#   files      - required; array reference, only the first entry is read
+#
+# Example input line (columns are tab-separated):
+#   48941 ENSGALG00000002652 FZD10
+# Column 2 is the stable id, column 3 the accession (also used as the
+# label) and column 4, if present, the description.
+#
+# Returns 0 on success and 1 if the file could not be opened; croaks if
+# a required argument is missing.
+sub run {
+  my ($self, $ref_arg) = @_;
+  my $source_id  = $ref_arg->{source_id};
+  my $species_id = $ref_arg->{species_id};
+  my $files      = $ref_arg->{files};
+
+  if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){
+    croak "Need to pass source_id, species_id and files as pairs";
+  }
+
+  my $file = $files->[0];
+
+  my $file_io = $self->get_filehandle($file);
+  if ( !defined $file_io ) {
+    print STDERR "ERROR: Could not open $file\n";
+    return 1;  # 1 is an error
+  }
+
+  $source_id = $self->get_source_id_for_source_name("CGNC");
+
+  my $count = 0;
+
+  # A lexical line variable leaves the caller's $_ alone, and defined()
+  # stops a final unterminated "0" line from ending the loop early.
+  while ( defined( my $line = $file_io->getline() ) ) {
+    chomp $line;
+    next if $line !~ /\S/;  # nothing to parse on a blank line
+
+    my (undef, $ensid, $acc, $desc) = split /\t/, $line;
+
+    # Lines with fewer than three columns have no stable id or accession;
+    # report them instead of warning about undefined values below.
+    if ( !defined $ensid or !defined $acc or $acc eq '' ) {
+      print STDERR "Skipping malformed line: $line\n";
+      next;
+    }
+
+    if($ensid =~ /ENSGAL/){
+      my $xref_id = $self->add_xref({ acc        => $acc,
+                                      version    => 0,
+                                      label      => $acc,
+                                      desc       => $desc,
+                                      source_id  => $source_id,
+                                      species_id => $species_id,
+                                      info_type  => "DIRECT"} );
+
+      $self->add_direct_xref( $xref_id, $ensid, "Gene", '');
+      $count++;
+    } else{
+      print STDERR "No match for $acc\n";
+    }
+  }
+  $file_io->close();
+
+  print "$count direct CGNC xrefs added\n";
+  return 0;
+}
+
+1;
diff --git a/misc-scripts/xref_mapping/xref_config.ini b/misc-scripts/xref_mapping/xref_config.ini
index 623927544d..d1ec7b52b6 100644
--- a/misc-scripts/xref_mapping/xref_config.ini
+++ b/misc-scripts/xref_mapping/xref_config.ini
@@ -1129,6 +1129,18 @@ parser = HGNCParser
 release_uri =
 data_uri =
 
+[source CGNC::gallus_gallus]
+# Used by chicken
+name = CGNC
+download = Y
+order = 30
+priority = 1
+prio_descr = cgnc
+parser = CGNCParser
+release_uri =
+data_uri = script:wget=>http://birdgenenames.org/cgnc/downloads.jsp,
+
+
 [source HGNC::homo_sapiens#01]
 # Used by homo_sapiens
 name = HGNC
@@ -6050,6 +6062,7 @@ source = UniParc::MULTI
 source = RFAM::MULTI
 source = miRBase::MULTI
 source = ArrayExpress::MULTI
+source = CGNC::gallus_gallus
 
 [species gasterosteus_aculeatus]
 taxonomy_id = 69293
-- 
GitLab