From cbeca63af15421ada9820bf3a1435172f92d112b Mon Sep 17 00:00:00 2001
From: Ian Longden <ianl@sanger.ac.uk>
Date: Wed, 17 Nov 2004 11:18:19 +0000
Subject: [PATCH] Check that the species is correct. This makes parsing a
 little slower for species-specific files, but the same modules can now
 be used to parse a file that contains a mixture of species.

---
 .../xref_mapping/XrefParser/GOParser.pm       | 48 ++++++++++---------
 .../XrefParser/RefSeqGPFFParser.pm            | 22 +++++----
 .../xref_mapping/XrefParser/RefSeqParser.pm   | 20 +++++---
 3 files changed, 53 insertions(+), 37 deletions(-)

diff --git a/misc-scripts/xref_mapping/XrefParser/GOParser.pm b/misc-scripts/xref_mapping/XrefParser/GOParser.pm
index 1739761ae5..1874f84698 100644
--- a/misc-scripts/xref_mapping/XrefParser/GOParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/GOParser.pm
@@ -47,31 +47,35 @@ sub run {
 
   open(GO,"<".$file) || die "Could not open $file\n";
 
+  my $taxon_line = "taxon:".$species_id;
+
   while (<GO>) {
-    chomp;
-    my @array = split (/\t/,$_);
-    $array[9] =~ s/\'/\\\'/g;
-    my $master=0;
-    if($array[0] =~ /ENSEMBL/){
-      #these might be good for a check
-      # match GO to Uniprot
-      # match Uniprot to ENSEMBL
-      # check ENSEMBL's are the same.
-    }
-    elsif($array[0] =~ /RefSeq/){
-      if($refseq{$array[1]}){
-	 XrefParser::BaseParser->add_to_xrefs($refseq{$array[1]},$array[4],'',$array[4],'',$array[6],$source_id,$species_id);
-	 $count++;
+    if(/\Q$taxon_line\E(?!\d)/){ # match the whole taxon id only: taxon:9606 must not accept taxon:96060
+      chomp;
+      my @array = split (/\t/,$_);
+      $array[9] =~ s/\'/\\\'/g;
+      my $master=0;
+      if($array[0] =~ /ENSEMBL/){
+	#these might be good for a check
+	# match GO to Uniprot
+	# match Uniprot to ENSEMBL
+	# check ENSEMBL's are the same.
       }
-    }
-    elsif($array[0] =~ /UniProt/){
-      if($swiss{$array[1]}){
-	XrefParser::BaseParser->add_to_xrefs($swiss{$array[1]},$array[4],'',$array[4],'',$array[6],$source_id,$species_id);
-	$count++;
+      elsif($array[0] =~ /RefSeq/){
+	if($refseq{$array[1]}){
+	  XrefParser::BaseParser->add_to_xrefs($refseq{$array[1]},$array[4],'',$array[4],'',$array[6],$source_id,$species_id);
+	  $count++;
+	}
+      }
+      elsif($array[0] =~ /UniProt/){
+	if($swiss{$array[1]}){
+	  XrefParser::BaseParser->add_to_xrefs($swiss{$array[1]},$array[4],'',$array[4],'',$array[6],$source_id,$species_id);
+	  $count++;
+	}
+      }
+      else{
+	print STDERR "unknown type ".$array[0]."\n";
       }
-    }
-    else{
-      print STDERR "unknown type ".$array[0]."\n";
     }
   }
   print "\t$count GO dependent xrefs added\n"; 
diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm
index 0d4f0685a9..a2db83927d 100644
--- a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm
@@ -17,7 +17,7 @@ use vars qw(@ISA);
 if (!defined(caller())) {
 
   if (scalar(@ARGV) != 1) {
-    print "\nUsage: RefSeqGPFFParser.pm file.SPC\n\n";
+    print "\nUsage: RefSeqGPFFParser.pm file.SPC <source_id>\n\n";
     exit(1);
   }
 
@@ -32,13 +32,16 @@ sub run {
   my $self = shift if (defined(caller(1)));
   my $file = shift;
   my $source_id = shift;
+  my $species_id = shift;
 
   if ($source_id < 1) {
     $source_id =  XrefParser::BaseParser->get_source_id_for_filename(basename($file));
-    print "Source id for $file: $source_id\n";
+  }
+  if(!defined($species_id)){
+    $species_id = XrefParser::BaseParser->get_species_id_for_filename($file);
   }
 
-   XrefParser::BaseParser->upload_xrefs(create_xrefs($source_id, $file));
+   XrefParser::BaseParser->upload_xrefs(create_xrefs($source_id, $file, $species_id));
 
 }
 
@@ -51,7 +54,7 @@ sub run {
 
 sub create_xrefs {
 
-  my ($source_id, $file) = @_;
+  my ($source_id, $file, $species_id) = @_;
 
   my %name2species_id =  XrefParser::BaseParser->name2species_id();
 
@@ -76,10 +79,10 @@ sub create_xrefs {
     $species =~ s/^\s*//g;
     $species =~ s/\s+/_/g;
     $species =~ s/\n//g;
-    my $species_id = $name2species_id{$species};
-
+    my $species_id_check = $name2species_id{$species};
+    
     # skip xrefs for species that aren't in the species table
-    if (defined $species_id) {
+    if (defined($species_id) and defined($species_id_check) and $species_id == $species_id_check) { # compare ids; '=' here would overwrite the requested $species_id
       
       my ($acc) = $entry =~ /ACCESSION\s+(\S+)/;
       my ($ver) = $entry =~ /VERSION\s+(\S+)/;
@@ -163,7 +166,10 @@ sub create_xrefs {
       }
       push @xrefs, $xref;
 
-    } # if defined species
+    }# if defined species
+    else{ #### REMOVE after TESTING
+      print "not correct $species $species_id  NE $species_id_check\n";
+    }
 
   } # while <REFSEQ>
 
diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm
index a617465349..aa41c15743 100644
--- a/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm
@@ -17,7 +17,7 @@ use vars qw(@ISA);
 if (!defined(caller())) {
 
   if (scalar(@ARGV) != 1) {
-    print "\nUsage: RefSeqParser.pm file.SPC\n\n";
+    print "\nUsage: RefSeqParser.pm file.SPC <source_id> <species_id>\n\n";
     exit(1);
   }
 
@@ -32,13 +32,16 @@ sub run {
   my $self = shift if (defined(caller(1)));
   my $file = shift;
   my $source_id = shift;
+  my $species_id = shift;
 
-  if ($source_id < 1) {
+  if (!defined($source_id) or $source_id < 1) {
     $source_id = XrefParser::BaseParser->get_source_id_for_filename(basename($file));
-    print "Source id for $file: $source_id\n";
+  }
+  if(!defined($species_id)){
+    $species_id = XrefParser::BaseParser->get_species_id_for_filename($file);
   }
 
-  XrefParser::BaseParser->upload_xrefs(create_xrefs($source_id, $file));
+  XrefParser::BaseParser->upload_xrefs(create_xrefs($source_id, $file, $species_id));
 
 }
 
@@ -51,7 +54,7 @@ sub run {
 
 sub create_xrefs {
 
-  my ($source_id, $file) = @_;
+  my ($source_id, $file, $species_id) = @_;
 
   my %name2species_id = XrefParser::BaseParser->name2species_id();
 
@@ -92,10 +95,10 @@ sub create_xrefs {
     $species = lc $species;
     $species =~ s/ /_/;
 
-    my $species_id = $name2species_id{$species};
+    my $species_id_check = $name2species_id{$species};
 
     # skip xrefs for species that aren't in the species table
-    if (defined $species_id) {
+    if (defined($species_id) and defined($species_id_check) and $species_id == $species_id_check) { # compare ids; '=' here would overwrite the requested $species_id
 
       my ($acc_no_ver,$ver) = split (/\./,$acc);
       $xref->{ACCESSION} = $acc_no_ver;
@@ -111,6 +114,9 @@ sub create_xrefs {
       push @xrefs, $xref;
 
     }
+    else{
+      print "not correct species $species_id  NE $species_id_check\n";
+    }
 
   }
 
-- 
GitLab