From cbeca63af15421ada9820bf3a1435172f92d112b Mon Sep 17 00:00:00 2001 From: Ian Longden <ianl@sanger.ac.uk> Date: Wed, 17 Nov 2004 11:18:19 +0000 Subject: [PATCH] check the species is correct. This will make the parsing a little slower for those species specific files but the same modules can now be used if you have to parse a file with a mixture of species. --- .../xref_mapping/XrefParser/GOParser.pm | 48 ++++++++++--------- .../XrefParser/RefSeqGPFFParser.pm | 22 +++++---- .../xref_mapping/XrefParser/RefSeqParser.pm | 20 +++++--- 3 files changed, 53 insertions(+), 37 deletions(-) diff --git a/misc-scripts/xref_mapping/XrefParser/GOParser.pm b/misc-scripts/xref_mapping/XrefParser/GOParser.pm index 1739761ae5..1874f84698 100644 --- a/misc-scripts/xref_mapping/XrefParser/GOParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/GOParser.pm @@ -47,31 +47,35 @@ sub run { open(GO,"<".$file) || die "Could not open $file\n"; + my $taxon_line = "taxon:".$species_id; + while (<GO>) { - chomp; - my @array = split (/\t/,$_); - $array[9] =~ s/\'/\\\'/g; - my $master=0; - if($array[0] =~ /ENSEMBL/){ - #these might be good for a check - # match GO to Uniprot - # match Uniprot to ENSEMBL - # check ENSEMBL's are the same. - } - elsif($array[0] =~ /RefSeq/){ - if($refseq{$array[1]}){ - XrefParser::BaseParser->add_to_xrefs($refseq{$array[1]},$array[4],'',$array[4],'',$array[6],$source_id,$species_id); - $count++; + if(/$taxon_line/){ + chomp; + my @array = split (/\t/,$_); + $array[9] =~ s/\'/\\\'/g; + my $master=0; + if($array[0] =~ /ENSEMBL/){ + #these might be good for a check + # match GO to Uniprot + # match Uniprot to ENSEMBL + # check ENSEMBL's are the same. 
} - } - elsif($array[0] =~ /UniProt/){ - if($swiss{$array[1]}){ - XrefParser::BaseParser->add_to_xrefs($swiss{$array[1]},$array[4],'',$array[4],'',$array[6],$source_id,$species_id); - $count++; + elsif($array[0] =~ /RefSeq/){ + if($refseq{$array[1]}){ + XrefParser::BaseParser->add_to_xrefs($refseq{$array[1]},$array[4],'',$array[4],'',$array[6],$source_id,$species_id); + $count++; + } + } + elsif($array[0] =~ /UniProt/){ + if($swiss{$array[1]}){ + XrefParser::BaseParser->add_to_xrefs($swiss{$array[1]},$array[4],'',$array[4],'',$array[6],$source_id,$species_id); + $count++; + } + } + else{ + print STDERR "unknown type ".$array[0]."\n"; } - } - else{ - print STDERR "unknown type ".$array[0]."\n"; } } print "\t$count GO dependent xrefs added\n"; diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm index 0d4f0685a9..a2db83927d 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm @@ -17,7 +17,7 @@ use vars qw(@ISA); if (!defined(caller())) { if (scalar(@ARGV) != 1) { - print "\nUsage: RefSeqGPFFParser.pm file.SPC\n\n"; + print "\nUsage: RefSeqGPFFParser.pm file.SPC <source_id>\n\n"; exit(1); } @@ -32,13 +32,16 @@ sub run { my $self = shift if (defined(caller(1))); my $file = shift; my $source_id = shift; + my $species_id = shift; if ($source_id < 1) { $source_id = XrefParser::BaseParser->get_source_id_for_filename(basename($file)); - print "Source id for $file: $source_id\n"; + } + if(!defined($species_id)){ + $species_id = XrefParser::BaseParser->get_species_id_for_filename($file); } - XrefParser::BaseParser->upload_xrefs(create_xrefs($source_id, $file)); + XrefParser::BaseParser->upload_xrefs(create_xrefs($source_id, $file, $species_id)); } @@ -51,7 +54,7 @@ sub run { sub create_xrefs { - my ($source_id, $file) = @_; + my ($source_id, $file, $species_id) = @_; my %name2species_id = 
XrefParser::BaseParser->name2species_id(); @@ -76,10 +79,10 @@ sub create_xrefs { $species =~ s/^\s*//g; $species =~ s/\s+/_/g; $species =~ s/\n//g; - my $species_id = $name2species_id{$species}; - + my $species_id_check = $name2species_id{$species}; + # skip xrefs for species that aren't in the species table - if (defined $species_id) { + if (defined($species_id) and defined($species_id_check) and $species_id == $species_id_check) { my ($acc) = $entry =~ /ACCESSION\s+(\S+)/; my ($ver) = $entry =~ /VERSION\s+(\S+)/; @@ -163,7 +166,10 @@ sub create_xrefs { } push @xrefs, $xref; - } # if defined species + }# if defined species + else{ #### REMOVE after TESTING + print "not correct $species $species_id NE $species_id_check\n"; + } } # while <REFSEQ> diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm index a617465349..aa41c15743 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeqParser.pm @@ -17,7 +17,7 @@ use vars qw(@ISA); if (!defined(caller())) { if (scalar(@ARGV) != 1) { - print "\nUsage: RefSeqParser.pm file.SPC\n\n"; + print "\nUsage: RefSeqParser.pm file.SPC <source_id> <species_id>\n\n"; exit(1); } @@ -32,13 +32,16 @@ sub run { my $self = shift if (defined(caller(1))); my $file = shift; my $source_id = shift; + my $species_id = shift; - if ($source_id < 1) { + if (!defined($source_id) or $source_id < 1) { $source_id = XrefParser::BaseParser->get_source_id_for_filename(basename($file)); - print "Source id for $file: $source_id\n"; + } + if(!defined($species_id)){ + $species_id = XrefParser::BaseParser->get_species_id_for_filename($file); } - XrefParser::BaseParser->upload_xrefs(create_xrefs($source_id, $file)); + XrefParser::BaseParser->upload_xrefs(create_xrefs($source_id, $file, $species_id)); } @@ -51,7 +54,7 @@ sub run { sub create_xrefs { - my ($source_id, $file) = @_; + my ($source_id, $file, $species_id) = @_; my %name2species_id = 
XrefParser::BaseParser->name2species_id(); @@ -92,10 +95,10 @@ sub create_xrefs { $species = lc $species; $species =~ s/ /_/; - my $species_id = $name2species_id{$species}; + my $species_id_check = $name2species_id{$species}; # skip xrefs for species that aren't in the species table - if (defined $species_id) { + if (defined($species_id) and defined($species_id_check) and $species_id == $species_id_check) { my ($acc_no_ver,$ver) = split (/\./,$acc); $xref->{ACCESSION} = $acc_no_ver; @@ -111,6 +114,9 @@ sub create_xrefs { push @xrefs, $xref; } + else{ + print "not correct species $species_id NE $species_id_check\n"; + } } -- GitLab