From d66449b655140a7324e342ef1f0f4baec2f23a91 Mon Sep 17 00:00:00 2001
From: Wojtek Bazant <wojtek.bazant@sanger.ac.uk>
Date: Fri, 7 Sep 2018 09:00:19 +0100
Subject: [PATCH] C. elegans references use WormBase mapping to INSDC protein
 ids

- maintain naming convention: WormBase specific stuff says Wormbase at the front
- rewrite WormBaseDirectParser
- WormBaseDirectParser populates protein_ids
- superclass method to make the dependent protein_ids the parent
- tap into UniProtParser
  + also skip EMBL scaffold ids (we can't reliably assign them)
- tap into RefSeqGPFFParser
  + extract a method
- tests for new stuff
  + add %args to parametrise test_parser

Benefits for RefSeqGPFFParser:
RefSeq proteins have coordinates as part of their identity, so we
can't reliably sequence match them: sequence matching also picks up
all paralogs. This change fixes this spurious mapping.
Benefits for UniProtParser:
Not the above: UniProt entries are not tied to coordinates so all
paralogs map to the same entry. We can handle versioning and updates
a bit better: if WormBase updates an entry and a protein id changes but
UniProt doesn't reflect this yet, with the change we will still pick up
the UniProt entry although we can't sequence match any more.
---
 .../XrefParser/RefSeqGPFFParser.pm            |  51 +++--
 .../XrefParser/WormbaseCElegansBase.pm        |  44 ++++
 .../WormbaseCElegansRefSeqGPFFParser.pm       |  49 +++++
 .../WormbaseCElegansUniProtParser.pm          |  43 ++++
 .../XrefParser/WormbaseDirectParser.pm        | 186 ++++++-----------
 modules/t/xref_parser.t                       | 195 ++++++++++++++++--
 6 files changed, 419 insertions(+), 149 deletions(-)
 create mode 100644 misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm
 create mode 100644 misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm
 create mode 100644 misc-scripts/xref_mapping/XrefParser/WormbaseCElegansUniProtParser.pm

diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm
index 71d793818a..299cb52e22 100644
--- a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm
@@ -185,9 +185,40 @@ sub create_xrefs {
 
   while ( $_ = $refseq_io->getline() ) {
 
-    my $xref;
+    my $xref = $self->xref_from_record(
+      $_,
+      \%name2species_id, \%taxonomy2species_id, 
+      $pred_mrna_source_id, $pred_ncrna_source_id,
+      $mrna_source_id, $ncrna_source_id,
+      $pred_peptide_source_id, $peptide_source_id,
+      $entrez_source_id, $wiki_source_id, $add_dependent_xref_sth,
+      $species_id, $type, \%refseq_ids,\%entrez_ids,\%wiki_ids
+     );
+
+      push @xrefs, $xref if $xref;
 
-    my $entry = $_;
+  } # while <REFSEQ>
+
+  $refseq_io->close();
+
+  print "Read " . scalar(@xrefs) ." xrefs from $file\n" if($verbose);
+
+  return \@xrefs;
+
+}
+sub xref_from_record {
+    my ( $self, $entry, $name2species_id, $taxonomy2species_id,
+      $pred_mrna_source_id, $pred_ncrna_source_id,
+      $mrna_source_id, $ncrna_source_id,
+      $pred_peptide_source_id, $peptide_source_id,
+      $entrez_source_id, $wiki_source_id, $add_dependent_xref_sth,
+      $species_id, $type, $refseq_ids,$entrez_ids,$wiki_ids
+) = @_;
+    my %name2species_id = %$name2species_id;
+    my %taxonomy2species_id = %$taxonomy2species_id;
+    my %refseq_ids = %$refseq_ids;
+    my %entrez_ids = %$entrez_ids;
+    my %wiki_ids = %$wiki_ids;
     chomp $entry;
 
     my ($species) = $entry =~ /\s+ORGANISM\s+(.*)\n/;
@@ -209,6 +240,7 @@ sub create_xrefs {
         && defined $species_id_check
         && $species_id == $species_id_check )
     {
+      my $xref = {};
       my ($acc) = $entry =~ /ACCESSION\s+(\S+)/;
       my ($ver) = $entry =~ /VERSION\s+(\S+)/;
       my ($refseq_pair) = $entry =~ /DBSOURCE\s+REFSEQ: accession (\S+)/;
@@ -328,19 +360,8 @@ sub create_xrefs {
       # Don't add SGD Xrefs, as they are mapped directly from SGD ftp site
 
       # Refseq's do not tell whether the mim is for the gene of morbid so ignore for now.
-
-      push @xrefs, $xref;
-
-    }# if defined species
-
-  } # while <REFSEQ>
-
-  $refseq_io->close();
-
-  print "Read " . scalar(@xrefs) ." xrefs from $file\n" if($verbose);
-
-  return \@xrefs;
-
+      return $xref;
+  }
 }
 
 # --------------------------------------------------------------------------------
diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm
new file mode 100644
index 0000000000..8339c51a6a
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm
@@ -0,0 +1,44 @@
+=head1 LICENSE
+
+Copyright [2018] EMBL-European Bioinformatics Institute
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+=cut
+
+package XrefParser::WormbaseCElegansBase;
+
+# Re-root $xref: each dependent from $source_id for which get_xref() returns
+# true becomes a returned parent carrying a copy of $xref plus the remaining
+# dependents (minus any whose source is in @source_ids_skip). Returns an
+# empty list when no dependent qualifies; $dbi is accepted but unused.
+sub swap_dependency {
+  my ($self, $source_id, $dbi, $xref, @source_ids_skip) = @_;
+  my (@new_parents, @kept_dependents);
+  for my $dependent (@{$xref->{DEPENDENT_XREFS} || []}) {
+    my $dep_source = $dependent->{SOURCE_ID};
+    if ($dep_source eq $source_id
+        and $self->get_xref($dependent->{ACCESSION}, $dep_source, $xref->{SPECIES_ID})) {
+      $dependent->{SPECIES_ID} = $xref->{SPECIES_ID};
+      push @new_parents, $dependent;
+      next;
+    }
+    push @kept_dependents, $dependent unless grep { $_ == $dep_source } @source_ids_skip;
+  }
+  my %as_dependent = (INFO_TYPE => "DEPENDENT", LINKAGE_SOURCE_ID => $source_id);
+  return map { +{
+    %$_, LABEL => undef, INFO_TYPE => "MISC",
+    DEPENDENT_XREFS => [ map { +{ %$_, %as_dependent } } $xref, @kept_dependents ],
+  } } @new_parents;
+}
+1;
diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm
new file mode 100644
index 0000000000..7cae198f10
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm
@@ -0,0 +1,49 @@
+=head1 LICENSE
+
+Copyright [2018] EMBL-European Bioinformatics Institute
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+=cut
+
+package XrefParser::WormbaseCElegansRefSeqGPFFParser;
+
+use parent XrefParser::WormbaseCElegansBase, XrefParser::RefSeqGPFFParser;
+
+my $source_id;
+
+# Re-root every parsed xref under its INSDC protein_id dependent(s) via
+# swap_dependency, then let the parent class upload the re-rooted graphs.
+sub upload_xref_object_graphs {
+  my ($self, $xrefs, $dbi) = @_;
+  # File-scoped cache: the protein_id source id is looked up only once.
+  $source_id //= $self->get_source_id_for_source_name('protein_id');
+  my @reparented = map { $self->swap_dependency($source_id, $dbi, $_) } @$xrefs;
+  return $self->SUPER::upload_xref_object_graphs(\@reparented, $dbi);
+}
+# Like the superclass method, but keeps only records curated by WormBase:
+# the INSDC protein id named in the COMMENT is added as a dependent xref,
+# which upload_xref_object_graphs swaps into the parent position.
+sub xref_from_record {
+   my ($self, $entry, @args) = @_;
+   my $xref = $self->SUPER::xref_from_record($entry, @args);
+   # Superclass rejected the record: don't autovivify a bogus xref below.
+   return undef unless $xref;
+   $source_id //= $self->get_source_id_for_source_name('protein_id');
+   # List assignment yields undef on a failed match instead of a stale $1.
+   my ($insdc_protein_id) = $entry =~ /This record has been curated by WormBase. The\s+reference sequence is identical to (.*?)\./;
+   return undef unless $insdc_protein_id;
+   push @{$xref->{DEPENDENT_XREFS}}, {ACCESSION => $insdc_protein_id, SOURCE_ID=>$source_id};
+   return $xref;
+}
+1;
diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansUniProtParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansUniProtParser.pm
new file mode 100644
index 0000000000..0652bbc6ea
--- /dev/null
+++ b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansUniProtParser.pm
@@ -0,0 +1,43 @@
+=head1 LICENSE
+
+Copyright [2018] EMBL-European Bioinformatics Institute
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+=cut
+
+package XrefParser::WormbaseCElegansUniProtParser;
+
+# UniProt xrefs are sometimes - really - dependent xrefs of INSDC protein
+# entries, which we load from somewhere else.
+# We attempt to find that parent; it has to be present in the xref table already.
+# An INSDC entry and its UniProt entry have the same protein sequence, and
+# UniProt lists the INSDC entry as a parent. Since the INSDC entries are loaded
+# separately, we make the UniProt entries dependent on them.
+# Note:
+# INSDC entries have coordinates, and UniProt entries don't.
+# So for perfect homologs (identical protein sequences), there can be many INSDC entries per UniProt entry.
+
+use parent XrefParser::WormbaseCElegansBase, XrefParser::UniProtParser;
+
+# Hang each UniProt xref off the INSDC protein_id xref(s) it lists; the EMBL
+# source id is passed to swap_dependency as a source to skip.
+sub upload_xref_object_graphs {
+  my ($self, $xrefs, $dbi) = @_;
+  my $protein_id_source = $self->get_source_id_for_source_name('protein_id');
+  my $embl_source       = $self->get_source_id_for_source_name('EMBL');
+  my @reparented =
+    map { $self->swap_dependency($protein_id_source, $dbi, $_, $embl_source) } @$xrefs;
+  return $self->SUPER::upload_xref_object_graphs(\@reparented, $dbi);
+}
+1;
diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm
index 0d570d862f..92ea687e51 100644
--- a/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm
@@ -28,34 +28,56 @@ use XrefParser::BaseParser;
 
 use base qw( XrefParser::BaseParser );
 
-
 sub run {
 
   my ($self, $ref_arg) = @_;
   my $source_id    = $ref_arg->{source_id};
   my $species_id   = $ref_arg->{species_id};
   my $files        = $ref_arg->{files};
-  my $verbose      = $ref_arg->{verbose};
 
   if((!defined $source_id) or (!defined $species_id) or (!defined $files)){
     croak "Need to pass source_id, species_id and files as pairs";
   }
-  $verbose |=0;
 
   my $file = @{$files}[0];
-
-  my $wormbasegene_src_id = $self->get_source_id_for_source_name('wormbase_gene');
-  my $wormbasegseq_src_id = $self->get_source_id_for_source_name('wormbase_gseqname');
-  my $wormbaselocus_src_id = $self->get_source_id_for_source_name('wormbase_locus');
-  my $wormbasetran_src_id = $self->get_source_id_for_source_name('wormbase_transcript');
-  my $wormpep_src_id = $self->get_source_id_for_source_name('wormpep_id');
-
-  my $xref_wgene_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbasegene_src_id AND species_id=$species_id");
-  my $xref_gseq_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbasegseq_src_id AND species_id=$species_id");
-  my $xref_wloc_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbaselocus_src_id AND species_id=$species_id");
-  my $xref_wtran_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbasetran_src_id AND species_id=$species_id");
-  my $xref_wpep_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormpep_src_id AND species_id=$species_id");
-
+  my @fields = qw/wormbase_gene wormbase_gseqname wormbase_locus wormbase_transcript wormpep_id protein_id/;
+  my %src_ids;
+  my $sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=? AND species_id=$species_id");
+  for my $field (@fields){
+    $src_ids{$field} = $self->get_source_id_for_source_name($field);
+  }
+  my $data = $self->get_data(@$files);
+  # get_data hands back 1 rather than a hash ref when the file cannot be
+  # opened: report that as a failure instead of dereferencing it below.
+  return 1 unless ref $data eq 'HASH';
+  for my $gene_id (keys %$data){
+    $self->add_xref_and_direct_xref(
+      $sth, $species_id, "gene", $src_ids{wormbase_gene}, $gene_id, $gene_id
+    );
+    # Sequence name and (optional) locus are labels on the gene id accession.
+    for my $field (qw/wormbase_gseqname wormbase_locus/){
+      $self->add_xref_and_direct_xref(
+        $sth, $species_id, "gene", $src_ids{$field}, $gene_id, $data->{$gene_id}->{$field}
+      );
+    }
+    for my $transcript (@{$data->{$gene_id}->{transcripts}}){
+      $self->add_xref_and_direct_xref(
+        $sth, $species_id, "transcript", $src_ids{wormbase_transcript},
+        $transcript->{transcript_id}, $transcript->{transcript_id}
+      );
+      # Translation xrefs are attached using the transcript id.
+      for my $field (qw/wormpep_id protein_id/){
+        $self->add_xref_and_direct_xref(
+          $sth, $species_id, "translation", $src_ids{$field},
+          $transcript->{$field}, $transcript->{$field}, $transcript->{transcript_id}
+        );
+      }
+    }
+  }
+  return 0;    # 0 = success (1 signals an error)
+}
+sub get_data {
+  my ($self, $file) = @_;
   my $pep_io = $self->get_filehandle($file);
 
   if ( !defined $pep_io ) {
@@ -63,116 +85,38 @@ sub run {
     return 1;    # 1 error
   }
 
-  my ($x_count, $d_count);
-
-  my (%wbgene2seqid, %wbgene2loc, %tran2wbtran, %tran2wpep);
+  my $data = {};
 
   while ( $_ = $pep_io->getline() ) {
     next if /^\/\//;
-    
-    my ($gseqid, $wbgeneid, $locus, $wbtranscript, $wormpep) = split(/\t/, $_);
-
-    # Each WBGeneid should have only one sequence name and (optionally) one locus name
-    $wbgene2seqid{$wbgeneid} = $gseqid;
-    $wbgene2loc{$wbgeneid} = $locus if $locus ne '.';
-
-    $tran2wbtran{$wbtranscript} = 1;
-    $tran2wpep{$wbtranscript} = $wormpep if $wormpep ne '.';
-
+    my ($gseqid, $wbgeneid, $locus, $wbtranscript, $wormpep, $insdc_parent, $insdc_locus_tag, $protein_id, $uniprot_id) = split(/\t/, $_);
+    $data->{$wbgeneid}->{transcripts} //=[];
+    push @{$data->{$wbgeneid}->{transcripts}}, {
+      transcript_id => $wbtranscript,
+      ($wormpep eq '.' ? () : (wormpep_id => $wormpep)),
+      ($protein_id eq '.' ? () : (protein_id => $protein_id)),
+    };
+    $data->{$wbgeneid}->{wormbase_gseqname} = $gseqid;
+    $data->{$wbgeneid}->{wormbase_locus} =  $locus if $locus ne '.'; 
   }
   $pep_io->close();
-
-  foreach my $wbgid (keys %wbgene2seqid) {
-    # reuse or create xref
-    $xref_wgene_sth->execute($wbgid);
-    my $xref_id = ($xref_wgene_sth->fetchrow_array())[0];
-    if (!$xref_id) {
-      $xref_id = $self->add_xref({ acc        => $wbgid,
-                                   label      => $wbgid,
-                                   source_id  => $wormbasegene_src_id,
-                                   species_id => $species_id,
-                                   info_type  => "DIRECT"} );
-      $x_count++;
-    }
-    $self->add_direct_xref($xref_id, $wbgid, "gene", "");
-    $d_count++;
-    
-    my $gseqname = $wbgene2seqid{$wbgid};
-
-    $xref_gseq_sth->execute($wbgid);
-    $xref_id = ($xref_gseq_sth->fetchrow_array())[0];
-    if (not $xref_id) {
-      $xref_id = $self->add_xref({ acc        => $wbgid,
-                                   label      => $gseqname,
-                                   source_id  => $wormbasegseq_src_id,
-                                   species_id => $species_id,
-                                   info_type  => "DIRECT"} );
-      $x_count++;
-    }
-    $self->add_direct_xref($xref_id, $wbgid, "gene", "");
-    $d_count++;
-
-
-    if (exists $wbgene2loc{$wbgid}) {
-      my $loc_sym = $wbgene2loc{$wbgid};
-
-      $xref_wloc_sth->execute($wbgid);    
-      $xref_id = ($xref_wloc_sth->fetchrow_array())[0];
-      if (!$xref_id) {
-        $xref_id = $self->add_xref({ acc        => $wbgid,
-                                     label      => $loc_sym,
-                                     source_id  => $wormbaselocus_src_id,
-                                     species_id => $species_id,
-                                     info_type  => "DIRECT"} );
-        $x_count++;
-      }
-    }
-    
-    # and direct xref
-    $self->add_direct_xref($xref_id, $wbgid, "gene", "");
-    $d_count++;
-  }
-  
-
-  foreach my $tid (keys %tran2wbtran) {
-    $xref_wtran_sth->execute($tid);      
-    my $xref_id = ($xref_wtran_sth->fetchrow_array())[0];
-    if (!$xref_id) {
-      $xref_id = $self->add_xref({ acc        => $tid,
-                                   label      => $tid,
-                                   source_id  => $wormbasetran_src_id,
-                                   species_id => $species_id,
-                                   info_type  => "DIRECT"} );
-      $x_count++;
-    }
-    
-    # and direct xref
-    $self->add_direct_xref($xref_id, $tid, "transcript", "");
-    $d_count++;
-  }
-
-  foreach my $tid (keys %tran2wpep) {
-    my $wpep = $tran2wpep{$tid};
-
-    $xref_wpep_sth->execute($wpep);
-      
-    my $xref_id = ($xref_wpep_sth->fetchrow_array())[0];
-    if (!$xref_id) {
-      $xref_id = $self->add_xref({ acc        => $wpep,
-                                   label      => $wpep,
-                                   source_id  => $wormpep_src_id,
-                                   species_id => $species_id,
-                                   info_type  => "DIRECT"} );
-      $x_count++;
-    }
-
-    # and direct xref
-    $self->add_direct_xref($xref_id, $tid, "translation", "");
-    $d_count++;
-  }
-
-  print "Added $d_count direct xrefs and $x_count xrefs\n" if($verbose);
-  return 0;
+  return $data;
 }
 
+# Reuse or create the xref ($object_id as accession, $label as label) and link
+# it as a direct xref to $primary_id, which defaults to $object_id.
+# Does nothing when $label is empty.
+sub add_xref_and_direct_xref {
+  my ($self, $sth, $species_id, $object_type, $source_id,  $object_id, $label, $primary_id) = @_;
+  $primary_id //= $object_id;
+  return unless $label;
+  # The xref row is keyed by accession, so look it up by $object_id.
+  $sth->execute($object_id, $source_id);
+  my ($xref_id) = $sth->fetchrow_array();
+  $xref_id ||= $self->add_xref({
+    acc => $object_id, label => $label, source_id => $source_id,
+    species_id => $species_id, info_type => "DIRECT",
+  });
+  $self->add_direct_xref($xref_id, $primary_id, $object_type, "");
+}
 1;
diff --git a/modules/t/xref_parser.t b/modules/t/xref_parser.t
index 2c77b54ea7..95ad36a127 100644
--- a/modules/t/xref_parser.t
+++ b/modules/t/xref_parser.t
@@ -55,32 +55,37 @@ my %xref_tables_expected_empty_by_default = (
 );
 my $tmp_dir = tempdir(CLEANUP=>1);
 sub store_in_temporary_file {
-  my $path = "$tmp_dir/tmp";
+  my ($content, %opts) = @_;
+  my $path = join("/", $tmp_dir, $opts{tmp_file_name} || "tmp");
   open(my $fh, ">", $path) or die $path;  
-  print $fh @_;
+  print $fh $content;
   close($fh);
   return $path;
 }
+# Happens to match the species id of the core database
+my $SPECIES_ID = 1;
+my $SPECIES_NAME = "Homo sapiens";
 sub test_parser {
-  my ($parser, $content, $source_id, $expected, $test_name) = @_;
+  my ($parser, $content, $expected, $test_name, %opts) = @_;
   require_ok($parser);
   $parser->new($database)->run({
-   files => [store_in_temporary_file($content)],
-   source_id => $source_id,
-   species_id => 1 #Happens to be right, but doesn't matter anyway - we are not testing the mapping
+   files => [store_in_temporary_file($content, %opts)],
+   source_id => "Source id (unused but sometimes required)",
+   species_id => $SPECIES_ID,
+   species => $SPECIES_NAME,
   });
   my $expected_table_counts = {%xref_tables_expected_empty_by_default, %$expected};
   subtest "$parser $test_name" => sub {
     plan tests => scalar(keys %$expected_table_counts);
     for my $table (keys %$expected_table_counts){
       my $actual_count = count_rows($dba, $table);
-      $dba->dbc->prepare("delete from $table;")->execute() if $actual_count;
+      $dba->dbc->prepare("delete from $table;")->execute() if ($actual_count and not $opts{skip_clean});
       my $expected_count = $expected_table_counts->{$table};
       is($actual_count, $expected_count, "$table has $expected_count rows") or diag "$table has $actual_count rows";
     }
   }
 }
-test_parser("XrefParser::WormbaseDirectParser", "", "source_id (unused)", {}, "null case");
+test_parser("XrefParser::WormbaseDirectParser", "",  {}, "null case");
 my $wormbase_celegans_xrefs_head= <<EOF;
 //
 // WormBase Caenorhabditis elegans XREFs for WS265
@@ -102,11 +107,175 @@ my $wormbase_celegans_xrefs_head= <<EOF;
 2L52.1	WBGene00007063	.	2L52.1a	CE32090	BX284602	CELE_2L52.1	CCD61130	A4F336
 2L52.2	WBGene00200402	.	2L52.2	.	BX284602	CELE_2L52.2	.	.
 EOF
-test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head, "source_id (unused)", {
-xref=>9,
-gene_direct_xref => 6,
+my $wormbase_celegans_xrefs_expected_count = {
+xref=>11,
+gene_direct_xref => 4,
 transcript_direct_xref => 3,
-translation_direct_xref => 2,
-}, "Direct xrefs: genes: count currently off due to some questionable duplicates, transcripts: as in column 4, translations: as in column 5. At least one direct xref per xref (but should be one to one)");
+translation_direct_xref => 4,
+};
+# Columns are 1-based: the wormpep id is column 5, the INSDC protein id is column 8.
+test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head, $wormbase_celegans_xrefs_expected_count,
+  "Direct xrefs: genes: columns 1,2,3, transcripts: column 4, translations: columns 5 and 8. xrefs: sum of these");
+
+my $uniprot_elegans_record = <<EOF;
+ID   A0A0K3AWR5_CAEEL        Unreviewed;       220 AA.
+AC   A0A0K3AWR5;
+DT   11-NOV-2015, integrated into UniProtKB/TrEMBL.
+DT   11-NOV-2015, sequence version 1.
+DT   18-JUL-2018, entry version 14.
+DE   SubName: Full=Uncharacterized protein {ECO:0000313|EMBL:CTQ86426.1};
+GN   ORFNames=2L52.1 {ECO:0000313|EMBL:CTQ86426.1,
+GN   ECO:0000313|WormBase:2L52.1b},
+GN   CELE_2L52.1 {ECO:0000313|EMBL:CTQ86426.1};
+OS   $SPECIES_NAME.
+OC   Eukaryota; Metazoa; Ecdysozoa; Nematoda; Chromadorea; Rhabditida;
+OC   Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis.
+OX   NCBI_TaxID=$SPECIES_ID {ECO:0000313|EMBL:CTQ86426.1, ECO:0000313|Proteomes:UP000001940};
+RN   [1] {ECO:0000313|EMBL:CTQ86426.1, ECO:0000313|Proteomes:UP000001940}
+RP   NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].
+RC   STRAIN=Bristol N2 {ECO:0000313|EMBL:CTQ86426.1,
+RC   ECO:0000313|Proteomes:UP000001940};
+RX   PubMed=9851916; DOI=https://doi.org/10.1126/science.282.5396.2012;
+RG   The C. elegans sequencing consortium;
+RA   Sulson J.E., Waterston R.;
+RT   "Genome sequence of the nematode C. elegans: a platform for
+RT   investigating biology.";
+RL   Science 282:2012-2018(1998).
+CC   -----------------------------------------------------------------------
+CC   Copyrighted by the UniProt Consortium, see https://www.uniprot.org/terms
+CC   Distributed under the Creative Commons Attribution (CC BY 4.0) License
+CC   -----------------------------------------------------------------------
+DR   EMBL; BX284602; CTQ86426.1; -; Genomic_DNA.
+DR   RefSeq; NP_001300487.1; NM_001313558.1.
+DR   UniGene; Cel.25279; -.
+DR   EnsemblMetazoa; 2L52.1b; 2L52.1b; WBGene00007063.
+DR   GeneID; 181792; -.
+DR   CTD; 181792; -.
+DR   WormBase; 2L52.1b; CE50569; WBGene00007063; -.
+DR   Proteomes; UP000001940; Chromosome II.
+DR   ExpressionAtlas; A0A0K3AWR5; baseline and differential.
+PE   4: Predicted;
+KW   Complete proteome {ECO:0000313|Proteomes:UP000001940};
+KW   Reference proteome {ECO:0000313|Proteomes:UP000001940}.
+SQ   SEQUENCE   220 AA;  26028 MW;  E12D5EA7F6FFF373 CRC64;
+     MSDNEEVYVN FRGMNCISTG KSASMVPSKR RNWPKRVKKR LSTQRNNQKT IRPPELNKNN
+     IEIKDMNSNN LEERNREECI QPVSVEKNIL HFEKFKSNQI CIVRENNKFR EGTRRRRKNS
+     GESEDLKIHE NFTEKRRPIR SCKQNISFYE MDGDIEEFEV FFDTPTKSKK VLLDIYSAKK
+     MPKIEVEDSL VNKFHSKRPS RACRVLGSME EVPFDVEIGY
+//
+EOF
+test_parser("XrefParser::UniProtParser", $uniprot_elegans_record,  {
+  xref => 3,
+  primary_xref => 1,
+  dependent_xref => 2,
+},"Example UniProt record"); 
+(my $uniprot_elegans_record_embl = $uniprot_elegans_record) =~ s/DR   EMBL;.*?\n//;
+test_parser("XrefParser::UniProtParser", $uniprot_elegans_record_embl,  {
+  xref => 1,
+  primary_xref => 1,
+},"EMBL entries are the dependent xrefs");
+my @recognised_sources = (
+ "PDB; 3HRI; X-ray; 2.85 A; A/B/C/D/E/F=44-477.",
+ "MEROPS; C26.956; -.",
+);
+for my $l (@recognised_sources) {
+  (my $uniprot_elegans_record_extra_line = $uniprot_elegans_record) =~ s/DR(.*?)\n/DR$1\nDR  $l/;
+  test_parser("XrefParser::UniProtParser", $uniprot_elegans_record_extra_line,  {
+    xref => 4,
+    primary_xref => 1,
+    dependent_xref => 3,
+  }, "Pick up as extra xref + dependent xref: $l" );
+} 
+test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record,  {
+}, "No UniProt entries without corresponding INSDC entries");
+
+test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,  
+  $wormbase_celegans_xrefs_expected_count, "Test again to set up the next test",
+skip_clean => 1);
+my $wormbase_and_uniprot_expected_count = {
+  %$wormbase_celegans_xrefs_expected_count,
+  xref => $wormbase_celegans_xrefs_expected_count->{xref}+1, 
+  dependent_xref => 1 #protein id still there, no parent sequence ID 
+};
+test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record, 
+  $wormbase_and_uniprot_expected_count, "Get counts");
+
+for my $l (@recognised_sources) {
+  (my $uniprot_elegans_record_extra_line = $uniprot_elegans_record) =~ s/DR(.*?)\n/DR$1\nDR  $l/;
+  test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,  
+    $wormbase_celegans_xrefs_expected_count, "Test again to set up the next test",
+  skip_clean => 1);
+  test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record_extra_line,  {
+    %$wormbase_and_uniprot_expected_count,
+    xref => $wormbase_and_uniprot_expected_count->{xref}+1,
+    dependent_xref => $wormbase_and_uniprot_expected_count->{dependent_xref}+1,
+  }, "Pick up as extra xref + dependent xref: $l"  );
+}
+my $refseq_protein_elegans_record = <<EOF;
+LOCUS       NP_493629                427 aa            linear   INV 19-AUG-2018
+DEFINITION  Uncharacterized protein CELE_2L52.1 [Caenorhabditis elegans].
+ACCESSION   NP_493629
+VERSION     NP_493629.2
+DBLINK      BioProject: PRJNA158
+            BioSample: SAMEA3138177
+DBSOURCE    REFSEQ: accession NM_061228.2
+KEYWORDS    RefSeq.
+SOURCE      Caenorhabditis elegans
+  ORGANISM  Caenorhabditis elegans
+            Eukaryota; Metazoa; Ecdysozoa; Nematoda; Chromadorea; Rhabditida;
+            Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis.
+REFERENCE   
+  <snipped>
+COMMENT     REVIEWED REFSEQ: This record has been curated by WormBase. The
+            reference sequence is identical to CCD61130.
+FEATURES             Location/Qualifiers
+     source          1..427
+                     /organism="Caenorhabditis elegans"
+                     /strain="Bristol N2"
+                     /db_xref="taxon:$SPECIES_ID"
+                     /chromosome="II"
+     Protein         1..427
+                     /product="hypothetical protein"
+                     /calculated_mol_wt=49887
+     CDS             1..427
+                     /gene="2L52.1"
+                     /locus_tag="CELE_2L52.1"
+                     /standard_name="2L52.1a"
+                     /coded_by="NM_061228.2:1..1284"
+                     /note="Confirmed by transcript evidence"
+                     /db_xref="EnsemblGenomes-Gn:WBGene00007063"
+                     /db_xref="EnsemblGenomes-Tr:2L52.1a"
+                     /db_xref="GeneID:181792"
+                     /db_xref="GOA:A4F336"
+                     /db_xref="InterPro:IPR013087"
+                     /db_xref="UniProtKB/TrEMBL:A4F336"
+                     /db_xref="WormBase:WBGene00007063"
+ORIGIN      
+        1 msmvrnvsnq sekleilsck wvgclkstev fktveklldh vtadhipevi vnddgseevv
+       61 cqwdccemga srgnlqkkke wmenhfktrh vrkakifkcl iedcpvvkss sqeiethlri
+      121 shpinpkker lkefksstdh ieptqanrvw tivngevqwk tpprvkkktv iyyddgpryv
+      181 fptgcarcny dsdeselesd efwsatemsd neevyvnfrg mncistgksa smvpskrrnw
+      241 pkrvkkrlst qrnnqktirp pelnknniei kdmnsnnlee rnreeciqpv sveknilhfe
+      301 kfksnqiciv rennkfregt rrrrknsges edlkihenft ekrrpirsck qnisfyemdg
+      361 dieefevffd tptkskkvll diysakkmpk ievedslvnk fhskrpsrac rvlgsmeevp
+      421 fdveigy
+//
+EOF
+test_parser("XrefParser::RefSeqGPFFParser",$refseq_protein_elegans_record, {
+  xref =>1,
+  primary_xref => 1,
+}, "Example RefSeq protein record" , tmp_file_name => "something_that_says_protein");
+test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$refseq_protein_elegans_record, {
+}, "No entries without WormBase records" , tmp_file_name => "something_that_says_protein");
+
+test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,  
+    $wormbase_celegans_xrefs_expected_count, "Test again to set up the next test",
+skip_clean => 1);
+test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$refseq_protein_elegans_record,  {
+  %$wormbase_celegans_xrefs_expected_count,
+  xref => $wormbase_celegans_xrefs_expected_count->{xref}+1,
+  dependent_xref => 1,
+}, "RefSeq entries hang off INSDC entries", tmp_file_name => "something_that_says_protein");
 done_testing();
 
+
-- 
GitLab