C. elegans specific parsing of RefSeq_dna file

- New xref: to a WormBase CDS feature - Modify WormbaseCElegansRefSeqGPFFParser to serve both kinds of files - extract a utility method from RefSeqGPFFParser - xref_config.ini stanza for wormbase_cds - tests for new functionality

C. elegans specific parsing of RefSeq_dna file
- New xref: to a WormBase CDS feature - Modify WormbaseCElegansRefSeqGPFFParser to serve both kinds of files - extract a utility method from RefSeqGPFFParser - xref_config.ini stanza for wormbase_cds - tests for new functionality
7d6346f7 · Wojtek Bazant · d66449b6 · 7d6346f7 · 7d6346f7 · 7d6346f7
Commit 7d6346f7 authored 6 years ago by Wojtek Bazant
--- a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm
@@ -164,24 +164,8 @@ sub create_xrefs {

  local $/ = "\/\/\n";

-  my $type;
-  if ($file =~ /protein/) {
-
-    $type = 'peptide';
-
-  } elsif ($file =~ /rna/) {
-
-    $type = 'dna';
-
-  } elsif($file =~ /RefSeq_protein/){
-
-    $type = 'peptide';
-
-  }else{
-    print STDERR "Could not work out sequence type for $file\n";
-    return;
-  }
-
+  my $type = $self->type_from_file($file);
+  return unless $type;

  while ( $_ = $refseq_io->getline() ) {

@@ -206,6 +190,14 @@ sub create_xrefs {
  return \@xrefs;

 }
+sub type_from_file {
+    my ($self, $file) = @_;
+    return 'peptide' if $file =~ /RefSeq_protein/;
+    return 'dna' if $file =~ /rna/;
+    return 'peptide' if $file =~ /protein/;
+    print STDERR "Could not work out sequence type for $file\n";
+    return undef;
+}
 sub xref_from_record {
    my ( $self, $entry, $name2species_id, $taxonomy2species_id,
      $pred_mrna_source_id, $pred_ncrna_source_id,

--- a/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm
+++ b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm
@@ -19,13 +19,14 @@ limitations under the License.
 package XrefParser::WormbaseCElegansBase;

 sub swap_dependency {
-  my ($self, $source_id, $dbi, $xref, @source_ids_skip) = @_;
+  my ($self, $source_ids, $dbi, $xref, @source_ids_skip) = @_;
+  $source_ids = [$source_ids] unless ref $source_ids eq 'ARRAY';

  my @matching_source_id_dependents;
  my @other_dependents;
  for my $dependent_xref (@{$xref->{DEPENDENT_XREFS} || []}){
     my $source_id_here = $dependent_xref->{SOURCE_ID};
-     if($source_id_here eq $source_id
+     if(grep {$_ == $source_id_here } @$source_ids
         and $self->get_xref($dependent_xref->{ACCESSION}, $dependent_xref->{SOURCE_ID}, $xref->{SPECIES_ID})){
         $dependent_xref->{SPECIES_ID} = $xref->{SPECIES_ID};
         push @matching_source_id_dependents, $dependent_xref;
@@ -35,10 +36,22 @@ sub swap_dependency {
         push @other_dependents, $dependent_xref;
     }
  }
-  return map {{%$_, LABEL=>undef, INFO_TYPE => "MISC", DEPENDENT_XREFS => [{
-        %$xref,
-        INFO_TYPE => "DEPENDENT",
-        LINKAGE_SOURCE_ID => $source_id,
-     }, map {{%$_,INFO_TYPE => "DEPENDENT", LINKAGE_SOURCE_ID => $source_id}} @other_dependents]}} @matching_source_id_dependents;
+  my @result;
+  for my $matching_source_id_dependent (@matching_source_id_dependents) {
+    my $source_id = $matching_source_id_dependent->{SOURCE_ID};
+    my $xref_as_dependent_here = {
+      %$xref, INFO_TYPE => "DEPENDENT",
+      LINKAGE_SOURCE_ID => $source_id,
+      DEPENDENT_XREFS => undef,
+    };
+    my @other_dependents_as_dependents_here = map {{%$_,INFO_TYPE => "DEPENDENT", LINKAGE_SOURCE_ID => $source_id}} @other_dependents;
+    push @result, {
+       %$matching_source_id_dependent,
+       LABEL=>undef, INFO_TYPE => "MISC",
+       DEPENDENT_XREFS => [$xref_as_dependent_here, @other_dependents_as_dependents_here] 
+    };
+  }
+
+  return @result;
 }
 1;
--- a/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm
@@ -17,33 +17,51 @@ limitations under the License.
 =cut

 package XrefParser::WormbaseCElegansRefSeqGPFFParser;
-
-use parent XrefParser::WormbaseCElegansBase, XrefParser::RefSeqGPFFParser;
-
-my $source_id;
-
+use strict;
+use parent qw/XrefParser::WormbaseCElegansBase XrefParser::RefSeqGPFFParser/;
+my $SOURCE_IDS;
+my $PATTERN;
+sub run {
+  my ($self, $arg_ref) = @_;
+  my $type = $self->type_from_file(@{$arg_ref->{files}});
+  if($type eq 'peptide'){
+    $SOURCE_IDS = [ $self->get_source_id_for_source_name('protein_id') ];
+    $PATTERN =  qr/This record has been curated by WormBase. The\s+reference sequence is identical to (.*?)\./;
+  } elsif ($type eq 'dna'){
+    $SOURCE_IDS = [
+       $self->get_source_id_for_source_name('wormbase_cds'),
+       $self->get_source_id_for_source_name('wormbase_transcript'),
+    ];
+    $PATTERN = qr/standard_name="(.*?)"/;
+  }
+  die %$arg_ref unless @$SOURCE_IDS;
+  return $self->SUPER::run($arg_ref);
+}
 sub upload_xref_object_graphs {
  my ($self, $xrefs, $dbi) = @_;
-  $source_id //= $self->get_source_id_for_source_name('protein_id'); 
  my @adapted_xrefs;
  for my $xref ( @$xrefs) {
-    push @adapted_xrefs, $self->swap_dependency($source_id, $dbi, $xref);
+    push @adapted_xrefs, $self->swap_dependency($SOURCE_IDS, $dbi, $xref, @$SOURCE_IDS);
  }  
  return $self->SUPER::upload_xref_object_graphs(\@adapted_xrefs, $dbi);
 }
+
 sub xref_from_record {
   my ($self, $entry, @args) = @_;
-   
-   my $xref = $self->SUPER::xref_from_record($entry, @args);
-   $source_id //= $self->get_source_id_for_source_name('protein_id'); 
-   $entry =~ /This record has been curated by WormBase. The\s+reference sequence is identical to (.*?)\./;
-   my $insdc_protein_id = $1;
-   if($insdc_protein_id) {
-     $xref->{DEPENDENT_XREFS} //= [];
-     push @{$xref->{DEPENDENT_XREFS}}, {ACCESSION => $insdc_protein_id, SOURCE_ID=>$source_id};
-     return $xref;
-   } else {
-     return undef;
-   }
+   return &modify_xref_with_dependent(
+      $SOURCE_IDS, $entry,
+      $self->SUPER::xref_from_record($entry, @args),
+      $PATTERN,
+   );
+}
+
+sub modify_xref_with_dependent {
+   my ($source_ids, $entry, $xref, $pattern) = @_;
+   return unless $xref;
+   return unless $entry =~ $pattern;
+   return unless $1;
+   $xref->{DEPENDENT_XREFS} //= [];
+   push @{$xref->{DEPENDENT_XREFS}}, map {{ACCESSION => $1, SOURCE_ID=>$_}} @$source_ids;
+   return $xref;
 }
 1;
--- a/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm
+++ b/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm
@@ -40,7 +40,7 @@ sub run {
  }

  my $file = @{$files}[0];
-  my @fields = qw/wormbase_gene wormbase_gseqname wormbase_locus wormbase_transcript wormpep_id protein_id/;
+  my @fields = qw/wormbase_gene wormbase_gseqname wormbase_locus wormbase_transcript wormbase_cds wormpep_id protein_id/;
  my %src_ids;
  my $sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=? AND species_id=$species_id");
  for my $field (@fields){
@@ -65,6 +65,10 @@ sub run {
        $sth, $species_id, "transcript", $src_ids{wormbase_transcript}, 
        $transcript->{transcript_id}, $transcript->{transcript_id}
      );
+      $self->add_xref_and_direct_xref(
+        $sth, $species_id, "transcript", $src_ids{wormbase_cds},
+        $transcript->{wormbase_cds}, $transcript->{wormbase_cds}, $transcript->{transcript_id}
+      );
      $self->add_xref_and_direct_xref(
        $sth, $species_id, "translation", $src_ids{wormpep_id}, 
        $transcript->{wormpep_id}, $transcript->{wormpep_id}, $transcript->{transcript_id}
@@ -90,11 +94,13 @@ sub get_data {
  while ( $_ = $pep_io->getline() ) {
    next if /^\/\//;
    my ($gseqid, $wbgeneid, $locus, $wbtranscript, $wormpep, $insdc_parent, $insdc_locus_tag, $protein_id, $uniprot_id) = split(/\t/, $_);
+
    $data->{$wbgeneid}->{transcripts} //=[];
    push @{$data->{$wbgeneid}->{transcripts}}, {
      transcript_id => $wbtranscript,
-      ($wormpep eq '.' ? () : (wormpep_id => $wormpep)),
-      ($protein_id eq '.' ? () : (protein_id => $protein_id)),
+      ($wormpep ne '.' && $wbtranscript =~ /^(.*?)(\.\d+)?$/ ? (wormbase_cds => $1 ) : ()),
+      ($wormpep ne '.' ? (wormpep_id => $wormpep) : ()),
+      ($protein_id ne '.' ? (protein_id => $protein_id) : ()),
    };
    $data->{$wbgeneid}->{wormbase_gseqname} = $gseqid;
    $data->{$wbgeneid}->{wormbase_locus} =  $locus if $locus ne '.'; 

--- a/misc-scripts/xref_mapping/xref_config.ini
+++ b/misc-scripts/xref_mapping/xref_config.ini
@@ -1652,6 +1652,14 @@ order           = 50
 priority        = 1
 parser          = comes from WormbaseDirectParser

+[source wormbase_cds::wormbase]
+# Used by wormbase core species
+name            = wormbase_cds
+download        = N
+order           = 50
+priority        = 1
+parser          = comes from WormbaseDirectParser
+
 [source Gramene_Pathway::arabidopsis_thaliana]
 # Used by Arabidopsis thaliana, Gramene-specific
 name            = Gramene_Pathway

--- a/modules/t/xref_parser.t
+++ b/modules/t/xref_parser.t
@@ -69,7 +69,7 @@ sub test_parser {
  my ($parser, $content, $expected, $test_name, %opts) = @_;
  require_ok($parser);
  $parser->new($database)->run({
-   files => [store_in_temporary_file($content, %opts)],
+   files =>[store_in_temporary_file($content, %opts)],
   source_id => "Source id (unused but sometimes required)",
   species_id => $SPECIES_ID,
   species => $SPECIES_NAME,
@@ -104,17 +104,18 @@ my $wormbase_celegans_xrefs_head= <<EOF;
 // Missing or not applicable data (e.g. protein identifiers for non-coding RNAs) is denoted by a "."
 //
 2L52.1	WBGene00007063	.	2L52.1b	CE50569	BX284602	CELE_2L52.1	CTQ86426	A0A0K3AWR5
-2L52.1	WBGene00007063	.	2L52.1a	CE32090	BX284602	CELE_2L52.1	CCD61130	A4F336
+2L52.1	WBGene00007063	.	2L52.1a.1	CE32090	BX284602	CELE_2L52.1	CCD61130	A4F336
+2L52.1	WBGene00007063	.	2L52.1a.2	CE32090	BX284602	CELE_2L52.1	CCD61130	A4F336
 2L52.2	WBGene00200402	.	2L52.2	.	BX284602	CELE_2L52.2	.	.
 EOF
 my $wormbase_celegans_xrefs_expected_count = {
-xref=>11,
+xref=>14,
 gene_direct_xref => 4,
-transcript_direct_xref => 3,
-translation_direct_xref => 4,
+transcript_direct_xref => 7,
+translation_direct_xref => 6,
 };
 test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,  
-   $wormbase_celegans_xrefs_expected_count, "Direct xrefs: genes: columns 1,2,3, transcripts: column 4, translations: column 5 and 7. xrefs: sum of these "
+   $wormbase_celegans_xrefs_expected_count, "Direct xrefs: genes: columns 1,2,3, transcripts: column 4 once as transcript and also once as CDS for coding genes, translations: column 5 and 7. xrefs: sum of these minus one reused CDS and two reused proteins"
 );

 my $uniprot_elegans_record = <<EOF;
@@ -185,32 +186,47 @@ for my $l (@recognised_sources) {
    primary_xref => 1,
    dependent_xref => 3,
  }, "Pick up as extra xref + dependent xref: $l" );
-} 
-test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record,  {
-}, "No UniProt entries without corresponding INSDC entries");
-
-test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,  
-  $wormbase_celegans_xrefs_expected_count, "Test again to set up the next test",
-skip_clean => 1);
+}
+sub test_elegans_uniprot {
+  my ($expected_count, $extra_line) = @_;
+  my $uniprot_elegans_record_here = $uniprot_elegans_record;
+  $uniprot_elegans_record_here =~ s/DR(.*?)\n/DR$1\nDR  $extra_line/ if $extra_line;
+  test_parser("XrefParser::WormbaseCElegansUniProtParser", 
+      $uniprot_elegans_record_here,
+     {}, 
+     "No UniProt entries without corresponding INSDC entries $extra_line"
+  );
+  test_parser("XrefParser::WormbaseDirectParser",
+    $wormbase_celegans_xrefs_head,
+    $wormbase_celegans_xrefs_expected_count,
+    "Test again to set up the next test",
+    skip_clean => 1,
+  );
+  test_parser("XrefParser::WormbaseCElegansUniProtParser",
+    $uniprot_elegans_record_here,
+    $expected_count,
+    "Correct xrefs and dependent xrefs $extra_line",
+  );
+}
 my $wormbase_and_uniprot_expected_count = {
  %$wormbase_celegans_xrefs_expected_count,
-  xref => $wormbase_celegans_xrefs_expected_count->{xref}+1, 
-  dependent_xref => 1 #protein id still there, no parent sequence ID 
+  xref => $wormbase_celegans_xrefs_expected_count->{xref}+1,
+  dependent_xref => 1 #protein id still there, no parent sequence ID
 };
-test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record, 
-  $wormbase_and_uniprot_expected_count, "Get counts");
-
+test_elegans_uniprot($wormbase_and_uniprot_expected_count, "");
 for my $l (@recognised_sources) {
-  (my $uniprot_elegans_record_extra_line = $uniprot_elegans_record) =~ s/DR(.*?)\n/DR$1\nDR  $l/;
-  test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,  
-    $wormbase_celegans_xrefs_expected_count, "Test again to set up the next test",
-  skip_clean => 1);
-  test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record_extra_line,  {
+  test_elegans_uniprot({
    %$wormbase_and_uniprot_expected_count,
    xref => $wormbase_and_uniprot_expected_count->{xref}+1,
    dependent_xref => $wormbase_and_uniprot_expected_count->{dependent_xref}+1,
-  }, "Pick up as extra xref + dependent xref: $l"  );
+  },$l);
 }
+test_elegans_uniprot({
+  %$wormbase_and_uniprot_expected_count,
+  dependent_xref => $wormbase_and_uniprot_expected_count->{dependent_xref}+1,
+  }, "EMBL; BX284602; CCD61130.1; -; Genomic_DNA",
+);
+
 my $refseq_protein_elegans_record = <<EOF;
 LOCUS       NP_493629                427 aa            linear   INV 19-AUG-2018
 DEFINITION  Uncharacterized protein CELE_2L52.1 [Caenorhabditis elegans].
@@ -261,21 +277,91 @@ ORIGIN
      421 fdveigy
 //
 EOF
-test_parser("XrefParser::RefSeqGPFFParser",$refseq_protein_elegans_record, {
-  xref =>1,
-  primary_xref => 1,
-}, "Example RefSeq protein record" , tmp_file_name => "something_that_says_protein");
-test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$refseq_protein_elegans_record, {
-}, "No entries without WormBase records" , tmp_file_name => "something_that_says_protein");

-test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,  
-    $wormbase_celegans_xrefs_expected_count, "Test again to set up the next test",
-skip_clean => 1);
-test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$refseq_protein_elegans_record,  {
-  %$wormbase_celegans_xrefs_expected_count,
-  xref => $wormbase_celegans_xrefs_expected_count->{xref}+1,
-  dependent_xref => 1,
-}, "RefSeq entries hang off INSDC entries", tmp_file_name => "something_that_says_protein");
+my $refseq_mrna_elegans_record = <<EOF;
+LOCUS       NM_001313558             663 bp    mRNA    linear   INV 19-AUG-2018
+DEFINITION  Caenorhabditis elegans Uncharacterized protein (2L52.1), partial
+            mRNA.
+ACCESSION   NM_001313558
+VERSION     NM_001313558.1
+DBLINK      BioProject: PRJNA158
+            BioSample: SAMEA3138177
+KEYWORDS    RefSeq.
+SOURCE      Caenorhabditis elegans
+  ORGANISM  Caenorhabditis elegans
+            Eukaryota; Metazoa; Ecdysozoa; Nematoda; Chromadorea; Rhabditida;
+            Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis.
+REFERENCE   1  (bases 1 to 663)
+  <snipped>
+COMMENT     REVIEWED REFSEQ: This record has been curated by WormBase. This
+            record is derived from an annotated genomic sequence (NC_003280).
+            COMPLETENESS: incomplete on both ends.
+FEATURES             Location/Qualifiers
+     source          1..663
+                     /organism="Caenorhabditis elegans"
+                     /mol_type="mRNA"
+                     /strain="Bristol N2"
+                     /db_xref="taxon:$SPECIES_ID"
+                     /chromosome="II"
+     gene            <1..>663
+                     /gene="2L52.1"
+                     /locus_tag="CELE_2L52.1"
+                     /db_xref="GeneID:181792"
+                     /db_xref="WormBase:WBGene00007063"
+     CDS             1..663
+                     /gene="2L52.1"
+                     /locus_tag="CELE_2L52.1"
+                     /standard_name="2L52.1a"
+                     /note="Confirmed by transcript evidence"
+                     /codon_start=1
+                     /product="hypothetical protein"
+                     /protein_id="NP_001300487.1"
+                     /db_xref="EnsemblGenomes-Gn:WBGene00007063"
+                     /db_xref="EnsemblGenomes-Tr:2L52.1b"
+                     /db_xref="GeneID:181792"
+                     /db_xref="UniProtKB/TrEMBL:A0A0K3AWR5"
+                     /db_xref="WormBase:WBGene00007063"
+                     /translation="MSDNEEVYVNFRGMNCISTGKSASMVPSKRRNWPKRVKKRLSTQ
+                     RNNQKTIRPPELNKNNIEIKDMNSNNLEERNREECIQPVSVEKNILHFEKFKSNQICI
+                     VRENNKFREGTRRRRKNSGESEDLKIHENFTEKRRPIRSCKQNISFYEMDGDIEEFEV
+                     FFDTPTKSKKVLLDIYSAKKMPKIEVEDSLVNKFHSKRPSRACRVLGSMEEVPFDVEI
+                     GY"
+ORIGIN      
+        1 atgtcagata atgaagaagt atatgtgaac ttccgtggaa tgaactgtat ctcaacagga
+       61 aagtcggcca gtatggtccc gagcaaacga agaaattggc caaaaagagt gaagaaaagg
+      121 ctatcgacac aaagaaacaa tcagaaaact attcgaccac cagagctgaa taaaaataat
+      181 atagagataa aagatatgaa ctcaaataac cttgaagaac gcaacagaga agaatgcatt
+      241 cagcctgttt ctgttgaaaa gaacatcctg cattttgaaa aattcaaatc aaatcaaatt
+      301 tgcattgttc gggaaaacaa taaatttaga gaaggaacga gaagacgcag aaagaattct
+      361 ggtgaatcgg aagacttgaa aattcatgaa aactttactg aaaaacgaag acccattcga
+      421 tcatgcaaac aaaatataag tttctatgaa atggacgggg atatagaaga atttgaagtg
+      481 tttttcgata ctcccacaaa aagcaaaaaa gtacttctgg atatctacag tgcgaagaaa
+      541 atgccaaaaa ttgaggttga agattcatta gttaataagt ttcattcaaa acgtccatca
+      601 agagcatgtc gagttcttgg aagtatggaa gaagtaccat ttgatgtgga aataggatat
+      661 tga
+//
+EOF
+sub test_refseq {
+  my ($record, $type) = @_;
+  my $file = "something_that_says_$type";
+  test_parser("XrefParser::RefSeqGPFFParser",$record, {
+    xref =>1,
+    primary_xref => 1,
+  }, "$type: Example record" , tmp_file_name => $file);
+  test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$record, {
+  }, "$type no entries without WormBase records" , tmp_file_name => $file);
+
+  test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,  
+      $wormbase_celegans_xrefs_expected_count, "$type test again to set up the next test",
+  skip_clean => 1);
+  test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$record,  {
+    %$wormbase_celegans_xrefs_expected_count,
+    xref => $wormbase_celegans_xrefs_expected_count->{xref}+1,
+    dependent_xref => 1,
+  }, "$type RefSeq entries hang off INSDC entries", tmp_file_name => $file);
+}
+test_refseq($refseq_protein_elegans_record, "protein");
+test_refseq($refseq_mrna_elegans_record, "mrna");
 done_testing();