diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm index 71d793818afd551b3dbcca71dfef1d304ad0f91f..299cb52e224c5b315f13a3e854f797a2365a5903 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm @@ -185,9 +185,40 @@ sub create_xrefs { while ( $_ = $refseq_io->getline() ) { - my $xref; + my $xref = $self->xref_from_record( + $_, + \%name2species_id, \%taxonomy2species_id, + $pred_mrna_source_id, $pred_ncrna_source_id, + $mrna_source_id, $ncrna_source_id, + $pred_peptide_source_id, $peptide_source_id, + $entrez_source_id, $wiki_source_id, $add_dependent_xref_sth, + $species_id, $type, \%refseq_ids,\%entrez_ids,\%wiki_ids + ); + + push @xrefs, $xref if $xref; - my $entry = $_; + } # while <REFSEQ> + + $refseq_io->close(); + + print "Read " . scalar(@xrefs) ." xrefs from $file\n" if($verbose); + + return \@xrefs; + +} +sub xref_from_record { + my ( $self, $entry, $name2species_id, $taxonomy2species_id, + $pred_mrna_source_id, $pred_ncrna_source_id, + $mrna_source_id, $ncrna_source_id, + $pred_peptide_source_id, $peptide_source_id, + $entrez_source_id, $wiki_source_id, $add_dependent_xref_sth, + $species_id, $type, $refseq_ids,$entrez_ids,$wiki_ids +) = @_; + my %name2species_id = %$name2species_id; + my %taxonomy2species_id = %$taxonomy2species_id; + my %refseq_ids = %$refseq_ids; + my %entrez_ids = %$entrez_ids; + my %wiki_ids = %$wiki_ids; chomp $entry; my ($species) = $entry =~ /\s+ORGANISM\s+(.*)\n/; @@ -209,6 +240,7 @@ sub create_xrefs { && defined $species_id_check && $species_id == $species_id_check ) { + my $xref = {}; my ($acc) = $entry =~ /ACCESSION\s+(\S+)/; my ($ver) = $entry =~ /VERSION\s+(\S+)/; my ($refseq_pair) = $entry =~ /DBSOURCE\s+REFSEQ: accession (\S+)/; @@ -328,19 +360,8 @@ sub create_xrefs { # Don't add SGD Xrefs, as they are mapped directly from SGD ftp site # Refseq's do not tell whether the mim is for the gene of morbid so ignore for now. - - push @xrefs, $xref; - - }# if defined species - - } # while <REFSEQ> - - $refseq_io->close(); - - print "Read " . scalar(@xrefs) ." xrefs from $file\n" if($verbose); - - return \@xrefs; - + return $xref; + } } # -------------------------------------------------------------------------------- diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm new file mode 100644 index 0000000000000000000000000000000000000000..8339c51a6a3467328a5043a58f10f9d1ead71b01 --- /dev/null +++ b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm @@ -0,0 +1,44 @@ +=head1 LICENSE + +Copyright [2018] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=cut + +package XrefParser::WormbaseCElegansBase; + +sub swap_dependency { + my ($self, $source_id, $dbi, $xref, @source_ids_skip) = @_; + + my @matching_source_id_dependents; + my @other_dependents; + for my $dependent_xref (@{$xref->{DEPENDENT_XREFS} || []}){ + my $source_id_here = $dependent_xref->{SOURCE_ID}; + if($source_id_here eq $source_id + and $self->get_xref($dependent_xref->{ACCESSION}, $dependent_xref->{SOURCE_ID}, $xref->{SPECIES_ID})){ + $dependent_xref->{SPECIES_ID} = $xref->{SPECIES_ID}; + push @matching_source_id_dependents, $dependent_xref; + } elsif (grep {$_ == $source_id_here} @source_ids_skip){ + #skip + } else { + push @other_dependents, $dependent_xref; + } + } + return map {{%$_, LABEL=>undef, INFO_TYPE => "MISC", DEPENDENT_XREFS => [{ + %$xref, + INFO_TYPE => "DEPENDENT", + LINKAGE_SOURCE_ID => $source_id, + }, map {{%$_,INFO_TYPE => "DEPENDENT", LINKAGE_SOURCE_ID => $source_id}} @other_dependents]}} @matching_source_id_dependents; +} +1; diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm new file mode 100644 index 0000000000000000000000000000000000000000..7cae198f10f23a557c8df1a2a3739234d532fd0d --- /dev/null +++ b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm @@ -0,0 +1,49 @@ +=head1 LICENSE + +Copyright [2018] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=cut + +package XrefParser::WormbaseCElegansRefSeqGPFFParser; + +use parent XrefParser::WormbaseCElegansBase, XrefParser::RefSeqGPFFParser; + +my $source_id; + +sub upload_xref_object_graphs { + my ($self, $xrefs, $dbi) = @_; + $source_id //= $self->get_source_id_for_source_name('protein_id'); + my @adapted_xrefs; + for my $xref ( @$xrefs) { + push @adapted_xrefs, $self->swap_dependency($source_id, $dbi, $xref); + } + return $self->SUPER::upload_xref_object_graphs(\@adapted_xrefs, $dbi); +} +sub xref_from_record { + my ($self, $entry, @args) = @_; + + my $xref = $self->SUPER::xref_from_record($entry, @args); + $source_id //= $self->get_source_id_for_source_name('protein_id'); + $entry =~ /This record has been curated by WormBase. The\s+reference sequence is identical to (.*?)\./; + my $insdc_protein_id = $1; + if($insdc_protein_id) { + $xref->{DEPENDENT_XREFS} //= []; + push @{$xref->{DEPENDENT_XREFS}}, {ACCESSION => $insdc_protein_id, SOURCE_ID=>$source_id}; + return $xref; + } else { + return undef; + } +} +1; diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansUniProtParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansUniProtParser.pm new file mode 100644 index 0000000000000000000000000000000000000000..0652bbc6ea6aa0d0ee24233871795c5f3b551abd --- /dev/null +++ b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansUniProtParser.pm @@ -0,0 +1,43 @@ +=head1 LICENSE + +Copyright [2018] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=cut + +package XrefParser::WormbaseCElegansUniProtParser; + +# UniProt xrefs are sometimes - really - dependent xrefs of +# INSDC entries which we get from somewhere else +# Attempt to find the parent (has to already be present in the xref table) +# INSDC and UniProt entries have the same protein sequence, and +# UniProt lists INSDC as a parent. We get INSDC entries from somewhere else, +# so make UniProt entries dependent on INSDC entries. +# Note: +# INSDC entries have coordinates, and UniProt entries don't. +# So for perfect homologs, there can be many INSDC entries per UniProt. + +use parent XrefParser::WormbaseCElegansBase, XrefParser::UniProtParser; + +sub upload_xref_object_graphs { + my ($self, $xrefs, $dbi) = @_; + my $source_id = $self->get_source_id_for_source_name('protein_id'); + my $source_id_skip = $self->get_source_id_for_source_name('EMBL'); + my @adapted_xrefs; + for my $xref ( @$xrefs) { + push @adapted_xrefs, $self->swap_dependency($source_id, $dbi, $xref, $source_id_skip); + } + return $self->SUPER::upload_xref_object_graphs(\@adapted_xrefs, $dbi); +} +1; diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm index 0d570d862f638facab43264116f0f2fa0bdc0817..92ea687e517c2d4e92df7f3f0f2b28dc7bbd238d 100644 --- a/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm @@ -28,34 +28,56 @@ use XrefParser::BaseParser; use base qw( XrefParser::BaseParser ); - sub run { my ($self, $ref_arg) = @_; my $source_id = $ref_arg->{source_id}; my $species_id = $ref_arg->{species_id}; my $files = $ref_arg->{files}; - my $verbose = $ref_arg->{verbose}; if((!defined $source_id) or (!defined $species_id) or (!defined $files)){ croak "Need to pass source_id, species_id and files as pairs"; } - $verbose |=0; my $file = @{$files}[0]; - - my $wormbasegene_src_id = $self->get_source_id_for_source_name('wormbase_gene'); - my $wormbasegseq_src_id = $self->get_source_id_for_source_name('wormbase_gseqname'); - my $wormbaselocus_src_id = $self->get_source_id_for_source_name('wormbase_locus'); - my $wormbasetran_src_id = $self->get_source_id_for_source_name('wormbase_transcript'); - my $wormpep_src_id = $self->get_source_id_for_source_name('wormpep_id'); - - my $xref_wgene_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbasegene_src_id AND species_id=$species_id"); - my $xref_gseq_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbasegseq_src_id AND species_id=$species_id"); - my $xref_wloc_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbaselocus_src_id AND species_id=$species_id"); - my $xref_wtran_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbasetran_src_id AND species_id=$species_id"); - my $xref_wpep_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormpep_src_id AND species_id=$species_id"); - + my @fields = qw/wormbase_gene wormbase_gseqname wormbase_locus wormbase_transcript wormpep_id protein_id/; + my %src_ids; + my $sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=? AND species_id=$species_id"); + for my $field (@fields){ + $src_ids{$field} = $self->get_source_id_for_source_name($field); + } + my $data = $self->get_data(@$files); + for my $gene_id (keys %$data){ + $self->add_xref_and_direct_xref( + $sth, $species_id, "gene", $src_ids{wormbase_gene}, + $gene_id, $gene_id + ); + $self->add_xref_and_direct_xref( + $sth, $species_id, "gene", $src_ids{wormbase_gseqname}, + $gene_id, $data->{$gene_id}->{wormbase_gseqname} + ); + $self->add_xref_and_direct_xref( + $sth, $species_id, "gene", $src_ids{wormbase_locus}, + $gene_id, $data->{$gene_id}->{wormbase_locus} + ); + for my $transcript (@{$data->{$gene_id}->{transcripts}}){ + $self->add_xref_and_direct_xref( + $sth, $species_id, "transcript", $src_ids{wormbase_transcript}, + $transcript->{transcript_id}, $transcript->{transcript_id} + ); + $self->add_xref_and_direct_xref( + $sth, $species_id, "translation", $src_ids{wormpep_id}, + $transcript->{wormpep_id}, $transcript->{wormpep_id}, $transcript->{transcript_id} + ); + $self->add_xref_and_direct_xref( + $sth, $species_id, "translation", $src_ids{protein_id}, + $transcript->{protein_id}, $transcript->{protein_id}, $transcript->{transcript_id} + ); + } + } +} +sub get_data { + my ($self, $file) = @_; my $pep_io = $self->get_filehandle($file); if ( !defined $pep_io ) { @@ -63,116 +85,38 @@ sub run { return 1; # 1 error } - my ($x_count, $d_count); - - my (%wbgene2seqid, %wbgene2loc, %tran2wbtran, %tran2wpep); + my $data = {}; while ( $_ = $pep_io->getline() ) { next if /^\/\//; - - my ($gseqid, $wbgeneid, $locus, $wbtranscript, $wormpep) = split(/\t/, $_); - - # Each WBGeneid should have only one sequence name and (optionally) one locus name - $wbgene2seqid{$wbgeneid} = $gseqid; - $wbgene2loc{$wbgeneid} = $locus if $locus ne '.'; - - $tran2wbtran{$wbtranscript} = 1; - $tran2wpep{$wbtranscript} = $wormpep if $wormpep ne '.'; - + my ($gseqid, $wbgeneid, $locus, $wbtranscript, $wormpep, $insdc_parent, $insdc_locus_tag, $protein_id, $uniprot_id) = split(/\t/, $_); + $data->{$wbgeneid}->{transcripts} //=[]; + push @{$data->{$wbgeneid}->{transcripts}}, { + transcript_id => $wbtranscript, + ($wormpep eq '.' ? () : (wormpep_id => $wormpep)), + ($protein_id eq '.' ? () : (protein_id => $protein_id)), + }; + $data->{$wbgeneid}->{wormbase_gseqname} = $gseqid; + $data->{$wbgeneid}->{wormbase_locus} = $locus if $locus ne '.'; } $pep_io->close(); - - foreach my $wbgid (keys %wbgene2seqid) { - # reuse or create xref - $xref_wgene_sth->execute($wbgid); - my $xref_id = ($xref_wgene_sth->fetchrow_array())[0]; - if (!$xref_id) { - $xref_id = $self->add_xref({ acc => $wbgid, - label => $wbgid, - source_id => $wormbasegene_src_id, - species_id => $species_id, - info_type => "DIRECT"} ); - $x_count++; - } - $self->add_direct_xref($xref_id, $wbgid, "gene", ""); - $d_count++; - - my $gseqname = $wbgene2seqid{$wbgid}; - - $xref_gseq_sth->execute($wbgid); - $xref_id = ($xref_gseq_sth->fetchrow_array())[0]; - if (not $xref_id) { - $xref_id = $self->add_xref({ acc => $wbgid, - label => $gseqname, - source_id => $wormbasegseq_src_id, - species_id => $species_id, - info_type => "DIRECT"} ); - $x_count++; - } - $self->add_direct_xref($xref_id, $wbgid, "gene", ""); - $d_count++; - - - if (exists $wbgene2loc{$wbgid}) { - my $loc_sym = $wbgene2loc{$wbgid}; - - $xref_wloc_sth->execute($wbgid); - $xref_id = ($xref_wloc_sth->fetchrow_array())[0]; - if (!$xref_id) { - $xref_id = $self->add_xref({ acc => $wbgid, - label => $loc_sym, - source_id => $wormbaselocus_src_id, - species_id => $species_id, - info_type => "DIRECT"} ); - $x_count++; - } - } - - # and direct xref - $self->add_direct_xref($xref_id, $wbgid, "gene", ""); - $d_count++; - } - - - foreach my $tid (keys %tran2wbtran) { - $xref_wtran_sth->execute($tid); - my $xref_id = ($xref_wtran_sth->fetchrow_array())[0]; - if (!$xref_id) { - $xref_id = $self->add_xref({ acc => $tid, - label => $tid, - source_id => $wormbasetran_src_id, - species_id => $species_id, - info_type => "DIRECT"} ); - $x_count++; - } - - # and direct xref - $self->add_direct_xref($xref_id, $tid, "transcript", ""); - $d_count++; - } - - foreach my $tid (keys %tran2wpep) { - my $wpep = $tran2wpep{$tid}; - - $xref_wpep_sth->execute($wpep); - - my $xref_id = ($xref_wpep_sth->fetchrow_array())[0]; - if (!$xref_id) { - $xref_id = $self->add_xref({ acc => $wpep, - label => $wpep, - source_id => $wormpep_src_id, - species_id => $species_id, - info_type => "DIRECT"} ); - $x_count++; - } - - # and direct xref - $self->add_direct_xref($xref_id, $tid, "translation", ""); - $d_count++; - } - - print "Added $d_count direct xrefs and $x_count xrefs\n" if($verbose); - return 0; + return $data; } +sub add_xref_and_direct_xref { + my ($self, $sth, $species_id, $object_type, $source_id, $object_id, $label, $primary_id) = @_; + $primary_id //= $object_id; + return unless $label; + $sth->execute($primary_id, $source_id); + $self->add_direct_xref( + ($sth->fetchrow_array())[0] + || $self->add_xref({ + acc => $object_id, + label => $label, + source_id => $source_id, + species_id => $species_id, + info_type => "DIRECT" + }) + , $primary_id, $object_type, ""); +} 1; diff --git a/modules/t/xref_parser.t b/modules/t/xref_parser.t index 2c77b54ea78293c4083a393f618bd1af2df086fb..95ad36a12780694214f6ba1cb042aefa23a4518d 100644 --- a/modules/t/xref_parser.t +++ b/modules/t/xref_parser.t @@ -55,32 +55,37 @@ my %xref_tables_expected_empty_by_default = ( ); my $tmp_dir = tempdir(CLEANUP=>1); sub store_in_temporary_file { - my $path = "$tmp_dir/tmp"; + my ($content, %opts) = @_; + my $path = join("/", $tmp_dir, $opts{tmp_file_name} || "tmp"); open(my $fh, ">", $path) or die $path; - print $fh @_; + print $fh $content; close($fh); return $path; } +# Happens to match the species id of the core database +my $SPECIES_ID = 1; +my $SPECIES_NAME = "Homo sapiens"; sub test_parser { - my ($parser, $content, $source_id, $expected, $test_name) = @_; + my ($parser, $content, $expected, $test_name, %opts) = @_; require_ok($parser); $parser->new($database)->run({ - files => [store_in_temporary_file($content)], - source_id => $source_id, - species_id => 1 #Happens to be right, but doesn't matter anyway - we are not testing the mapping + files => [store_in_temporary_file($content, %opts)], + source_id => "Source id (unused but sometimes required)", + species_id => $SPECIES_ID, + species => $SPECIES_NAME, }); my $expected_table_counts = {%xref_tables_expected_empty_by_default, %$expected}; subtest "$parser $test_name" => sub { plan tests => scalar(keys %$expected_table_counts); for my $table (keys %$expected_table_counts){ my $actual_count = count_rows($dba, $table); - $dba->dbc->prepare("delete from $table;")->execute() if $actual_count; + $dba->dbc->prepare("delete from $table;")->execute() if ($actual_count and not $opts{skip_clean}); my $expected_count = $expected_table_counts->{$table}; is($actual_count, $expected_count, "$table has $expected_count rows") or diag "$table has $actual_count rows"; } } } -test_parser("XrefParser::WormbaseDirectParser", "", "source_id (unused)", {}, "null case"); +test_parser("XrefParser::WormbaseDirectParser", "", {}, "null case"); my $wormbase_celegans_xrefs_head= <<EOF; // // WormBase Caenorhabditis elegans XREFs for WS265 @@ -102,11 +107,175 @@ my $wormbase_celegans_xrefs_head= <<EOF; 2L52.1 WBGene00007063 . 2L52.1a CE32090 BX284602 CELE_2L52.1 CCD61130 A4F336 2L52.2 WBGene00200402 . 2L52.2 . BX284602 CELE_2L52.2 . . EOF -test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head, "source_id (unused)", { -xref=>9, -gene_direct_xref => 6, +my $wormbase_celegans_xrefs_expected_count = { +xref=>11, +gene_direct_xref => 4, transcript_direct_xref => 3, -translation_direct_xref => 2, -}, "Direct xrefs: genes: count currently off due to some questionable duplicates, transcripts: as in column 4, translations: as in column 5. At least one direct xref per xref (but should be one to one)"); +translation_direct_xref => 4, +}; +test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head, + $wormbase_celegans_xrefs_expected_count, "Direct xrefs: genes: columns 1,2,3, transcripts: column 4, translations: column 5 and 7. xrefs: sum of these " +); + +my $uniprot_elegans_record = <<EOF; +ID A0A0K3AWR5_CAEEL Unreviewed; 220 AA. +AC A0A0K3AWR5; +DT 11-NOV-2015, integrated into UniProtKB/TrEMBL. +DT 11-NOV-2015, sequence version 1. +DT 18-JUL-2018, entry version 14. +DE SubName: Full=Uncharacterized protein {ECO:0000313|EMBL:CTQ86426.1}; +GN ORFNames=2L52.1 {ECO:0000313|EMBL:CTQ86426.1, +GN ECO:0000313|WormBase:2L52.1b}, +GN CELE_2L52.1 {ECO:0000313|EMBL:CTQ86426.1}; +OS $SPECIES_NAME. +OC Eukaryota; Metazoa; Ecdysozoa; Nematoda; Chromadorea; Rhabditida; +OC Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis. +OX NCBI_TaxID=$SPECIES_ID {ECO:0000313|EMBL:CTQ86426.1, ECO:0000313|Proteomes:UP000001940}; +RN [1] {ECO:0000313|EMBL:CTQ86426.1, ECO:0000313|Proteomes:UP000001940} +RP NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA]. +RC STRAIN=Bristol N2 {ECO:0000313|EMBL:CTQ86426.1, +RC ECO:0000313|Proteomes:UP000001940}; +RX PubMed=9851916; DOI=https://doi.org/10.1126/science.282.5396.2012; +RG The C. elegans sequencing consortium; +RA Sulson J.E., Waterston R.; +RT "Genome sequence of the nematode C. elegans: a platform for +RT investigating biology."; +RL Science 282:2012-2018(1998). +CC ----------------------------------------------------------------------- +CC Copyrighted by the UniProt Consortium, see https://www.uniprot.org/terms +CC Distributed under the Creative Commons Attribution (CC BY 4.0) License +CC ----------------------------------------------------------------------- +DR EMBL; BX284602; CTQ86426.1; -; Genomic_DNA. +DR RefSeq; NP_001300487.1; NM_001313558.1. +DR UniGene; Cel.25279; -. +DR EnsemblMetazoa; 2L52.1b; 2L52.1b; WBGene00007063. +DR GeneID; 181792; -. +DR CTD; 181792; -. +DR WormBase; 2L52.1b; CE50569; WBGene00007063; -. +DR Proteomes; UP000001940; Chromosome II. +DR ExpressionAtlas; A0A0K3AWR5; baseline and differential. +PE 4: Predicted; +KW Complete proteome {ECO:0000313|Proteomes:UP000001940}; +KW Reference proteome {ECO:0000313|Proteomes:UP000001940}. +SQ SEQUENCE 220 AA; 26028 MW; E12D5EA7F6FFF373 CRC64; + MSDNEEVYVN FRGMNCISTG KSASMVPSKR RNWPKRVKKR LSTQRNNQKT IRPPELNKNN + IEIKDMNSNN LEERNREECI QPVSVEKNIL HFEKFKSNQI CIVRENNKFR EGTRRRRKNS + GESEDLKIHE NFTEKRRPIR SCKQNISFYE MDGDIEEFEV FFDTPTKSKK VLLDIYSAKK + MPKIEVEDSL VNKFHSKRPS RACRVLGSME EVPFDVEIGY +// +EOF +test_parser("XrefParser::UniProtParser", $uniprot_elegans_record, { + xref => 3, + primary_xref => 1, + dependent_xref => 2, +},"Example UniProt record"); +(my $uniprot_elegans_record_embl = $uniprot_elegans_record) =~ s/DR EMBL;.*?\n//; +test_parser("XrefParser::UniProtParser", $uniprot_elegans_record_embl, { + xref => 1, + primary_xref => 1, +},"EMBL entries are the dependent xrefs"); +my @recognised_sources = ( + "PDB; 3HRI; X-ray; 2.85 A; A/B/C/D/E/F=44-477.", + "MEROPS; C26.956; -.", +); +for my $l (@recognised_sources) { + (my $uniprot_elegans_record_extra_line = $uniprot_elegans_record) =~ s/DR(.*?)\n/DR$1\nDR $l/; + test_parser("XrefParser::UniProtParser", $uniprot_elegans_record_extra_line, { + xref => 4, + primary_xref => 1, + dependent_xref => 3, + }, "Pick up as extra xref + dependent xref: $l" ); +} +test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record, { +}, "No UniProt entries without corresponding INSDC entries"); + +test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head, + $wormbase_celegans_xrefs_expected_count, "Test again to set up the next test", +skip_clean => 1); +my $wormbase_and_uniprot_expected_count = { + %$wormbase_celegans_xrefs_expected_count, + xref => $wormbase_celegans_xrefs_expected_count->{xref}+1, + dependent_xref => 1 #protein id still there, no parent sequence ID +}; +test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record, + $wormbase_and_uniprot_expected_count, "Get counts"); + +for my $l (@recognised_sources) { + (my $uniprot_elegans_record_extra_line = $uniprot_elegans_record) =~ s/DR(.*?)\n/DR$1\nDR $l/; + test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head, + $wormbase_celegans_xrefs_expected_count, "Test again to set up the next test", + skip_clean => 1); + test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record_extra_line, { + %$wormbase_and_uniprot_expected_count, + xref => $wormbase_and_uniprot_expected_count->{xref}+1, + dependent_xref => $wormbase_and_uniprot_expected_count->{dependent_xref}+1, + }, "Pick up as extra xref + dependent xref: $l" ); +} +my $refseq_protein_elegans_record = <<EOF; +LOCUS NP_493629 427 aa linear INV 19-AUG-2018 +DEFINITION Uncharacterized protein CELE_2L52.1 [Caenorhabditis elegans]. +ACCESSION NP_493629 +VERSION NP_493629.2 +DBLINK BioProject: PRJNA158 + BioSample: SAMEA3138177 +DBSOURCE REFSEQ: accession NM_061228.2 +KEYWORDS RefSeq. +SOURCE Caenorhabditis elegans + ORGANISM Caenorhabditis elegans + Eukaryota; Metazoa; Ecdysozoa; Nematoda; Chromadorea; Rhabditida; + Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis. +REFERENCE + <snipped> +COMMENT REVIEWED REFSEQ: This record has been curated by WormBase. The + reference sequence is identical to CCD61130. +FEATURES Location/Qualifiers + source 1..427 + /organism="Caenorhabditis elegans" + /strain="Bristol N2" + /db_xref="taxon:$SPECIES_ID" + /chromosome="II" + Protein 1..427 + /product="hypothetical protein" + /calculated_mol_wt=49887 + CDS 1..427 + /gene="2L52.1" + /locus_tag="CELE_2L52.1" + /standard_name="2L52.1a" + /coded_by="NM_061228.2:1..1284" + /note="Confirmed by transcript evidence" + /db_xref="EnsemblGenomes-Gn:WBGene00007063" + /db_xref="EnsemblGenomes-Tr:2L52.1a" + /db_xref="GeneID:181792" + /db_xref="GOA:A4F336" + /db_xref="InterPro:IPR013087" + /db_xref="UniProtKB/TrEMBL:A4F336" + /db_xref="WormBase:WBGene00007063" +ORIGIN + 1 msmvrnvsnq sekleilsck wvgclkstev fktveklldh vtadhipevi vnddgseevv + 61 cqwdccemga srgnlqkkke wmenhfktrh vrkakifkcl iedcpvvkss sqeiethlri + 121 shpinpkker lkefksstdh ieptqanrvw tivngevqwk tpprvkkktv iyyddgpryv + 181 fptgcarcny dsdeselesd efwsatemsd neevyvnfrg mncistgksa smvpskrrnw + 241 pkrvkkrlst qrnnqktirp pelnknniei kdmnsnnlee rnreeciqpv sveknilhfe + 301 kfksnqiciv rennkfregt rrrrknsges edlkihenft ekrrpirsck qnisfyemdg + 361 dieefevffd tptkskkvll diysakkmpk ievedslvnk fhskrpsrac rvlgsmeevp + 421 fdveigy +// +EOF +test_parser("XrefParser::RefSeqGPFFParser",$refseq_protein_elegans_record, { + xref =>1, + primary_xref => 1, +}, "Example RefSeq protein record" , tmp_file_name => "something_that_says_protein"); +test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$refseq_protein_elegans_record, { +}, "No entries without WormBase records" , tmp_file_name => "something_that_says_protein"); + +test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head, + $wormbase_celegans_xrefs_expected_count, "Test again to set up the next test", +skip_clean => 1); +test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$refseq_protein_elegans_record, { + %$wormbase_celegans_xrefs_expected_count, + xref => $wormbase_celegans_xrefs_expected_count->{xref}+1, + dependent_xref => 1, +}, "RefSeq entries hang off INSDC entries", tmp_file_name => "something_that_says_protein"); done_testing(); +