diff --git a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm index 71d793818afd551b3dbcca71dfef1d304ad0f91f..d04724738b66d942d51430ceb782d84de823687e 100644 --- a/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/RefSeqGPFFParser.pm @@ -135,7 +135,7 @@ sub create_xrefs { # Create a hash of all valid names and taxon_ids for this species my %species2name = $self->species_id2name($dbi); if (defined $species_name) { push @{$species2name{$species_id}}, $species_name; } - if (!defined $species2name{$species_id}) { next; } + if (!defined $species2name{$species_id}) { return; } my %species2tax = $self->species_id2taxonomy($dbi); push @{$species2tax{$species_id}}, $species_id; my @names = @{$species2name{$species_id}}; @@ -164,30 +164,48 @@ sub create_xrefs { local $/ = "\/\/\n"; - my $type; - if ($file =~ /protein/) { + my $type = $self->type_from_file($file); + return unless $type; - $type = 'peptide'; + while ( my $entry = $refseq_io->getline() ) { - } elsif ($file =~ /rna/) { + my $xref = $self->xref_from_record( + $entry, + \%name2species_id, \%taxonomy2species_id, + $pred_mrna_source_id, $pred_ncrna_source_id, + $mrna_source_id, $ncrna_source_id, + $pred_peptide_source_id, $peptide_source_id, + $entrez_source_id, $wiki_source_id, $add_dependent_xref_sth, + $species_id, $type, \%refseq_ids,\%entrez_ids,\%wiki_ids + ); - $type = 'dna'; + push @xrefs, $xref if $xref; - } elsif($file =~ /RefSeq_protein/){ - - $type = 'peptide'; - - }else{ - print STDERR "Could not work out sequence type for $file\n"; - return; - } + } # while <REFSEQ> + $refseq_io->close(); - while ( $_ = $refseq_io->getline() ) { + print "Read " . scalar(@xrefs) ." xrefs from $file\n" if($verbose); - my $xref; + return \@xrefs; - my $entry = $_; +} +sub type_from_file { + my ($self, $file) = @_; + return 'peptide' if $file =~ /RefSeq_protein/; + return 'dna' if $file =~ /rna/; + return 'peptide' if $file =~ /protein/; + print STDERR "Could not work out sequence type for $file\n"; + return; +} +sub xref_from_record { + my ( $self, $entry, $name2species_id, $taxonomy2species_id, + $pred_mrna_source_id, $pred_ncrna_source_id, + $mrna_source_id, $ncrna_source_id, + $pred_peptide_source_id, $peptide_source_id, + $entrez_source_id, $wiki_source_id, $add_dependent_xref_sth, + $species_id, $type, $refseq_ids,$entrez_ids,$wiki_ids +) = @_; chomp $entry; my ($species) = $entry =~ /\s+ORGANISM\s+(.*)\n/; @@ -196,12 +214,12 @@ sub create_xrefs { $species =~ s/\s*\(.+\)//; # Ditch anything in parens $species =~ s/\s+/_/g; $species =~ s/\n//g; - my $species_id_check = $name2species_id{$species}; + my $species_id_check = $name2species_id->{$species}; # Try going through the taxon ID if species check didn't work. if ( !defined $species_id_check ) { my ($taxon_id) = $entry =~ /db_xref="taxon:(\d+)"/; - $species_id_check = $taxonomy2species_id{$taxon_id}; + $species_id_check = $taxonomy2species_id->{$taxon_id}; } # skip xrefs for species that aren't in the species table @@ -209,6 +227,7 @@ sub create_xrefs { && defined $species_id_check && $species_id == $species_id_check ) { + my $xref = {}; my ($acc) = $entry =~ /ACCESSION\s+(\S+)/; my ($ver) = $entry =~ /VERSION\s+(\S+)/; my ($refseq_pair) = $entry =~ /DBSOURCE\s+REFSEQ: accession (\S+)/; @@ -243,7 +262,7 @@ sub create_xrefs { $description =~ s/\s+/ /g; $description = substr($description, 0, 255) if (length($description) > 255); - my ($seq) = $_ =~ /^\s*ORIGIN\s+(.+)/ms; # /s allows . to match newline + my ($seq) = $entry =~ /^\s*ORIGIN\s+(.+)/ms; # /s allows . to match newline my @seq_lines = split /\n/, $seq; my $parsed_seq = ""; foreach my $x (@seq_lines) { @@ -311,12 +330,12 @@ sub create_xrefs { # Add xrefs for RefSeq mRNA as well where available $refseq_pair =~ s/\.[0-9]*// if $refseq_pair; if (defined $refseq_pair) { - if ($refseq_ids{$refseq_pair}) { - foreach my $refseq_id (@{ $refseq_ids{$refseq_pair} }) { - foreach my $entrez_id (@{ $entrez_ids{$ll} }) { + if ($refseq_ids->{$refseq_pair}) { + foreach my $refseq_id (@{ $refseq_ids->{$refseq_pair} }) { + foreach my $entrez_id (@{ $entrez_ids->{$ll} }) { $add_dependent_xref_sth->execute($refseq_id, $entrez_id); } - foreach my $wiki_id (@{ $wiki_ids{$ll} }) { + foreach my $wiki_id (@{ $wiki_ids->{$ll} }) { $add_dependent_xref_sth->execute($refseq_id, $wiki_id); } } @@ -328,19 +347,8 @@ sub create_xrefs { # Don't add SGD Xrefs, as they are mapped directly from SGD ftp site # Refseq's do not tell whether the mim is for the gene of morbid so ignore for now. - - push @xrefs, $xref; - - }# if defined species - - } # while <REFSEQ> - - $refseq_io->close(); - - print "Read " . scalar(@xrefs) ." xrefs from $file\n" if($verbose); - - return \@xrefs; - + return $xref; + } } # -------------------------------------------------------------------------------- diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm new file mode 100644 index 0000000000000000000000000000000000000000..53558bddbdae544f6cf7d5247a0b52162a448136 --- /dev/null +++ b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansBase.pm @@ -0,0 +1,67 @@ +=head1 LICENSE + +Copyright [2018] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=cut +use strict; +use warnings; +package XrefParser::WormbaseCElegansBase; + +sub swap_dependency { + my ($self, $source_ids, $dbi, $xref, @source_ids_skip) = @_; + $source_ids = [$source_ids] unless ref $source_ids eq 'ARRAY'; + + my @matching_source_id_dependents; + my @other_dependents; + for my $dependent_xref (@{$xref->{DEPENDENT_XREFS} || []}){ + my $source_id_here = $dependent_xref->{SOURCE_ID}; + if(grep {$_ == $source_id_here } @$source_ids + and $self->get_xref($dependent_xref->{ACCESSION}, $dependent_xref->{SOURCE_ID}, $xref->{SPECIES_ID})){ + $dependent_xref->{SPECIES_ID} = $xref->{SPECIES_ID}; + push @matching_source_id_dependents, $dependent_xref; + } elsif (grep {$_ == $source_id_here} @source_ids_skip){ + #skip + } else { + push @other_dependents, $dependent_xref; + } + } + my @result; + for my $matching_source_id_dependent (@matching_source_id_dependents) { + my $source_id = $matching_source_id_dependent->{SOURCE_ID}; + my $xref_as_dependent_here = { + %$xref, INFO_TYPE => "DEPENDENT", + LINKAGE_SOURCE_ID => $source_id, + DEPENDENT_XREFS => undef, + }; + my @dependents_here = ({ + %$xref, INFO_TYPE => "DEPENDENT", + LINKAGE_SOURCE_ID => $source_id, + DEPENDENT_XREFS => undef, + }); + for my $d (@other_dependents){ + push @dependents_here, { + %$d, INFO_TYPE => "DEPENDENT", LINKAGE_SOURCE_ID => $source_id, + }; + } + push @result, { + %$matching_source_id_dependent, + LABEL=>undef, INFO_TYPE => "MISC", + DEPENDENT_XREFS =>\@dependents_here, + }; + } + + return @result; +} +1; diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm new file mode 100644 index 0000000000000000000000000000000000000000..da456fc3fc6115632e4b9876efcc2a570b59c458 --- /dev/null +++ b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansRefSeqGPFFParser.pm @@ -0,0 +1,68 @@ +=head1 LICENSE + +Copyright [2018] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=cut + +package XrefParser::WormbaseCElegansRefSeqGPFFParser; +use strict; +use warnings; +use parent qw/XrefParser::WormbaseCElegansBase XrefParser::RefSeqGPFFParser/; +my $SOURCE_IDS; +my $ACCESSION_FROM_ENTRY_PATTERN; +sub run { + my ($self, $arg_ref) = @_; + my $type = $self->type_from_file(@{$arg_ref->{files}}); + if($type eq 'peptide'){ + $SOURCE_IDS = [ $self->get_source_id_for_source_name('protein_id') ]; + $ACCESSION_FROM_ENTRY_PATTERN = qr/This record has been curated by WormBase. The\s+reference sequence is identical to (.*?)\./; + } elsif ($type eq 'dna'){ + $SOURCE_IDS = [ + $self->get_source_id_for_source_name('wormbase_cds'), + $self->get_source_id_for_source_name('wormbase_transcript'), + ]; + $ACCESSION_FROM_ENTRY_PATTERN = qr/standard_name="(.*?)"/; + } + die %$arg_ref unless @$SOURCE_IDS; + return $self->SUPER::run($arg_ref); +} +sub upload_xref_object_graphs { + my ($self, $xrefs, $dbi) = @_; + my @adapted_xrefs; + for my $xref ( @$xrefs) { + push @adapted_xrefs, $self->swap_dependency($SOURCE_IDS, $dbi, $xref, @$SOURCE_IDS); + } + return $self->SUPER::upload_xref_object_graphs(\@adapted_xrefs, $dbi); +} + +sub xref_from_record { + my ($self, $entry, @args) = @_; + return &modify_xref_with_dependent( + $SOURCE_IDS, $entry, + $self->SUPER::xref_from_record($entry, @args), + $ACCESSION_FROM_ENTRY_PATTERN, + ); +} + +sub modify_xref_with_dependent { + my ($source_ids, $entry, $xref, $get_accession_pattern) = @_; + return unless $xref; + my ($accession) = $entry =~ $get_accession_pattern; + return unless $accession; + $xref->{DEPENDENT_XREFS} //= []; + push @{$xref->{DEPENDENT_XREFS}}, map {{ACCESSION => $accession, SOURCE_ID=>$_}} @$source_ids; + return $xref; +} +1; diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansUniProtParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansUniProtParser.pm new file mode 100644 index 0000000000000000000000000000000000000000..68b2c7e45b4109104466d81d250a7236836538d0 --- /dev/null +++ b/misc-scripts/xref_mapping/XrefParser/WormbaseCElegansUniProtParser.pm @@ -0,0 +1,44 @@ +=head1 LICENSE + +Copyright [2018] EMBL-European Bioinformatics Institute + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=cut +use strict; +use warnings; +package XrefParser::WormbaseCElegansUniProtParser; + +# UniProt xrefs are sometimes - really - dependent xrefs of +# INSDC entries which we get from somewhere else +# Attempt to find the parent (has to already be present in the xref table) +# INSDC and UniProt entries have the same protein sequence, and +# UniProt lists INSDC as a parent. We get INSDC entries from somewhere else, +# so make UniProt entries dependent on INSDC entries. +# Note: +# INSDC entries have coordinates, and UniProt entries don't. +# So for perfect homologs, there can be many INSDC entries per UniProt. + +use parent qw/XrefParser::WormbaseCElegansBase XrefParser::UniProtParser/; + +sub upload_xref_object_graphs { + my ($self, $xrefs, $dbi) = @_; + my $source_id = $self->get_source_id_for_source_name('protein_id'); + my $source_id_skip = $self->get_source_id_for_source_name('EMBL'); + my @adapted_xrefs; + for my $xref ( @$xrefs) { + push @adapted_xrefs, $self->swap_dependency($source_id, $dbi, $xref, $source_id_skip); + } + return $self->SUPER::upload_xref_object_graphs(\@adapted_xrefs, $dbi); +} +1; diff --git a/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm b/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm index 0d570d862f638facab43264116f0f2fa0bdc0817..7b5e1d88ed1ebb80141d73727a543448a6ad5a7f 100644 --- a/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm +++ b/misc-scripts/xref_mapping/XrefParser/WormbaseDirectParser.pm @@ -28,151 +28,96 @@ use XrefParser::BaseParser; use base qw( XrefParser::BaseParser ); - sub run { my ($self, $ref_arg) = @_; my $source_id = $ref_arg->{source_id}; my $species_id = $ref_arg->{species_id}; my $files = $ref_arg->{files}; - my $verbose = $ref_arg->{verbose}; if((!defined $source_id) or (!defined $species_id) or (!defined $files)){ croak "Need to pass source_id, species_id and files as pairs"; } - $verbose |=0; my $file = @{$files}[0]; - - my $wormbasegene_src_id = $self->get_source_id_for_source_name('wormbase_gene'); - my $wormbasegseq_src_id = $self->get_source_id_for_source_name('wormbase_gseqname'); - my $wormbaselocus_src_id = $self->get_source_id_for_source_name('wormbase_locus'); - my $wormbasetran_src_id = $self->get_source_id_for_source_name('wormbase_transcript'); - my $wormpep_src_id = $self->get_source_id_for_source_name('wormpep_id'); - - my $xref_wgene_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbasegene_src_id AND species_id=$species_id"); - my $xref_gseq_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbasegseq_src_id AND species_id=$species_id"); - my $xref_wloc_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbaselocus_src_id AND species_id=$species_id"); - my $xref_wtran_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormbasetran_src_id AND species_id=$species_id"); - my $xref_wpep_sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=$wormpep_src_id AND species_id=$species_id"); - - my $pep_io = $self->get_filehandle($file); - - if ( !defined $pep_io ) { - print STDERR "ERROR: Could not open $file\n"; - return 1; # 1 error + my @fields = qw/wormbase_gene wormbase_gseqname wormbase_locus wormbase_transcript wormbase_cds wormpep_id protein_id/; + my %src_ids; + my $sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=? AND species_id=$species_id"); + for my $field (@fields){ + $src_ids{$field} = $self->get_source_id_for_source_name($field); } + my $data = $self->get_data(@$files); + for my $gene_id (keys %$data){ + $self->add_xref_and_direct_xref( + $sth, $species_id, "gene", $src_ids{wormbase_gene}, + $gene_id, $gene_id + ); + $self->add_xref_and_direct_xref( + $sth, $species_id, "gene", $src_ids{wormbase_gseqname}, + $gene_id, $data->{$gene_id}->{wormbase_gseqname} + ); + $self->add_xref_and_direct_xref( + $sth, $species_id, "gene", $src_ids{wormbase_locus}, + $gene_id, $data->{$gene_id}->{wormbase_locus} + ); + for my $transcript (@{$data->{$gene_id}->{transcripts}}){ + $self->add_xref_and_direct_xref( + $sth, $species_id, "transcript", $src_ids{wormbase_transcript}, + $transcript->{transcript_id}, $transcript->{transcript_id} + ); + $self->add_xref_and_direct_xref( + $sth, $species_id, "transcript", $src_ids{wormbase_cds}, + $transcript->{wormbase_cds}, $transcript->{wormbase_cds}, $transcript->{transcript_id} + ); + $self->add_xref_and_direct_xref( + $sth, $species_id, "translation", $src_ids{wormpep_id}, + $transcript->{wormpep_id}, $transcript->{wormpep_id}, $transcript->{transcript_id} + ); + $self->add_xref_and_direct_xref( + $sth, $species_id, "translation", $src_ids{protein_id}, + $transcript->{protein_id}, $transcript->{protein_id}, $transcript->{transcript_id} + ); + } + } +} +sub get_data { + my ($self, $file) = @_; + my $pep_io = $self->get_filehandle($file) or croak "Could not open: $file"; - my ($x_count, $d_count); - - my (%wbgene2seqid, %wbgene2loc, %tran2wbtran, %tran2wpep); + my $data = {}; while ( $_ = $pep_io->getline() ) { next if /^\/\//; - - my ($gseqid, $wbgeneid, $locus, $wbtranscript, $wormpep) = split(/\t/, $_); - - # Each WBGeneid should have only one sequence name and (optionally) one locus name - $wbgene2seqid{$wbgeneid} = $gseqid; - $wbgene2loc{$wbgeneid} = $locus if $locus ne '.'; - - $tran2wbtran{$wbtranscript} = 1; - $tran2wpep{$wbtranscript} = $wormpep if $wormpep ne '.'; - + my ($gseqid, $wbgeneid, $locus, $wbtranscript, $wormpep, $insdc_parent, $insdc_locus_tag, $protein_id, $uniprot_id) = split(/\t/, $_); + + $data->{$wbgeneid}->{transcripts} //=[]; + push @{$data->{$wbgeneid}->{transcripts}}, { + transcript_id => $wbtranscript, + ($wormpep ne '.' && $wbtranscript =~ /^(.*?)(\.\d+)?$/ ? (wormbase_cds => $1 ) : ()), + ($wormpep ne '.' ? (wormpep_id => $wormpep) : ()), + ($protein_id ne '.' ? (protein_id => $protein_id) : ()), + }; + $data->{$wbgeneid}->{wormbase_gseqname} = $gseqid; + $data->{$wbgeneid}->{wormbase_locus} = $locus if $locus ne '.'; } $pep_io->close(); - - foreach my $wbgid (keys %wbgene2seqid) { - # reuse or create xref - $xref_wgene_sth->execute($wbgid); - my $xref_id = ($xref_wgene_sth->fetchrow_array())[0]; - if (!$xref_id) { - $xref_id = $self->add_xref({ acc => $wbgid, - label => $wbgid, - source_id => $wormbasegene_src_id, - species_id => $species_id, - info_type => "DIRECT"} ); - $x_count++; - } - $self->add_direct_xref($xref_id, $wbgid, "gene", ""); - $d_count++; - - my $gseqname = $wbgene2seqid{$wbgid}; - - $xref_gseq_sth->execute($wbgid); - $xref_id = ($xref_gseq_sth->fetchrow_array())[0]; - if (not $xref_id) { - $xref_id = $self->add_xref({ acc => $wbgid, - label => $gseqname, - source_id => $wormbasegseq_src_id, - species_id => $species_id, - info_type => "DIRECT"} ); - $x_count++; - } - $self->add_direct_xref($xref_id, $wbgid, "gene", ""); - $d_count++; - - - if (exists $wbgene2loc{$wbgid}) { - my $loc_sym = $wbgene2loc{$wbgid}; - - $xref_wloc_sth->execute($wbgid); - $xref_id = ($xref_wloc_sth->fetchrow_array())[0]; - if (!$xref_id) { - $xref_id = $self->add_xref({ acc => $wbgid, - label => $loc_sym, - source_id => $wormbaselocus_src_id, - species_id => $species_id, - info_type => "DIRECT"} ); - $x_count++; - } - } - - # and direct xref - $self->add_direct_xref($xref_id, $wbgid, "gene", ""); - $d_count++; - } - - - foreach my $tid (keys %tran2wbtran) { - $xref_wtran_sth->execute($tid); - my $xref_id = ($xref_wtran_sth->fetchrow_array())[0]; - if (!$xref_id) { - $xref_id = $self->add_xref({ acc => $tid, - label => $tid, - source_id => $wormbasetran_src_id, - species_id => $species_id, - info_type => "DIRECT"} ); - $x_count++; - } - - # and direct xref - $self->add_direct_xref($xref_id, $tid, "transcript", ""); - $d_count++; - } - - foreach my $tid (keys %tran2wpep) { - my $wpep = $tran2wpep{$tid}; - - $xref_wpep_sth->execute($wpep); - - my $xref_id = ($xref_wpep_sth->fetchrow_array())[0]; - if (!$xref_id) { - $xref_id = $self->add_xref({ acc => $wpep, - label => $wpep, - source_id => $wormpep_src_id, - species_id => $species_id, - info_type => "DIRECT"} ); - $x_count++; - } - - # and direct xref - $self->add_direct_xref($xref_id, $tid, "translation", ""); - $d_count++; - } - - print "Added $d_count direct xrefs and $x_count xrefs\n" if($verbose); - return 0; + return $data; } +sub add_xref_and_direct_xref { + my ($self, $sth, $species_id, $object_type, $source_id, $object_id, $label, $primary_id) = @_; + $primary_id //= $object_id; + return unless $label; + $sth->execute($primary_id, $source_id); + $self->add_direct_xref( + ($sth->fetchrow_array())[0] + || $self->add_xref({ + acc => $object_id, + label => $label, + source_id => $source_id, + species_id => $species_id, + info_type => "DIRECT" + }) + , $primary_id, $object_type, ""); +} 1; diff --git a/misc-scripts/xref_mapping/xref_config.ini b/misc-scripts/xref_mapping/xref_config.ini index 69889a109487bb49b4fcd8204f121f1878ba6c5c..fe7c917c79d9953d6d4630ed68ac13435511716b 100644 --- a/misc-scripts/xref_mapping/xref_config.ini +++ b/misc-scripts/xref_mapping/xref_config.ini @@ -1652,6 +1652,14 @@ order = 50 priority = 1 parser = comes from WormbaseDirectParser +[source wormbase_cds::wormbase] +# Used by wormbase core species +name = wormbase_cds +download = N +order = 50 +priority = 1 +parser = comes from WormbaseDirectParser + [source Gramene_Pathway::arabidopsis_thaliana] # Used by Arabidopsis thaliana, Gramene-specific name = Gramene_Pathway diff --git a/modules/t/xref_parser.t b/modules/t/xref_parser.t index 2c77b54ea78293c4083a393f618bd1af2df086fb..03f16c15bb263d87e35f3ebc97112b94fa5f4a9a 100644 --- a/modules/t/xref_parser.t +++ b/modules/t/xref_parser.t @@ -55,32 +55,37 @@ my %xref_tables_expected_empty_by_default = ( ); my $tmp_dir = tempdir(CLEANUP=>1); sub store_in_temporary_file { - my $path = "$tmp_dir/tmp"; + my ($content, %opts) = @_; + my $path = join("/", $tmp_dir, $opts{tmp_file_name} || "tmp"); open(my $fh, ">", $path) or die $path; - print $fh @_; + print $fh $content; close($fh); return $path; } +# Happens to match the species id of the core database +my $SPECIES_ID = 1; +my $SPECIES_NAME = "Homo sapiens"; sub test_parser { - my ($parser, $content, $source_id, $expected, $test_name) = @_; + my ($parser, $content, $expected, $test_name, %opts) = @_; require_ok($parser); $parser->new($database)->run({ - files => [store_in_temporary_file($content)], - source_id => $source_id, - species_id => 1 #Happens to be right, but doesn't matter anyway - we are not testing the mapping + files =>[store_in_temporary_file($content, %opts)], + source_id => "Source id (unused but sometimes required)", + species_id => $SPECIES_ID, + species => $SPECIES_NAME, }); my $expected_table_counts = {%xref_tables_expected_empty_by_default, %$expected}; subtest "$parser $test_name" => sub { plan tests => scalar(keys %$expected_table_counts); for my $table (keys %$expected_table_counts){ my $actual_count = count_rows($dba, $table); - $dba->dbc->prepare("delete from $table;")->execute() if $actual_count; + $dba->dbc->prepare("delete from $table;")->execute() if ($actual_count and not $opts{skip_clean}); my $expected_count = $expected_table_counts->{$table}; is($actual_count, $expected_count, "$table has $expected_count rows") or diag "$table has $actual_count rows"; } } } -test_parser("XrefParser::WormbaseDirectParser", "", "source_id (unused)", {}, "null case"); +test_parser("XrefParser::WormbaseDirectParser", "", {}, "null case"); my $wormbase_celegans_xrefs_head= <<EOF; // // WormBase Caenorhabditis elegans XREFs for WS265 @@ -99,14 +104,264 @@ my $wormbase_celegans_xrefs_head= <<EOF; // Missing or not applicable data (e.g. protein identifiers for non-coding RNAs) is denoted by a "." // 2L52.1 WBGene00007063 . 2L52.1b CE50569 BX284602 CELE_2L52.1 CTQ86426 A0A0K3AWR5 -2L52.1 WBGene00007063 . 2L52.1a CE32090 BX284602 CELE_2L52.1 CCD61130 A4F336 +2L52.1 WBGene00007063 . 2L52.1a.1 CE32090 BX284602 CELE_2L52.1 CCD61130 A4F336 +2L52.1 WBGene00007063 . 2L52.1a.2 CE32090 BX284602 CELE_2L52.1 CCD61130 A4F336 2L52.2 WBGene00200402 . 2L52.2 . BX284602 CELE_2L52.2 . . EOF -test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head, "source_id (unused)", { -xref=>9, -gene_direct_xref => 6, -transcript_direct_xref => 3, -translation_direct_xref => 2, -}, "Direct xrefs: genes: count currently off due to some questionable duplicates, transcripts: as in column 4, translations: as in column 5. At least one direct xref per xref (but should be one to one)"); +my $wormbase_celegans_xrefs_expected_count = { +xref=>14, +gene_direct_xref => 4, +transcript_direct_xref => 7, +translation_direct_xref => 6, +}; +test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head, + $wormbase_celegans_xrefs_expected_count, "Direct xrefs: genes: columns 1,2,3, transcripts: column 4 once as transcript and also once as CDS for coding genes, translations: column 5 and 7. xrefs: sum of these minus one reused CDS and two reused proteins" +); + +my $uniprot_elegans_record = <<EOF; +ID A0A0K3AWR5_CAEEL Unreviewed; 220 AA. +AC A0A0K3AWR5; +DT 11-NOV-2015, integrated into UniProtKB/TrEMBL. +DT 11-NOV-2015, sequence version 1. +DT 18-JUL-2018, entry version 14. +DE SubName: Full=Uncharacterized protein {ECO:0000313|EMBL:CTQ86426.1}; +GN ORFNames=2L52.1 {ECO:0000313|EMBL:CTQ86426.1, +GN ECO:0000313|WormBase:2L52.1b}, +GN CELE_2L52.1 {ECO:0000313|EMBL:CTQ86426.1}; +OS $SPECIES_NAME. +OC Eukaryota; Metazoa; Ecdysozoa; Nematoda; Chromadorea; Rhabditida; +OC Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis. +OX NCBI_TaxID=$SPECIES_ID {ECO:0000313|EMBL:CTQ86426.1, ECO:0000313|Proteomes:UP000001940}; +RN [1] {ECO:0000313|EMBL:CTQ86426.1, ECO:0000313|Proteomes:UP000001940} +RP NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA]. +RC STRAIN=Bristol N2 {ECO:0000313|EMBL:CTQ86426.1, +RC ECO:0000313|Proteomes:UP000001940}; +RX PubMed=9851916; DOI=https://doi.org/10.1126/science.282.5396.2012; +RG The C. elegans sequencing consortium; +RA Sulson J.E., Waterston R.; +RT "Genome sequence of the nematode C. elegans: a platform for +RT investigating biology."; +RL Science 282:2012-2018(1998). +CC ----------------------------------------------------------------------- +CC Copyrighted by the UniProt Consortium, see https://www.uniprot.org/terms +CC Distributed under the Creative Commons Attribution (CC BY 4.0) License +CC ----------------------------------------------------------------------- +DR EMBL; BX284602; CTQ86426.1; -; Genomic_DNA. +DR RefSeq; NP_001300487.1; NM_001313558.1. +DR UniGene; Cel.25279; -. +DR EnsemblMetazoa; 2L52.1b; 2L52.1b; WBGene00007063. +DR GeneID; 181792; -. +DR CTD; 181792; -. +DR WormBase; 2L52.1b; CE50569; WBGene00007063; -. +DR Proteomes; UP000001940; Chromosome II. +DR ExpressionAtlas; A0A0K3AWR5; baseline and differential. +PE 4: Predicted; +KW Complete proteome {ECO:0000313|Proteomes:UP000001940}; +KW Reference proteome {ECO:0000313|Proteomes:UP000001940}. +SQ SEQUENCE 220 AA; 26028 MW; E12D5EA7F6FFF373 CRC64; + MSDNEEVYVN FRGMNCISTG KSASMVPSKR RNWPKRVKKR LSTQRNNQKT IRPPELNKNN + IEIKDMNSNN LEERNREECI QPVSVEKNIL HFEKFKSNQI CIVRENNKFR EGTRRRRKNS + GESEDLKIHE NFTEKRRPIR SCKQNISFYE MDGDIEEFEV FFDTPTKSKK VLLDIYSAKK + MPKIEVEDSL VNKFHSKRPS RACRVLGSME EVPFDVEIGY +// +EOF +test_parser("XrefParser::UniProtParser", $uniprot_elegans_record, { + xref => 3, + primary_xref => 1, + dependent_xref => 2, +},"Example UniProt record"); +(my $uniprot_elegans_record_embl = $uniprot_elegans_record) =~ s/DR EMBL;.*?\n//; +test_parser("XrefParser::UniProtParser", $uniprot_elegans_record_embl, { + xref => 1, + primary_xref => 1, +},"EMBL entries are the dependent xrefs"); +my @recognised_sources = ( + "PDB; 3HRI; X-ray; 2.85 A; A/B/C/D/E/F=44-477.", + "MEROPS; C26.956; -.", +); +for my $l (@recognised_sources) { + (my $uniprot_elegans_record_extra_line = $uniprot_elegans_record) =~ s/DR(.*?)\n/DR$1\nDR $l/; + test_parser("XrefParser::UniProtParser", $uniprot_elegans_record_extra_line, { + xref => 4, + primary_xref => 1, + dependent_xref => 3, + }, "Pick up as extra xref + dependent xref: $l" ); +} +sub test_elegans_uniprot { + my ($expected_count, $extra_line) = @_; + my $uniprot_elegans_record_here = $uniprot_elegans_record; + $uniprot_elegans_record_here =~ s/DR(.*?)\n/DR$1\nDR $extra_line/ if $extra_line; + test_parser("XrefParser::WormbaseCElegansUniProtParser", + $uniprot_elegans_record_here, + {}, + "No UniProt entries without corresponding INSDC entries $extra_line" + ); + test_parser("XrefParser::WormbaseDirectParser", + $wormbase_celegans_xrefs_head, + $wormbase_celegans_xrefs_expected_count, + "Test again to set up the next test", + skip_clean => 1, + ); + test_parser("XrefParser::WormbaseCElegansUniProtParser", + $uniprot_elegans_record_here, + $expected_count, + "Correct xrefs and dependent xrefs $extra_line", + ); +} +my $wormbase_and_uniprot_expected_count = { + %$wormbase_celegans_xrefs_expected_count, + xref => $wormbase_celegans_xrefs_expected_count->{xref}+1, + dependent_xref => 1 #protein id still there, no parent sequence ID +}; +test_elegans_uniprot($wormbase_and_uniprot_expected_count, ""); +for my $l (@recognised_sources) { + test_elegans_uniprot({ + %$wormbase_and_uniprot_expected_count, + xref => $wormbase_and_uniprot_expected_count->{xref}+1, + dependent_xref => $wormbase_and_uniprot_expected_count->{dependent_xref}+1, + },$l); +} +test_elegans_uniprot({ + %$wormbase_and_uniprot_expected_count, + dependent_xref => $wormbase_and_uniprot_expected_count->{dependent_xref}+1, + }, "EMBL; BX284602; CCD61130.1; -; Genomic_DNA", +); + +my $refseq_protein_elegans_record = <<EOF; +LOCUS NP_493629 427 aa linear INV 19-AUG-2018 +DEFINITION Uncharacterized protein CELE_2L52.1 [Caenorhabditis elegans]. +ACCESSION NP_493629 +VERSION NP_493629.2 +DBLINK BioProject: PRJNA158 + BioSample: SAMEA3138177 +DBSOURCE REFSEQ: accession NM_061228.2 +KEYWORDS RefSeq. +SOURCE Caenorhabditis elegans + ORGANISM Caenorhabditis elegans + Eukaryota; Metazoa; Ecdysozoa; Nematoda; Chromadorea; Rhabditida; + Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis. +REFERENCE + <snipped> +COMMENT REVIEWED REFSEQ: This record has been curated by WormBase. The + reference sequence is identical to CCD61130. +FEATURES Location/Qualifiers + source 1..427 + /organism="Caenorhabditis elegans" + /strain="Bristol N2" + /db_xref="taxon:$SPECIES_ID" + /chromosome="II" + Protein 1..427 + /product="hypothetical protein" + /calculated_mol_wt=49887 + CDS 1..427 + /gene="2L52.1" + /locus_tag="CELE_2L52.1" + /standard_name="2L52.1a" + /coded_by="NM_061228.2:1..1284" + /note="Confirmed by transcript evidence" + /db_xref="EnsemblGenomes-Gn:WBGene00007063" + /db_xref="EnsemblGenomes-Tr:2L52.1a" + /db_xref="GeneID:181792" + /db_xref="GOA:A4F336" + /db_xref="InterPro:IPR013087" + /db_xref="UniProtKB/TrEMBL:A4F336" + /db_xref="WormBase:WBGene00007063" +ORIGIN + 1 msmvrnvsnq sekleilsck wvgclkstev fktveklldh vtadhipevi vnddgseevv + 61 cqwdccemga srgnlqkkke wmenhfktrh vrkakifkcl iedcpvvkss sqeiethlri + 121 shpinpkker lkefksstdh ieptqanrvw tivngevqwk tpprvkkktv iyyddgpryv + 181 fptgcarcny dsdeselesd efwsatemsd neevyvnfrg mncistgksa smvpskrrnw + 241 pkrvkkrlst qrnnqktirp pelnknniei kdmnsnnlee rnreeciqpv sveknilhfe + 301 kfksnqiciv rennkfregt rrrrknsges edlkihenft ekrrpirsck qnisfyemdg + 361 dieefevffd tptkskkvll diysakkmpk ievedslvnk fhskrpsrac rvlgsmeevp + 421 fdveigy +// +EOF + +my $refseq_mrna_elegans_record = <<EOF; +LOCUS NM_001313558 663 bp mRNA linear INV 19-AUG-2018 +DEFINITION Caenorhabditis elegans Uncharacterized protein (2L52.1), partial + mRNA. +ACCESSION NM_001313558 +VERSION NM_001313558.1 +DBLINK BioProject: PRJNA158 + BioSample: SAMEA3138177 +KEYWORDS RefSeq. +SOURCE Caenorhabditis elegans + ORGANISM Caenorhabditis elegans + Eukaryota; Metazoa; Ecdysozoa; Nematoda; Chromadorea; Rhabditida; + Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis. +REFERENCE 1 (bases 1 to 663) + <snipped> +COMMENT REVIEWED REFSEQ: This record has been curated by WormBase. This + record is derived from an annotated genomic sequence (NC_003280). + COMPLETENESS: incomplete on both ends. +FEATURES Location/Qualifiers + source 1..663 + /organism="Caenorhabditis elegans" + /mol_type="mRNA" + /strain="Bristol N2" + /db_xref="taxon:$SPECIES_ID" + /chromosome="II" + gene <1..>663 + /gene="2L52.1" + /locus_tag="CELE_2L52.1" + /db_xref="GeneID:181792" + /db_xref="WormBase:WBGene00007063" + CDS 1..663 + /gene="2L52.1" + /locus_tag="CELE_2L52.1" + /standard_name="2L52.1a" + /note="Confirmed by transcript evidence" + /codon_start=1 + /product="hypothetical protein" + /protein_id="NP_001300487.1" + /db_xref="EnsemblGenomes-Gn:WBGene00007063" + /db_xref="EnsemblGenomes-Tr:2L52.1b" + /db_xref="GeneID:181792" + /db_xref="UniProtKB/TrEMBL:A0A0K3AWR5" + /db_xref="WormBase:WBGene00007063" + /translation="MSDNEEVYVNFRGMNCISTGKSASMVPSKRRNWPKRVKKRLSTQ + RNNQKTIRPPELNKNNIEIKDMNSNNLEERNREECIQPVSVEKNILHFEKFKSNQICI + VRENNKFREGTRRRRKNSGESEDLKIHENFTEKRRPIRSCKQNISFYEMDGDIEEFEV + FFDTPTKSKKVLLDIYSAKKMPKIEVEDSLVNKFHSKRPSRACRVLGSMEEVPFDVEI + GY" +ORIGIN + 1 atgtcagata atgaagaagt atatgtgaac ttccgtggaa tgaactgtat ctcaacagga + 61 aagtcggcca gtatggtccc gagcaaacga agaaattggc caaaaagagt gaagaaaagg + 121 ctatcgacac aaagaaacaa tcagaaaact attcgaccac cagagctgaa taaaaataat + 181 atagagataa aagatatgaa ctcaaataac cttgaagaac gcaacagaga agaatgcatt + 241 cagcctgttt ctgttgaaaa gaacatcctg cattttgaaa aattcaaatc aaatcaaatt + 301 tgcattgttc gggaaaacaa taaatttaga gaaggaacga gaagacgcag aaagaattct + 361 ggtgaatcgg aagacttgaa aattcatgaa aactttactg aaaaacgaag acccattcga + 421 tcatgcaaac aaaatataag tttctatgaa atggacgggg atatagaaga atttgaagtg + 481 tttttcgata ctcccacaaa aagcaaaaaa gtacttctgg atatctacag tgcgaagaaa + 541 atgccaaaaa ttgaggttga agattcatta gttaataagt ttcattcaaa acgtccatca + 601 agagcatgtc gagttcttgg aagtatggaa gaagtaccat ttgatgtgga aataggatat + 661 tga +// +EOF +sub test_refseq { + my ($record, $type) = @_; + my $file = "something_that_says_$type"; + test_parser("XrefParser::RefSeqGPFFParser",$record, { + xref =>1, + primary_xref => 1, + }, "$type: Example record" , tmp_file_name => $file); + test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$record, { + }, "$type no entries without WormBase records" , tmp_file_name => $file); + + test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head, + $wormbase_celegans_xrefs_expected_count, "$type test again to set up the next test", + skip_clean => 1); + test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$record, { + %$wormbase_celegans_xrefs_expected_count, + xref => $wormbase_celegans_xrefs_expected_count->{xref}+1, + dependent_xref => 1, + }, "$type RefSeq entries hang off INSDC entries", tmp_file_name => $file); +} +test_refseq($refseq_protein_elegans_record, "protein"); +test_refseq($refseq_mrna_elegans_record, "mrna"); done_testing(); +