Skip to content
Snippets Groups Projects
Commit 7d6346f7 authored by Wojtek Bazant's avatar Wojtek Bazant
Browse files

C. elegans specific parsing of RefSeq_dna file

- New xref: to a WormBase CDS feature
- Modify WormbaseCElegansRefSeqGPFFParser to serve both kinds of files
- extract a utility method from RefSeqGPFFParser
- xref_config.ini stanza for wormbase_cds
- tests for new functionality
parent d66449b6
No related branches found
No related tags found
6 merge requests!296C. elegans references use WormBase mapping to INSDC protein ids,!317Xref parser mgiparser_ccds,!342Feature/schema update 96,!342Feature/schema update 96,!317Xref parser mgiparser_ccds,!296C. elegans references use WormBase mapping to INSDC protein ids
......@@ -164,24 +164,8 @@ sub create_xrefs {
local $/ = "\/\/\n";
my $type;
if ($file =~ /protein/) {
$type = 'peptide';
} elsif ($file =~ /rna/) {
$type = 'dna';
} elsif($file =~ /RefSeq_protein/){
$type = 'peptide';
}else{
print STDERR "Could not work out sequence type for $file\n";
return;
}
my $type = $self->type_from_file($file);
return unless $type;
while ( $_ = $refseq_io->getline() ) {
......@@ -206,6 +190,14 @@ sub create_xrefs {
return \@xrefs;
}
sub type_from_file {
my ($self, $file) = @_;
return 'peptide' if $file =~ /RefSeq_protein/;
return 'dna' if $file =~ /rna/;
return 'peptide' if $file =~ /protein/;
print STDERR "Could not work out sequence type for $file\n";
return undef;
}
sub xref_from_record {
my ( $self, $entry, $name2species_id, $taxonomy2species_id,
$pred_mrna_source_id, $pred_ncrna_source_id,
......
......@@ -19,13 +19,14 @@ limitations under the License.
package XrefParser::WormbaseCElegansBase;
sub swap_dependency {
my ($self, $source_id, $dbi, $xref, @source_ids_skip) = @_;
my ($self, $source_ids, $dbi, $xref, @source_ids_skip) = @_;
$source_ids = [$source_ids] unless ref $source_ids eq 'ARRAY';
my @matching_source_id_dependents;
my @other_dependents;
for my $dependent_xref (@{$xref->{DEPENDENT_XREFS} || []}){
my $source_id_here = $dependent_xref->{SOURCE_ID};
if($source_id_here eq $source_id
if(grep {$_ == $source_id_here } @$source_ids
and $self->get_xref($dependent_xref->{ACCESSION}, $dependent_xref->{SOURCE_ID}, $xref->{SPECIES_ID})){
$dependent_xref->{SPECIES_ID} = $xref->{SPECIES_ID};
push @matching_source_id_dependents, $dependent_xref;
......@@ -35,10 +36,22 @@ sub swap_dependency {
push @other_dependents, $dependent_xref;
}
}
return map {{%$_, LABEL=>undef, INFO_TYPE => "MISC", DEPENDENT_XREFS => [{
%$xref,
INFO_TYPE => "DEPENDENT",
LINKAGE_SOURCE_ID => $source_id,
}, map {{%$_,INFO_TYPE => "DEPENDENT", LINKAGE_SOURCE_ID => $source_id}} @other_dependents]}} @matching_source_id_dependents;
my @result;
for my $matching_source_id_dependent (@matching_source_id_dependents) {
my $source_id = $matching_source_id_dependent->{SOURCE_ID};
my $xref_as_dependent_here = {
%$xref, INFO_TYPE => "DEPENDENT",
LINKAGE_SOURCE_ID => $source_id,
DEPENDENT_XREFS => undef,
};
my @other_dependents_as_dependents_here = map {{%$_,INFO_TYPE => "DEPENDENT", LINKAGE_SOURCE_ID => $source_id}} @other_dependents;
push @result, {
%$matching_source_id_dependent,
LABEL=>undef, INFO_TYPE => "MISC",
DEPENDENT_XREFS => [$xref_as_dependent_here, @other_dependents_as_dependents_here]
};
}
return @result;
}
1;
......@@ -17,33 +17,51 @@ limitations under the License.
=cut
package XrefParser::WormbaseCElegansRefSeqGPFFParser;
use parent XrefParser::WormbaseCElegansBase, XrefParser::RefSeqGPFFParser;
my $source_id;
use strict;
use parent qw/XrefParser::WormbaseCElegansBase XrefParser::RefSeqGPFFParser/;
my $SOURCE_IDS;
my $PATTERN;
sub run {
my ($self, $arg_ref) = @_;
my $type = $self->type_from_file(@{$arg_ref->{files}});
if($type eq 'peptide'){
$SOURCE_IDS = [ $self->get_source_id_for_source_name('protein_id') ];
$PATTERN = qr/This record has been curated by WormBase. The\s+reference sequence is identical to (.*?)\./;
} elsif ($type eq 'dna'){
$SOURCE_IDS = [
$self->get_source_id_for_source_name('wormbase_cds'),
$self->get_source_id_for_source_name('wormbase_transcript'),
];
$PATTERN = qr/standard_name="(.*?)"/;
}
die %$arg_ref unless @$SOURCE_IDS;
return $self->SUPER::run($arg_ref);
}
sub upload_xref_object_graphs {
my ($self, $xrefs, $dbi) = @_;
$source_id //= $self->get_source_id_for_source_name('protein_id');
my @adapted_xrefs;
for my $xref ( @$xrefs) {
push @adapted_xrefs, $self->swap_dependency($source_id, $dbi, $xref);
push @adapted_xrefs, $self->swap_dependency($SOURCE_IDS, $dbi, $xref, @$SOURCE_IDS);
}
return $self->SUPER::upload_xref_object_graphs(\@adapted_xrefs, $dbi);
}
sub xref_from_record {
my ($self, $entry, @args) = @_;
my $xref = $self->SUPER::xref_from_record($entry, @args);
$source_id //= $self->get_source_id_for_source_name('protein_id');
$entry =~ /This record has been curated by WormBase. The\s+reference sequence is identical to (.*?)\./;
my $insdc_protein_id = $1;
if($insdc_protein_id) {
$xref->{DEPENDENT_XREFS} //= [];
push @{$xref->{DEPENDENT_XREFS}}, {ACCESSION => $insdc_protein_id, SOURCE_ID=>$source_id};
return $xref;
} else {
return undef;
}
return &modify_xref_with_dependent(
$SOURCE_IDS, $entry,
$self->SUPER::xref_from_record($entry, @args),
$PATTERN,
);
}
sub modify_xref_with_dependent {
my ($source_ids, $entry, $xref, $pattern) = @_;
return unless $xref;
return unless $entry =~ $pattern;
return unless $1;
$xref->{DEPENDENT_XREFS} //= [];
push @{$xref->{DEPENDENT_XREFS}}, map {{ACCESSION => $1, SOURCE_ID=>$_}} @$source_ids;
return $xref;
}
1;
......@@ -40,7 +40,7 @@ sub run {
}
my $file = @{$files}[0];
my @fields = qw/wormbase_gene wormbase_gseqname wormbase_locus wormbase_transcript wormpep_id protein_id/;
my @fields = qw/wormbase_gene wormbase_gseqname wormbase_locus wormbase_transcript wormbase_cds wormpep_id protein_id/;
my %src_ids;
my $sth = $self->dbi()->prepare("SELECT xref_id FROM xref WHERE accession=? AND source_id=? AND species_id=$species_id");
for my $field (@fields){
......@@ -65,6 +65,10 @@ sub run {
$sth, $species_id, "transcript", $src_ids{wormbase_transcript},
$transcript->{transcript_id}, $transcript->{transcript_id}
);
$self->add_xref_and_direct_xref(
$sth, $species_id, "transcript", $src_ids{wormbase_cds},
$transcript->{wormbase_cds}, $transcript->{wormbase_cds}, $transcript->{transcript_id}
);
$self->add_xref_and_direct_xref(
$sth, $species_id, "translation", $src_ids{wormpep_id},
$transcript->{wormpep_id}, $transcript->{wormpep_id}, $transcript->{transcript_id}
......@@ -90,11 +94,13 @@ sub get_data {
while ( $_ = $pep_io->getline() ) {
next if /^\/\//;
my ($gseqid, $wbgeneid, $locus, $wbtranscript, $wormpep, $insdc_parent, $insdc_locus_tag, $protein_id, $uniprot_id) = split(/\t/, $_);
$data->{$wbgeneid}->{transcripts} //=[];
push @{$data->{$wbgeneid}->{transcripts}}, {
transcript_id => $wbtranscript,
($wormpep eq '.' ? () : (wormpep_id => $wormpep)),
($protein_id eq '.' ? () : (protein_id => $protein_id)),
($wormpep ne '.' && $wbtranscript =~ /^(.*?)(\.\d+)?$/ ? (wormbase_cds => $1 ) : ()),
($wormpep ne '.' ? (wormpep_id => $wormpep) : ()),
($protein_id ne '.' ? (protein_id => $protein_id) : ()),
};
$data->{$wbgeneid}->{wormbase_gseqname} = $gseqid;
$data->{$wbgeneid}->{wormbase_locus} = $locus if $locus ne '.';
......
......@@ -1652,6 +1652,14 @@ order = 50
priority = 1
parser = comes from WormbaseDirectParser
[source wormbase_cds::wormbase]
# Used by wormbase core species
name = wormbase_cds
download = N
order = 50
priority = 1
parser = comes from WormbaseDirectParser
[source Gramene_Pathway::arabidopsis_thaliana]
# Used by Arabidopsis thaliana, Gramene-specific
name = Gramene_Pathway
......
......@@ -69,7 +69,7 @@ sub test_parser {
my ($parser, $content, $expected, $test_name, %opts) = @_;
require_ok($parser);
$parser->new($database)->run({
files => [store_in_temporary_file($content, %opts)],
files =>[store_in_temporary_file($content, %opts)],
source_id => "Source id (unused but sometimes required)",
species_id => $SPECIES_ID,
species => $SPECIES_NAME,
......@@ -104,17 +104,18 @@ my $wormbase_celegans_xrefs_head= <<EOF;
// Missing or not applicable data (e.g. protein identifiers for non-coding RNAs) is denoted by a "."
//
2L52.1 WBGene00007063 . 2L52.1b CE50569 BX284602 CELE_2L52.1 CTQ86426 A0A0K3AWR5
2L52.1 WBGene00007063 . 2L52.1a CE32090 BX284602 CELE_2L52.1 CCD61130 A4F336
2L52.1 WBGene00007063 . 2L52.1a.1 CE32090 BX284602 CELE_2L52.1 CCD61130 A4F336
2L52.1 WBGene00007063 . 2L52.1a.2 CE32090 BX284602 CELE_2L52.1 CCD61130 A4F336
2L52.2 WBGene00200402 . 2L52.2 . BX284602 CELE_2L52.2 . .
EOF
my $wormbase_celegans_xrefs_expected_count = {
xref=>11,
xref=>14,
gene_direct_xref => 4,
transcript_direct_xref => 3,
translation_direct_xref => 4,
transcript_direct_xref => 7,
translation_direct_xref => 6,
};
test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,
$wormbase_celegans_xrefs_expected_count, "Direct xrefs: genes: columns 1,2,3, transcripts: column 4, translations: column 5 and 7. xrefs: sum of these "
$wormbase_celegans_xrefs_expected_count, "Direct xrefs: genes: columns 1,2,3, transcripts: column 4 once as transcript and also once as CDS for coding genes, translations: column 5 and 7. xrefs: sum of these minus one reused CDS and two reused proteins"
);
my $uniprot_elegans_record = <<EOF;
......@@ -185,32 +186,47 @@ for my $l (@recognised_sources) {
primary_xref => 1,
dependent_xref => 3,
}, "Pick up as extra xref + dependent xref: $l" );
}
test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record, {
}, "No UniProt entries without corresponding INSDC entries");
test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,
$wormbase_celegans_xrefs_expected_count, "Test again to set up the next test",
skip_clean => 1);
}
sub test_elegans_uniprot {
my ($expected_count, $extra_line) = @_;
my $uniprot_elegans_record_here = $uniprot_elegans_record;
$uniprot_elegans_record_here =~ s/DR(.*?)\n/DR$1\nDR $extra_line/ if $extra_line;
test_parser("XrefParser::WormbaseCElegansUniProtParser",
$uniprot_elegans_record_here,
{},
"No UniProt entries without corresponding INSDC entries $extra_line"
);
test_parser("XrefParser::WormbaseDirectParser",
$wormbase_celegans_xrefs_head,
$wormbase_celegans_xrefs_expected_count,
"Test again to set up the next test",
skip_clean => 1,
);
test_parser("XrefParser::WormbaseCElegansUniProtParser",
$uniprot_elegans_record_here,
$expected_count,
"Correct xrefs and dependent xrefs $extra_line",
);
}
my $wormbase_and_uniprot_expected_count = {
%$wormbase_celegans_xrefs_expected_count,
xref => $wormbase_celegans_xrefs_expected_count->{xref}+1,
dependent_xref => 1 #protein id still there, no parent sequence ID
xref => $wormbase_celegans_xrefs_expected_count->{xref}+1,
dependent_xref => 1 #protein id still there, no parent sequence ID
};
test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record,
$wormbase_and_uniprot_expected_count, "Get counts");
test_elegans_uniprot($wormbase_and_uniprot_expected_count, "");
for my $l (@recognised_sources) {
(my $uniprot_elegans_record_extra_line = $uniprot_elegans_record) =~ s/DR(.*?)\n/DR$1\nDR $l/;
test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,
$wormbase_celegans_xrefs_expected_count, "Test again to set up the next test",
skip_clean => 1);
test_parser("XrefParser::WormbaseCElegansUniProtParser", $uniprot_elegans_record_extra_line, {
test_elegans_uniprot({
%$wormbase_and_uniprot_expected_count,
xref => $wormbase_and_uniprot_expected_count->{xref}+1,
dependent_xref => $wormbase_and_uniprot_expected_count->{dependent_xref}+1,
}, "Pick up as extra xref + dependent xref: $l" );
},$l);
}
test_elegans_uniprot({
%$wormbase_and_uniprot_expected_count,
dependent_xref => $wormbase_and_uniprot_expected_count->{dependent_xref}+1,
}, "EMBL; BX284602; CCD61130.1; -; Genomic_DNA",
);
my $refseq_protein_elegans_record = <<EOF;
LOCUS NP_493629 427 aa linear INV 19-AUG-2018
DEFINITION Uncharacterized protein CELE_2L52.1 [Caenorhabditis elegans].
......@@ -261,21 +277,91 @@ ORIGIN
421 fdveigy
//
EOF
test_parser("XrefParser::RefSeqGPFFParser",$refseq_protein_elegans_record, {
xref =>1,
primary_xref => 1,
}, "Example RefSeq protein record" , tmp_file_name => "something_that_says_protein");
test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$refseq_protein_elegans_record, {
}, "No entries without WormBase records" , tmp_file_name => "something_that_says_protein");
test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,
$wormbase_celegans_xrefs_expected_count, "Test again to set up the next test",
skip_clean => 1);
test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$refseq_protein_elegans_record, {
%$wormbase_celegans_xrefs_expected_count,
xref => $wormbase_celegans_xrefs_expected_count->{xref}+1,
dependent_xref => 1,
}, "RefSeq entries hang off INSDC entries", tmp_file_name => "something_that_says_protein");
my $refseq_mrna_elegans_record = <<EOF;
LOCUS NM_001313558 663 bp mRNA linear INV 19-AUG-2018
DEFINITION Caenorhabditis elegans Uncharacterized protein (2L52.1), partial
mRNA.
ACCESSION NM_001313558
VERSION NM_001313558.1
DBLINK BioProject: PRJNA158
BioSample: SAMEA3138177
KEYWORDS RefSeq.
SOURCE Caenorhabditis elegans
ORGANISM Caenorhabditis elegans
Eukaryota; Metazoa; Ecdysozoa; Nematoda; Chromadorea; Rhabditida;
Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis.
REFERENCE 1 (bases 1 to 663)
<snipped>
COMMENT REVIEWED REFSEQ: This record has been curated by WormBase. This
record is derived from an annotated genomic sequence (NC_003280).
COMPLETENESS: incomplete on both ends.
FEATURES Location/Qualifiers
source 1..663
/organism="Caenorhabditis elegans"
/mol_type="mRNA"
/strain="Bristol N2"
/db_xref="taxon:$SPECIES_ID"
/chromosome="II"
gene <1..>663
/gene="2L52.1"
/locus_tag="CELE_2L52.1"
/db_xref="GeneID:181792"
/db_xref="WormBase:WBGene00007063"
CDS 1..663
/gene="2L52.1"
/locus_tag="CELE_2L52.1"
/standard_name="2L52.1a"
/note="Confirmed by transcript evidence"
/codon_start=1
/product="hypothetical protein"
/protein_id="NP_001300487.1"
/db_xref="EnsemblGenomes-Gn:WBGene00007063"
/db_xref="EnsemblGenomes-Tr:2L52.1b"
/db_xref="GeneID:181792"
/db_xref="UniProtKB/TrEMBL:A0A0K3AWR5"
/db_xref="WormBase:WBGene00007063"
/translation="MSDNEEVYVNFRGMNCISTGKSASMVPSKRRNWPKRVKKRLSTQ
RNNQKTIRPPELNKNNIEIKDMNSNNLEERNREECIQPVSVEKNILHFEKFKSNQICI
VRENNKFREGTRRRRKNSGESEDLKIHENFTEKRRPIRSCKQNISFYEMDGDIEEFEV
FFDTPTKSKKVLLDIYSAKKMPKIEVEDSLVNKFHSKRPSRACRVLGSMEEVPFDVEI
GY"
ORIGIN
1 atgtcagata atgaagaagt atatgtgaac ttccgtggaa tgaactgtat ctcaacagga
61 aagtcggcca gtatggtccc gagcaaacga agaaattggc caaaaagagt gaagaaaagg
121 ctatcgacac aaagaaacaa tcagaaaact attcgaccac cagagctgaa taaaaataat
181 atagagataa aagatatgaa ctcaaataac cttgaagaac gcaacagaga agaatgcatt
241 cagcctgttt ctgttgaaaa gaacatcctg cattttgaaa aattcaaatc aaatcaaatt
301 tgcattgttc gggaaaacaa taaatttaga gaaggaacga gaagacgcag aaagaattct
361 ggtgaatcgg aagacttgaa aattcatgaa aactttactg aaaaacgaag acccattcga
421 tcatgcaaac aaaatataag tttctatgaa atggacgggg atatagaaga atttgaagtg
481 tttttcgata ctcccacaaa aagcaaaaaa gtacttctgg atatctacag tgcgaagaaa
541 atgccaaaaa ttgaggttga agattcatta gttaataagt ttcattcaaa acgtccatca
601 agagcatgtc gagttcttgg aagtatggaa gaagtaccat ttgatgtgga aataggatat
661 tga
//
EOF
sub test_refseq {
my ($record, $type) = @_;
my $file = "something_that_says_$type";
test_parser("XrefParser::RefSeqGPFFParser",$record, {
xref =>1,
primary_xref => 1,
}, "$type: Example record" , tmp_file_name => $file);
test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$record, {
}, "$type no entries without WormBase records" , tmp_file_name => $file);
test_parser("XrefParser::WormbaseDirectParser", $wormbase_celegans_xrefs_head,
$wormbase_celegans_xrefs_expected_count, "$type test again to set up the next test",
skip_clean => 1);
test_parser("XrefParser::WormbaseCElegansRefSeqGPFFParser",$record, {
%$wormbase_celegans_xrefs_expected_count,
xref => $wormbase_celegans_xrefs_expected_count->{xref}+1,
dependent_xref => 1,
}, "$type RefSeq entries hang off INSDC entries", tmp_file_name => $file);
}
test_refseq($refseq_protein_elegans_record, "protein");
test_refseq($refseq_mrna_elegans_record, "mrna");
done_testing();
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment