Commit b0c75fee authored by Hermann Zellner's avatar Hermann Zellner

Merge branch 'TRM-23844_MultipleXrefsPerProtein' into 'master'

Trm 23844 multiple xrefs per protein

See merge request uniprot-public/unifire!5
parents ed7aa4e8 08891b95
......@@ -16,8 +16,8 @@
package uk.ac.ebi.uniprot.urml.input.parsers.interpro.xml;
import uk.ac.ebi.interpro.scan.model.*;
import uk.ac.ebi.interpro.scan.model.Protein;
import uk.ac.ebi.interpro.scan.model.*;
import uk.ac.ebi.uniprot.urml.input.parsers.fasta.header.FastaHeaderData;
import uk.ac.ebi.uniprot.urml.input.parsers.fasta.header.FastaHeaderParser;
......@@ -25,8 +25,8 @@ import java.util.*;
import org.apache.commons.collections.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.uniprot.urml.facts.*;
import org.uniprot.urml.facts.Signature;
import org.uniprot.urml.facts.*;
/**
* Iterates over {@link Protein} instances and converts each of them to {@link org.uniprot.urml.facts.FactSet}.
......@@ -35,63 +35,67 @@ import org.uniprot.urml.facts.Signature;
*/
public class InterProXmlProteinConverter implements Iterator<FactSet>{
// NOTE(review): the two logger declarations below differ only in modifier order; in this rendered
// diff they look like the superseded and the replacement version of one and the same line.
private final static Logger logger = LoggerFactory.getLogger(InterProXmlProteinConverter.class);
private static final Logger logger = LoggerFactory.getLogger(InterProXmlProteinConverter.class);
// Iterator over the InterProScan proteins taken from the ProteinMatchesHolder given to the constructor.
private final Iterator<Protein> sourceIterator;
// Organisms keyed by a String; created empty in the constructor (its use lies outside this excerpt).
private final Map<String, Organism> organismMap;
// Parser applied to each xref name to obtain the FASTA header data.
private final FastaHeaderParser uniProtFastaHeaderParser;
// Built fact sets waiting to be handed out by next().
private final Queue<FactSet> factSetQueue;
/**
 * Creates a converter over the proteins of the given holder. The organism map and the queue of
 * pending fact sets both start out empty.
 *
 * @param proteinMatches holder supplying the proteins to convert
 */
public InterProXmlProteinConverter(ProteinMatchesHolder proteinMatches){
    Iterator<Protein> proteins = proteinMatches.getProteins().iterator();
    sourceIterator = proteins;
    organismMap = new HashMap<>();
    uniProtFastaHeaderParser = new FastaHeaderParser();
    factSetQueue = new LinkedList<>();
}
/**
 * Reports whether another {@link FactSet} can be returned by {@link #next()}.
 * NOTE(review): this excerpt is a rendered diff without +/- markers, so the two return statements
 * below appear to be the superseded and the replacement version of one line, not consecutive code.
 *
 * @return {@code true} when a further fact set is available
 */
@Override
public boolean hasNext() {
// Presumably the superseded form: only the source iterator is consulted.
return sourceIterator.hasNext();
// Presumably the replacement form: also true while already-built fact sets remain queued.
return !factSetQueue.isEmpty() || sourceIterator.hasNext();
}
/**
 * Returns the next {@link FactSet}.
 * NOTE(review): this excerpt is a rendered diff without +/- markers, so superseded and replacement
 * lines are interleaved; the comments below state which reading each line seems to belong to.
 *
 * @return the next converted fact set
 * @throws NoSuchElementException when nothing is left to return
 */
@Override
public FactSet next() {
// Presumably the superseded guard: exhaustion is decided by the source iterator alone.
if (!sourceIterator.hasNext()){
// Presumably the replacement guard: exhausted only when the queue is empty as well.
if (factSetQueue.isEmpty() && !sourceIterator.hasNext()){
throw new NoSuchElementException();
} else {
// Presumably superseded: the conversion result was returned directly.
return convertProteinMatches(sourceIterator.next());
// Convert the next source protein only when no queued fact set is left.
if (factSetQueue.isEmpty()) {
convertProteinMatches(sourceIterator.next());
}
// Hand out the fact set at the head of the queue.
return factSetQueue.poll();
}
}
/**
 * Converts one InterProScan protein into URML facts.
 * NOTE(review): this excerpt is a rendered diff without +/- markers, so two versions of the method
 * are interleaved below. One reading (returning {@code FactSet}) takes only the first xref, logs a
 * warning when further xrefs exist and returns a single built fact set. The other reading (returning
 * {@code void}) loops over every xref and adds one built fact set per xref to {@code factSetQueue}.
 * Confirm against the repository before relying on either reading.
 *
 * @param ipsProtein the parsed InterProScan protein whose cross references and matches are read
 * @throws InterProScanXmlFormatException when the protein has no cross reference
 */
private FactSet convertProteinMatches(Protein ipsProtein){
FactSet.Builder<Void> factSetBuilder = FactSet.builder();
org.uniprot.urml.facts.Protein protein;
org.uniprot.urml.facts.Protein.Builder proteinBuilder = org.uniprot.urml.facts.Protein.builder();
if (ipsProtein.getCrossReferences().size() > 0) {
private void convertProteinMatches(Protein ipsProtein){
if (!ipsProtein.getCrossReferences().isEmpty()) {
Iterator<ProteinXref> xrefIterator = ipsProtein.getCrossReferences().iterator();
ProteinXref proteinXref = xrefIterator.next();
// In the looping reading, a fresh fact set builder and protein builder are created per xref.
while (xrefIterator.hasNext()) {
ProteinXref proteinXref = xrefIterator.next();
FastaHeaderData fastaHeaderData = uniProtFastaHeaderParser.parse(proteinXref.getName());
FactSet.Builder<Void> factSetBuilder = FactSet.builder();
FastaHeaderData fastaHeaderData = uniProtFastaHeaderParser.parse(proteinXref.getName());
org.uniprot.urml.facts.Protein.Builder<Void> proteinBuilder = org.uniprot.urml.facts.Protein.builder();
buildProtein(proteinBuilder, fastaHeaderData, ipsProtein);
buildGeneInformation(proteinBuilder, fastaHeaderData);
Organism organism = buildOrganism(factSetBuilder, fastaHeaderData);
buildProtein(proteinBuilder, fastaHeaderData, ipsProtein);
buildGeneInformation(proteinBuilder, fastaHeaderData);
Organism organism = buildOrganism(factSetBuilder, fastaHeaderData);
proteinBuilder.withOrganism(organism);
org.uniprot.urml.facts.Protein protein = proteinBuilder.build();
factSetBuilder.addFact(protein);
proteinBuilder.withOrganism(organism);
protein = proteinBuilder.build();
factSetBuilder.addFact(protein);
// Every match of the InterProScan protein is processed against the protein built for this xref.
for (Match match : ipsProtein.getMatches()) {
buildProteinSignature(factSetBuilder, protein, match);
}
if (xrefIterator.hasNext()) {
logger.warn("Unexpected: more than one xref for ipsProtein {}", proteinXref.getIdentifier());
factSetQueue.add(factSetBuilder.build());
}
} else {
// A protein without any xref cannot be converted; the md5 is reported to identify it.
throw new InterProScanXmlFormatException(
String.format("Missing xref tag for ipsProtein md5=%s", ipsProtein.getMd5()));
}
for (Match match : ipsProtein.getMatches()) {
buildProteinSignature(factSetBuilder, protein, match);
}
return factSetBuilder.build();
}
private void buildProteinSignature(FactSet.Builder<Void> factSetBuilder, org.uniprot.urml.facts.Protein protein,
......
......@@ -17,13 +17,16 @@
package uk.ac.ebi.uniprot.urml.input.parsers.interpro.xml;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.junit.jupiter.api.Test;
import org.uniprot.urml.facts.*;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.core.Is.is;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
......@@ -38,12 +41,12 @@ public class InterProXmlProteinParserTest {
private static final String BASE_PATH = "/samples/interpro.xml/";
@Test
public void parse() throws Exception {
public void parseOneProteinOneXref() throws Exception {
InputStream interproXmlIS = getClass().getResourceAsStream(BASE_PATH+"one_protein_matches.xml");
InterProXmlProteinParser interProXmlProteinParser = new InterProXmlProteinParser();
Iterator<FactSet> parsedFactSet = interProXmlProteinParser.parse(interproXmlIS);
new ConvertedInterProDataChecker() {
ConvertedInterProDataChecker checker = new ConvertedInterProDataChecker() {
protected int expectedNumberOfProteins() {
return 1;
}
......@@ -81,40 +84,121 @@ public class InterProXmlProteinParserTest {
}
}
}.check(parsedFactSet);
};
int proteinCounter = 0;
while (parsedFactSet.hasNext()) {
FactSet factSet = parsedFactSet.next();
for (Fact fact : factSet.getFact()) {
if (fact instanceof Protein) {
++proteinCounter;
}
}
checker.check(factSet);
}
assertThat(proteinCounter, is(1));
}
/**
 * Parses a sample in which one protein sequence carries two xrefs. Every returned fact set is run
 * through the checker, and two {@link Protein} facts are expected in total across all fact sets.
 *
 * @throws Exception if the sample resource cannot be read or parsed
 */
@Test
public void parseOneProteinTwoXrefs() throws Exception {
    // try-with-resources closes the sample stream; all iteration over the parsed data happens inside.
    try (InputStream interproXmlIS = getClass().getResourceAsStream(BASE_PATH+"one_protein_two_xrefs.xml")) {
        InterProXmlProteinParser interProXmlProteinParser = new InterProXmlProteinParser();
        Iterator<FactSet> parsedFactSet = interProXmlProteinParser.parse(interproXmlIS);

        ConvertedInterProDataChecker checker = new ConvertedInterProDataChecker() {
            // There are 4 signature matches for each protein, so the protein name is checked 5 times
            // per protein: once for the Protein fact and 4 times for its ProteinSignatures.
            private final List<String> expectedProteinNames = Arrays.asList(
                    "A0A6B9WDM2","A0A6B9WDM2","A0A6B9WDM2","A0A6B9WDM2","A0A6B9WDM2",
                    "A0A6C0TBF5", "A0A6C0TBF5", "A0A6C0TBF5", "A0A6C0TBF5", "A0A6C0TBF5");

            protected int expectedNumberOfProteins() {
                return 1;
            }

            protected int expectedNumberOfProteinSignatures() {
                return 4; // 2 libraries + 2 IPR equivalents
            }

            protected void checkProtein(Protein protein) {
                // proteinCounter is the checker's own running counter, not the local defined further down.
                assertThat(protein.getId(), equalTo(expectedProteinNames.get(proteinCounter)));
                assertThat(protein.getSequence().getLength(), equalTo(222));
                assertThat(protein.getSequence().getValue().length(), equalTo(222));
                assertFalse(protein.getSequence().getIsFragment());
            }

            protected void checkOrganism(Organism organism) {
                assertTrue(organism.isSetId());
                assertThat(organism.getLineage().getIds(), containsInAnyOrder(1,10239,2559587,76804,2499399,11118,2501931,694002,2509511,694009,2697049));
                assertThat(organism.getScientificName(), equalTo("Severe acute respiratory syndrome coronavirus 2"));
            }

            protected void checkProteinSignature(ProteinSignature proteinSignature) {
                assertTrue(proteinSignature.isSetSignature() && proteinSignature.getSignature().isSetValue());
                switch (proteinSignature.getSignature().getValue()){
                    case "PF01635":
                        checkInterProSignatureType(proteinSignature, SignatureType.PFAM);
                        assertThat(proteinSignature.getFrequency(), equalTo(1));
                        break;
                    case "IPR002574":
                        checkInterProSignatureType(proteinSignature, SignatureType.PFAM);
                        checkInterProSignatureType(proteinSignature, SignatureType.HAMAP);
                        assertThat(proteinSignature.getFrequency(), equalTo(1));
                        break;
                    case "MF_04202":
                        checkInterProSignatureType(proteinSignature, SignatureType.HAMAP);
                        assertThat(proteinSignature.getFrequency(), equalTo(1));
                        break;
                    default:
                        fail("Unexpected protein signature"+proteinSignature);
                }
            }
        };

        // Count the Protein facts over all fact sets while validating each fact set on its own.
        int proteinCounter = 0;
        while (parsedFactSet.hasNext()) {
            FactSet factSet = parsedFactSet.next();
            for (Fact fact : factSet.getFact()) {
                if (fact instanceof Protein) {
                    ++proteinCounter;
                }
            }
            checker.check(factSet);
        }
        assertThat(proteinCounter, is(2));
    }
}
private abstract class ConvertedInterProDataChecker {
private int proteinCounter = 0;
protected int proteinCounter = 0;
private int organismCounter = 0;
private int proteinSignatureCounter = 0;
void check(Iterator<FactSet> data){
while (data.hasNext()){
FactSet factSet = data.next();
for (Fact fact : factSet.getFact()) {
if (fact instanceof Organism) {
checkOrganism((Organism) fact);
organismCounter++;
} else if (fact instanceof ProteinSignature) {
ProteinSignature proteinSignature = (ProteinSignature) fact;
checkProteinSignature(proteinSignature);
checkProtein((Protein) proteinSignature.getProtein());
proteinSignatureCounter++;
} else if (fact instanceof Protein){
Protein protein = (Protein) fact;
checkProtein(protein);
checkOrganism(protein.getOrganism());
proteinCounter++;
} else {
fail("Unexpected fact type: "+fact);
}
void check(FactSet factSet){
organismCounter = 0;
proteinSignatureCounter = 0;
for (Fact fact : factSet.getFact()) {
if (fact instanceof Organism) {
checkOrganism((Organism) fact);
organismCounter++;
} else if (fact instanceof ProteinSignature) {
ProteinSignature proteinSignature = (ProteinSignature) fact;
checkProteinSignature(proteinSignature);
checkProtein((Protein) proteinSignature.getProtein());
proteinSignatureCounter++;
proteinCounter++;
} else if (fact instanceof Protein){
Protein protein = (Protein) fact;
checkProtein(protein);
checkOrganism(protein.getOrganism());
proteinCounter++;
} else {
fail("Unexpected fact type: "+fact);
}
assertThat(organismCounter, equalTo(1));
assertThat(proteinSignatureCounter, equalTo(expectedNumberOfProteinSignatures()));
}
assertThat(proteinCounter, equalTo(expectedNumberOfProteins()));
assertThat(organismCounter, equalTo(organismCounter));
assertThat(proteinSignatureCounter, equalTo(expectedNumberOfProteinSignatures()));
}
protected abstract int expectedNumberOfProteins();
......
<?xml version="1.0" encoding="UTF-8"?><protein-matches xmlns="http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5" interProScanVersion="5.41-78.0">
<protein>
<sequence md5="1cd6abff79ad3633e17582eb0e576539">MADSNGTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFLYIIKLIFLWLLWPVTLACFVLAAVYRINWITGGIAIAMACLVGLMWLSYFIASFRLFARTRSMWSFNPETNILLNVPLHGTILTRPLLESELVIGAVILRGHLRIAGHHLGRCDIKDLPKEITVATSRTLSYYKLGASQRVAGDSGFAAYSRYRIGNYKLNTDHSSSSDNIALLVQ</sequence>
<xref id="tr|A0A6B9WDM2|A0A6B9WDM2_SARS2" name="tr|A0A6B9WDM2|A0A6B9WDM2_SARS2 Membrane protein OS=Severe acute respiratory syndrome coronavirus 2 OX=1,10239,2559587,76804,2499399,11118,2501931,694002,2509511,694009,2697049 GN=M PE=2 SV=1"/>
<xref id="tr|A0A6C0TBF5|A0A6C0TBF5_SARS2" name="tr|A0A6C0TBF5|A0A6C0TBF5_SARS2 Membrane glycoprotein OS=Severe acute respiratory syndrome coronavirus 2 OX=1,10239,2559587,76804,2499399,11118,2501931,694002,2509511,694009,2697049 GN=M PE=2 SV=1"/>
<matches>
<hmmer3-match evalue="1.3E-95" score="319.0">
<signature ac="PF01635" desc="Coronavirus M matrix/glycoprotein" name="Corona_M">
<entry ac="IPR002574" desc="Coronavirus M matrix/glycoprotein" name="Corona_M" type="FAMILY">
<go-xref category="BIOLOGICAL_PROCESS" db="GO" id="GO:0019058" name="viral life cycle"/>
</entry>
<signature-library-release library="PFAM" version="32.0"/>
</signature>
<model-ac>PF01635</model-ac>
<locations>
<hmmer3-location env-end="221" env-start="4" post-processed="true" score="318.8" evalue="1.4E-95" hmm-start="1" hmm-end="220" hmm-length="220" hmm-bounds="COMPLETE" start="4" end="221">
<location-fragments>
<hmmer3-location-fragment start="4" end="221" dc-status="CONTINUOUS"/>
</location-fragments>
</hmmer3-location>
</locations>
</hmmer3-match>
<profilescan-match>
<signature ac="MF_04202" desc="Membrane protein [M]." name="BETA_CORONA_M">
<entry ac="IPR002574" desc="Coronavirus M matrix/glycoprotein" name="Corona_M" type="FAMILY">
<go-xref category="BIOLOGICAL_PROCESS" db="GO" id="GO:0019058" name="viral life cycle"/>
</entry>
<signature-library-release library="HAMAP" version="2020_01"/>
</signature>
<model-ac>MF_04202</model-ac>
<locations>
<profilescan-location score="43.942" start="1" end="222">
<location-fragments>
<profilescan-location-fragment start="1" end="222" dc-status="CONTINUOUS"/>
</location-fragments>
<alignment>MADSNgTITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFLYIIKLIFLWLLWPVTLACFVLAAVYRINWITGGIAIAMACLVGLMWLSYFIASFRLFARTRSMWSFNPETNILLNVPLHGTILTRPLLESELVIGAVILRGHLRIAGHHLGRCDIKDLPKEITVATSRTLSYYKLGASQRVAGDSGFAAYSRYRIGNYKLNTdHSSSSDNIALLVQ</alignment>
</profilescan-location>
</locations>
</profilescan-match>
</matches>
</protein>
</protein-matches>
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment