Commit 9756c1da authored by Hermann Zellner's avatar Hermann Zellner

Merge branch 'hz-SHMM-condition' into 'master'

Integrating PIRSR into UniFIRE

See merge request uniprot-public/unifire!2
parents 4509d76a bb5f3905
......@@ -55,6 +55,14 @@ $ ./distribution/bin/unifire.sh -r samples/unirule-urml-latest.xml -i samples/in
$ ./distribution/bin/unifire.sh -r samples/saas-urml-latest.xml -i samples/input_facts.xml -s XML -o ~/output_saas_annotations.csv
```
**Example with PIRSR rules & InterProScan XML input:**
``` bash
$ ./distribution/bin/pirsr.sh -i ./samples/pirsr_data/PIRSR-input-iprscan.xml -o ~/ -a <path-to-hmmalign-command> -d ./samples/pirsr_data
```
``` bash
$ ./distribution/bin/unifire.sh -r samples/unirule.pirsr-latest.xml -i ~/PIRSR-input-iprscan-urml.xml -s XML -t samples/pirsr_data/PIRSR_templates.xml -o ~/pirsr_unifire_annotation.csv
```
_Note_: With all rule systems, it is possible that a protein gets the exact same annotation from different rules due to overlap in condition spaces.
#### Options
......@@ -292,4 +300,4 @@ Please do not hesitate to raise new issues if you experience any bugs or you hav
## Contact
* **UniProt Help** - [help@uniprot.org](mailto:help@uniprot.org)
\ No newline at end of file
* **UniProt Help** - [help@uniprot.org](mailto:help@uniprot.org)
......@@ -16,6 +16,7 @@
# limitations under the License.
############################################################################
function backup_file {
filename=$1
if [[ -e ${filename} ]]; then
......@@ -42,9 +43,14 @@ popd > /dev/null
# Download the latest rule files into ${SCRIPT_DIR}/samples/.
echo "Downloading rule urml files..."
for file in saas-urml-latest.xml unirule-urml-latest.xml unirule-templates-latest.xml unirule.pirsr-latest.xml;
do
    # Hand any existing copy to backup_file before wget overwrites it.
    # The loop variable is ${file}; the former extra call with ${i} (a name this
    # loop never sets) pointed at the samples directory itself and was dropped.
    backup_file "${SCRIPT_DIR}/samples/${file}"
    # Quoted so that a checkout path containing whitespace is passed as one word.
    wget "${FTP_SRC}/${file}" -O "${SCRIPT_DIR}/samples/${file}"
done
echo "Done downloading rule urml files."
# Fetch the PIRSR data archive and unpack it into ${SCRIPT_DIR}/samples/.
PIRSR_DATA_SRC="https://proteininformationresource.org/pirsr/pirsr_data_latest.tar.gz"
echo "Download pirsr data files..."
# All expansions are quoted so that a checkout path containing whitespace
# is passed to each command as a single argument.
backup_file "${SCRIPT_DIR}/samples/pirsr_data_latest.tar.gz"
wget "${PIRSR_DATA_SRC}" -O "${SCRIPT_DIR}/samples/pirsr_data_latest.tar.gz"
tar zxvf "${SCRIPT_DIR}/samples/pirsr_data_latest.tar.gz" -C "${SCRIPT_DIR}/samples/"
echo "Done download pirsr data files."
package uk.ac.ebi.uniprot.urml.core.xml.writers;
import com.sun.xml.txw2.output.IndentingXMLStreamWriter;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import uk.ac.ebi.uniprot.urml.core.xml.schema.JAXBContextInitializationException;
import com.sun.xml.txw2.output.IndentingXMLStreamWriter;
import java.io.OutputStream;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Marshaller;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import java.io.OutputStream;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import static javax.xml.stream.XMLOutputFactory.newFactory;
......
......@@ -16,15 +16,15 @@
package uk.ac.ebi.uniprot.urml.core.xml.writers;
import com.sun.xml.txw2.output.IndentingXMLStreamWriter;
import org.apache.commons.lang3.ArrayUtils;
import org.uniprot.urml.facts.Fact;
import org.uniprot.urml.facts.FactSet;
import org.uniprot.urml.facts.ObjectFactory;
import org.w3c.dom.Document;
import uk.ac.ebi.uniprot.urml.core.xml.schema.URMLConstants;
import uk.ac.ebi.uniprot.urml.core.xml.schema.mappers.FactNamespaceMapper;
import com.sun.xml.txw2.output.IndentingXMLStreamWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.List;
import javax.xml.XMLConstants;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
......@@ -35,11 +35,11 @@ import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.List;
import org.apache.commons.lang3.ArrayUtils;
import org.uniprot.urml.facts.Fact;
import org.uniprot.urml.facts.FactSet;
import org.uniprot.urml.facts.ObjectFactory;
import org.w3c.dom.Document;
import static javax.xml.stream.XMLOutputFactory.newFactory;
import static uk.ac.ebi.uniprot.urml.core.xml.schema.URMLConstants.URML_FACT_NAMESPACE;
......
#!/usr/bin/env bash
############################################################################
# Copyright (c) 2018 European Molecular Biology Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
############################################################################
# Print the absolute directory that contains this script, following symbolic
# links so the result is the directory of the real file rather than of a link.
# Note: SOURCE and DIR are not declared local, so they are set in whatever
# shell the function runs in.
get_script_dir () {
SOURCE="${BASH_SOURCE[0]}"
# Keep resolving while $SOURCE is itself a symbolic link.
while [ -h "$SOURCE" ]; do
DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
SOURCE="$( readlink "$SOURCE" )"
# A relative link target is interpreted relative to the directory holding the link.
[[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE"
done
DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
echo "$DIR"
}
# Environment
# Minimum Java version; checkEnv compares the installed version against it.
JAVA_VERSION_MIN=1.8
# Directory containing this script; run uses it to build the Java classpath.
SCRIPT_DIR=$(get_script_dir)
# Default values
# JVM heap limit in megabytes (passed as -Xmx<value>M) when no -m value is given.
DEFAULT_MAX_MEMORY=4096
# Launch PIRSRApp with the arguments given to this function.
#
# The JVM heap limit (-Xmx, in megabytes) is taken from a "-m <number>" or
# "-m<number>" argument when one is present (the last one wins); otherwise
# DEFAULT_MAX_MEMORY is used. Every argument, including -m, is forwarded to the
# application unchanged.
function run {
    local memory=""
    local previous=""
    local arg
    # Inspect the arguments one by one instead of pattern-matching the joined
    # command line, so a value such as "my-m5-run.xml" is not mistaken for -m
    # and no GNU-only grep option is needed.
    for arg in "$@"; do
        if [[ "${previous}" == "-m" && "${arg}" =~ ^[0-9]+$ ]]; then
            memory="${arg}"
        elif [[ "${arg}" =~ ^-m([0-9]+)$ ]]; then
            memory="${BASH_REMATCH[1]}"
        fi
        previous="${arg}"
    done
    # "$@" keeps each argument intact, so paths containing spaces survive.
    java -Xmx"${memory:-$DEFAULT_MAX_MEMORY}M" -cp "${SCRIPT_DIR}/../target/*:${SCRIPT_DIR}/../target/dependency/*" org.proteininformationresource.pirsr.PIRSRApp "$@"
}
# Verify that a Java runtime is available and is at least JAVA_VERSION_MIN.
#
# Exits with status 1 when java is missing or too old. When the version string
# cannot be parsed, a warning is printed and the check is skipped rather than
# blocking the run.
function checkEnv {
    if ! command -v java > /dev/null 2>&1; then
        >&2 echo "Java was not found on the PATH."
        >&2 echo "Java version must be >=$JAVA_VERSION_MIN."
        exit 1
    fi

    # Take the quoted version from the first line of the form: ... version "<value>" ...
    # (other lines, e.g. JVM option notices, are ignored).
    local reported
    reported=$(java -version 2>&1 | sed -n -E 's/^.* version "([^"]+)".*$/\1/p' | head -n 1)

    # Legacy releases report "1.<major>.x" (e.g. 1.8.0_181); newer releases
    # report "<major>[.x]" (e.g. 11.0.2 or 17). Reduce both to the major number.
    local major="${reported#1.}"
    major="${major%%[^0-9]*}"
    local required="${JAVA_VERSION_MIN#1.}"
    required="${required%%[^0-9]*}"

    if [[ -z "${major}" || -z "${required}" ]]; then
        >&2 echo "Warning: could not determine the Java version; skipping the version check."
        return 0
    fi

    if [ "${major}" -lt "${required}" ]
    then
        >&2 echo "Java version must be >=$JAVA_VERSION_MIN."
        exit 1
    fi
}
# Script entry point: validate the environment first, then start the
# application with exactly the arguments this function received.
main () {
    checkEnv
    run "$@"
}
main "${@}"
package org.proteininformationresource.pirsr;
/**
* A class for PIRSR Info
*
* @author Chuming Chen
*
*/
public class PIRSR {
private String ruleAC;
private String trigger;
private String srhmmAC;
private String templateAC;
private String templateSeq;
/**
* @param ruleAC
* @param trigger
* @param srhmmAC
* @param templateAC
* @param templateSeq
*/
public PIRSR(String ruleAC, String trigger, String srhmmAC, String templateAC, String templateSeq) {
super();
this.ruleAC = ruleAC;
this.trigger = trigger;
this.srhmmAC = srhmmAC;
this.templateAC = templateAC;
this.templateSeq = templateSeq;
}
/* (non-Javadoc)
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
return "PIRSRInfo [ruleAC=" + ruleAC + ", trigger=" + trigger + ", srhmmAC=" + srhmmAC + ", templateAC=" + templateAC + ", templateSeq=" + templateSeq
+ "]";
}
/**
* @return the ruleAC
*/
public String getRuleAC() {
return ruleAC;
}
/**
* @param ruleAC the ruleAC to set
*/
public void setRuleAC(String ruleAC) {
this.ruleAC = ruleAC;
}
/**
* @return the trigger
*/
public String getTrigger() {
return trigger;
}
/**
* @param trigger the trigger to set
*/
public void setTrigger(String trigger) {
this.trigger = trigger;
}
/**
* @return the srhmmAC
*/
public String getSrhmmAC() {
return srhmmAC;
}
/**
* @param srhmmAC the srhmmAC to set
*/
public void setSrhmmAC(String srhmmAC) {
this.srhmmAC = srhmmAC;
}
/**
* @return the templateAC
*/
public String getTemplateAC() {
return templateAC;
}
/**
* @param templateAC the templateAC to set
*/
public void setTemplateAC(String templateAC) {
this.templateAC = templateAC;
}
/**
* @return the templateSeq
*/
public String getTemplateSeq() {
return templateSeq;
}
/**
* @param templateSeq the templateSeq to set
*/
public void setTemplateSeq(String templateSeq) {
this.templateSeq = templateSeq;
}
}
package org.proteininformationresource.pirsr;
import static com.google.common.primitives.Booleans.trueFirst;
import static java.lang.System.exit;
import java.io.File;
import java.util.Comparator;
import java.util.function.Function;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.MissingArgumentException;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.drools.core.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Strings;
import uk.ac.ebi.uniprot.urml.input.InputType;
/**
 * Entry point for the PIRSR application.
 *
 * <p>Parses the command line (input file, PIRSR data directory, output
 * directory, path to the hmmalign command), builds a {@link PIRSRRunner} from
 * those values and runs it. With {@code -h}/{@code --help}, or when the
 * arguments are invalid, the usage text is printed and the JVM exits.
 *
 * @author Chuming Chen
 *
 */
public class PIRSRApp {
    private static final Logger logger = LoggerFactory.getLogger(PIRSRApp.class);
    private static final InputType DEFAULT_INPUT_TYPE = InputType.INTERPROSCAN_XML;
    private static final String USAGE_SEPARATOR = StringUtils.repeat("-", 100);

    /**
     * Parses the arguments and launches the runner.
     *
     * <p>Exits with status 0 after printing the usage when help is requested,
     * and with status 1 (after logging the problem and printing the usage)
     * when the arguments cannot be parsed or validated.
     *
     * @param args the command-line arguments
     * @throws Exception if help detection fails to parse or the runner fails
     */
    public static void main(String[] args) throws Exception {
        Options options = new Options();
        Option inputFileOption = Option.builder("i").longOpt("input_file").hasArg().argName("INPUT_FILE")
                .desc("Input file (path) containing the proteins to annotate and required data in " + DEFAULT_INPUT_TYPE.getDescription() + " format.")
                .type(File.class).required().build();
        Option pirsrDataDirOption = Option.builder("d").longOpt("pirsr_data_dir").hasArg().argName("PIRSR_DATA_DIR").desc("Directory for PIRSR data.")
                .type(File.class).required().build();
        Option outputDirOption = Option.builder("o").longOpt("output_dir").hasArg().argName("OUTPUT_DIR")
                .desc("Directory for SRHMM hmmalign result and enhanced IPRScan Facts XML file.").type(File.class).required().build();
        Option hmmalignOption = Option.builder("a").longOpt("hmmalign").hasArg().argName("HMMALIGN").desc("Path to hmmalign command.").type(File.class)
                .required().build();
        Option helpOption = Option.builder("h").longOpt("help").desc("Print this usage.").build();

        options.addOption(pirsrDataDirOption);
        options.addOption(inputFileOption);
        options.addOption(hmmalignOption);
        options.addOption(outputDirOption);
        options.addOption(helpOption);

        // Help is checked before the real parse so that it works even when
        // required options are missing.
        if (hasHelp(helpOption, args)) {
            displayUsage(options);
            exit(0);
            return;
        }

        PIRSRRunner pirsrRunner = null;
        try {
            CommandLineParser parser = new DefaultParser();
            CommandLine cmd = parser.parse(options, args);
            // The output directory is only wrapped in a File; the other three
            // paths must already exist.
            File outputDirectory = parseOption(cmd, outputDirOption, File::new, null);
            File pirsrDataDirectory = parseOption(cmd, pirsrDataDirOption, FileCreatorChecker::createAndCheck, null);
            File inputFactFile = parseOption(cmd, inputFileOption, FileCreatorChecker::createAndCheck, null);
            File hmmalignCommand = parseOption(cmd, hmmalignOption, FileCreatorChecker::createAndCheck, null);
            pirsrRunner = new PIRSRRunner(pirsrDataDirectory, inputFactFile, outputDirectory, hmmalignCommand);
        } catch (Exception e) {
            logger.error(e.getMessage());
            // Keep the full cause available for troubleshooting without
            // cluttering the normal usage output.
            logger.debug("Could not set up the PIRSR runner from the command line", e);
            displayUsage(options);
            exit(1);
            // Never fall through to run() with an uninitialised runner.
            return;
        }
        pirsrRunner.run();
    }

    /**
     * Prints the usage text, listing required options first, then options
     * taking an argument, then options with a long name, then by short name.
     *
     * @param options the options to describe
     */
    private static void displayUsage(Options options) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.setOptionComparator(Comparator.comparing(Option::isRequired, trueFirst()).thenComparing(Option::hasArg, trueFirst())
                .thenComparing(Option::hasLongOpt, trueFirst()).thenComparing(Option::getOpt));
        formatter.setWidth(100);
        formatter.setDescPadding(5);
        formatter.setLeftPadding(5);
        formatter.printHelp("pirsr", USAGE_SEPARATOR, options, USAGE_SEPARATOR, true);
    }

    /**
     * Tells whether the help option was requested anywhere on the command line.
     *
     * <p>The arguments are first scanned for the exact short or long flag. A
     * parser that knows only the help option stops at the first token it does
     * not recognise, so on its own it would miss a help flag placed after any
     * other option. The parser is still consulted afterwards so that the
     * spellings it accepts in leading position keep working.
     *
     * @param help the help option
     * @param args the raw command-line arguments
     * @return {@code true} if help was requested
     * @throws ParseException if the fallback parse fails
     */
    private static boolean hasHelp(final Option help, final String[] args) throws ParseException {
        final String shortFlag = "-" + help.getOpt();
        final String longFlag = "--" + help.getLongOpt();
        for (String arg : args) {
            if (shortFlag.equals(arg) || longFlag.equals(arg)) {
                return true;
            }
        }

        Options options = new Options();
        options.addOption(help);
        CommandLineParser parser = new DefaultParser();
        CommandLine cmd = parser.parse(options, args, true);
        return cmd.hasOption(help.getOpt());
    }

    /**
     * Converts the value of an option with the given creator.
     *
     * @param commandLine the parsed command line
     * @param option the option to read
     * @param creator converts the option's string value into the result
     * @param defaultObject returned when the option is absent
     * @param <T> the result type
     * @return the converted value, or {@code defaultObject} if the option is absent
     * @throws MissingArgumentException if the option is present with a null or empty value
     * @throws IllegalArgumentException if the creator rejects the value (cause preserved)
     */
    private static <T> T parseOption(CommandLine commandLine, Option option, Function<String, T> creator, T defaultObject)
            throws MissingArgumentException {
        if (!commandLine.hasOption(option.getOpt())) {
            return defaultObject;
        }
        String optionValue = commandLine.getOptionValue(option.getOpt());
        if (Strings.isNullOrEmpty(optionValue)) {
            throw new MissingArgumentException(option);
        }
        try {
            return creator.apply(optionValue);
        } catch (Exception e) {
            throw new IllegalArgumentException(String.format("Wrong argument for option -%s. %s",
                    option.getOpt(), e.getMessage()), e);
        }
    }

    /**
     * Creates {@link File} objects for paths that must already exist.
     */
    private static class FileCreatorChecker {
        /**
         * @param path the path of a file or directory
         * @return the corresponding {@link File}
         * @throws IllegalArgumentException if nothing exists at {@code path}
         */
        static File createAndCheck(String path) {
            File file = new File(path);
            if (!file.exists()) {
                throw new IllegalArgumentException(String.format("No such file or directory: %s", file));
            }
            return file;
        }
    }
}
package org.proteininformationresource.pirsr;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Collections;
import javax.xml.bind.JAXBException;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.uniprot.urml.facts.Fact;
import org.uniprot.urml.facts.FactSet;
import org.uniprot.urml.facts.PositionalProteinSignature;
import org.uniprot.urml.facts.Protein;
import org.uniprot.urml.facts.ProteinSignature;
import org.uniprot.urml.facts.SequenceAlignment;
import org.uniprot.urml.facts.Signature;
import org.uniprot.urml.facts.SignatureType;
import uk.ac.ebi.uniprot.urml.core.xml.writers.URMLFactWriter;
import uk.ac.ebi.uniprot.urml.input.InputType;
import uk.ac.ebi.uniprot.urml.input.parsers.FactSetParser;
/**
* Runner to launch the PIRSR application
*
* @author Chuming Chen
*/
public class PIRSRRunner {
private static final Logger logger = LoggerFactory.getLogger(PIRSRRunner.class);
private final File pirsrDataDirectory;
private final File inputFactFile;
private final File outputDirectory;
private final File hmmalignCommand;
/**
 * Stores the four locations the runner works with, then logs them through
 * {@code logArguments()}.
 *
 * @param pirsrDataDirectory directory holding the PIRSR data
 * @param inputFactFile input fact file (parsed as InterProScan XML by this class)
 * @param outputDirectory directory that receives the generated files
 * @param hmmalignCommand the hmmalign command
 */
public PIRSRRunner(File pirsrDataDirectory, File inputFactFile, File outputDirectory, File hmmalignCommand) {
    this.inputFactFile = inputFactFile;
    this.pirsrDataDirectory = pirsrDataDirectory;
    this.hmmalignCommand = hmmalignCommand;
    this.outputDirectory = outputDirectory;
    // All fields are assigned before the arguments are logged.
    logArguments();
}
/**
 * Runs the whole PIRSR step, logging progress along the way:
 * <ol>
 * <li>loads the PIRSR entries from the data directory and indexes them with
 * {@code getTriggerMap};</li>
 * <li>collects the triggered proteins from the input file;</li>
 * <li>calls {@code createFasta} for them and passes its result to
 * {@code runHMMAlign};</li>
 * <li>collects the matched protein facts and adds them to the fact file.</li>
 * </ol>
 * The final log line reports the output file: the input file name with its
 * {@code .xml} ending replaced by {@code -urml.xml}, inside the output directory.
 *
 * @throws IOException declared for the helper steps
 * @throws JAXBException declared for the helper steps
 */
public void run() throws IOException, JAXBException {
    final Set<PIRSR> allEntries = getPIRSRInfo(pirsrDataDirectory);
    final Map<String, Set<PIRSR>> entriesByTrigger = getTriggerMap(allEntries);

    logger.info("Collecting triggered proteins from InterProScan XML file...");
    final Map<Protein, Set<PIRSR>> triggered = getTriggerProteins(entriesByTrigger);
    logger.info("Done collecting triggered proteins from InterProScan XML file.");

    final Set<PIRSR> entriesToAlign = createFasta(triggered);

    logger.info("Running hmmalign of triggered proteins against SRHMM...");
    runHMMAlign(entriesToAlign);
    logger.info("Done running hmmalign of matched proteins against SRHMM.");

    final Map<Protein, Set<PositionalProteinSignature>> signaturesByProtein = getMatchedProteinFacts(triggered);

    logger.info("Adding SRHMM signatures to InterProScan XML file...");
    addPositionalProteinSignature(signaturesByProtein);
    logger.info("Done adding SRHMM signatures to InterProScan XML file.");

    final String enhancedFileName = this.inputFactFile.getName().replaceAll("(?i).xml$", "-urml.xml");
    logger.info("The enhanced InterProScan XML file is at \"" + this.outputDirectory + "/"
            + enhancedFileName + "\"");
}
/**
 * Writes an enhanced copy of the input fact file to the output directory.
 *
 * <p>Every fact set of the input file is read; the positional signatures
 * registered for its protein (if any) are appended to its facts; and all facts
 * are then written as one fact set. The output file is named after the input
 * file with its {@code .xml} ending replaced by {@code -urml.xml}.
 *
 * <p>Failures are logged and not propagated. Both file streams are closed
 * whether or not an error occurs.
 *
 * @param matchedProteinFacts the signatures to add, keyed by protein
 */
private void addPositionalProteinSignature(Map<Protein, Set<PositionalProteinSignature>> matchedProteinFacts) {
    String outFile = this.inputFactFile.getName().replaceAll("(?i).xml$", "-urml.xml");
    String outPath = this.outputDirectory + "/" + outFile;
    // try-with-resources releases both file handles on every exit path.
    try (InputStream factInputStream = new FileInputStream(this.inputFactFile);
            OutputStream outputStream = new FileOutputStream(outPath)) {
        // NOTE(review): if URMLFactWriter offers close()/flush(), it should be
        // called before the stream is closed -- confirm against its API.
        URMLFactWriter factWriter = new URMLFactWriter(outputStream);
        List<Fact> allFacts = new ArrayList<>();
        Iterator<FactSet> factSetIterator = FactSetParser.of(InputType.INTERPROSCAN_XML).parse(factInputStream);
        while (factSetIterator.hasNext()) {
            List<Fact> facts = new ArrayList<>(factSetIterator.next().getFact());
            // The last Protein fact of the set identifies the set's protein.
            Protein protein = null;
            for (Fact fact : facts) {
                if (fact instanceof Protein) {
                    protein = (Protein) fact;
                }
            }
            facts.addAll(matchedProteinFacts.getOrDefault(protein, Collections.emptySet()));
            allFacts.addAll(facts);
        }
        FactSet updatedFactSet = new FactSet();
        updatedFactSet.setFact(allFacts);
        factWriter.write(updatedFactSet);
    } catch (IOException | JAXBException | XMLStreamException e) {
        // FileNotFoundException is an IOException and is handled here as well.
        logger.error("Could not write the enhanced fact file \"" + outPath + "\"", e);
    }
}
/**
 * Scans the input fact file and collects, for each protein, the PIRSR entries
 * triggered by its signatures.
 *
 * <p>Within a fact set, each {@link ProteinSignature} whose signature value is
 * a key of {@code pirsrTriggerMap} contributes the mapped entries to the most
 * recently seen {@link Protein} of that set. The input stream is closed when
 * the scan ends, including on failure.
 *
 * <p>NOTE(review): a signature that appears before any Protein fact of its set
 * is recorded under a {@code null} key -- confirm the parser always emits the
 * Protein first.
 *
 * @param pirsrTriggerMap PIRSR entries keyed by the signature value that triggers them
 * @return the triggered PIRSR entries per protein; empty if nothing matched
 * @throws FileNotFoundException if the input fact file cannot be opened
 * @throws IOException if closing the input stream fails
 */
private Map<Protein, Set<PIRSR>> getTriggerProteins(Map<String, Set<PIRSR>> pirsrTriggerMap) throws FileNotFoundException, IOException {
    Map<Protein, Set<PIRSR>> triggeredProteins = new HashMap<>();
    // try-with-resources releases the file handle on every exit path.
    try (InputStream factInputStream = new FileInputStream(this.inputFactFile)) {
        Iterator<FactSet> factSetIterator = FactSetParser.of(InputType.INTERPROSCAN_XML).parse(factInputStream);
        while (factSetIterator.hasNext()) {
            Protein protein = null;
            for (Fact fact : factSetIterator.next().getFact()) {
                if (fact instanceof Protein) {
                    protein = (Protein) fact;
                }
                if (fact instanceof ProteinSignature) {
                    String signature = ((ProteinSignature) fact).getSignature().getValue();
                    Set<PIRSR> triggered = pirsrTriggerMap.get(signature);
                    if (triggered != null) {
                        // Create the protein's set on first use, then merge.
                        triggeredProteins.computeIfAbsent(protein, key -> new HashSet<>()).addAll(triggered);
                    }
                }
            }
        }
    }
    return triggeredProteins;
}
private Map<Protein, Set<PositionalProteinSignature>> getMatchedProteinFacts(Map<Protein, Set<PIRSR>> triggeredProteins) {
Set<PIRSR> matchedPIRSRs = new HashSet<PIRSR>();
for (Set<PIRSR> prisrs : triggeredProteins.values()) {
matchedPIRSRs.addAll(prisrs);
}
List<Protein> matchedProteins = new ArrayList<Protein>(triggeredProteins.keySet());
Map<String, Protein> proteinMap = new HashMap<String, Protein>();
for (Protein protein : matchedProteins) {
proteinMap.put(protein.getId(), protein);
}
Map<Protein, Set<PositionalProteinSignature>> matchedProteinFacts = new HashMap<Protein, Set<PositionalProteinSignature>>();
for (PIRSR pirsr : matchedPIRSRs) {
String alignOutFile = this.outputDirectory + "/aln/" + pirsr.getRuleAC() + ".aln";
File f = new File(alignOutFile);
Map<String, String> acToAlignment = new HashMap<String, String>();
List<String> lines;
try {
lines = FileUtils.readLines(f, "UTF-8");
for (String line : lines) {
line = line.trim();
if (!(line.startsWith("#") || line.length() == 0 || line.startsWith("//"))) {
String[] aln = line.split("\\s+");
String alignment = acToAlignment.get(aln[0]);
if (alignment == null) {
alignment = "";
}
alignment += aln[1];
alignment = alignment.replaceAll("\\.", "");
acToAlignment.put(aln[0], alignment);
}
}
for (String ac : acToAlignment.keySet()) {
String srhmmAlign = acToAlignment.get(ac);
PositionalProteinSignature pps = createPositionalProteinSignature(proteinMap.get(ac), pirsr, 1, proteinMap.get(ac).getSequence().getLength(), srhmmAlign);
Set<PositionalProteinSignature> signatures = matchedProteinFacts.get(proteinMap.get(ac));
if (signatures == null) {