Commit 3e5822bf authored by Hermann Zellner's avatar Hermann Zellner

TRM-23751

- Download latest rules from ARBA instead of SAAS
- Use ARBA rules instead of SAAS rules in unifire-workflow.sh
parent 94e48ba4
# UniFIRE Project
UniFIRE (The UniProt Functional annotation Inference Rule Engine) is an engine to execute rules in the UniProt Rule
Markup Language (URML) format. It can be used to execute the UniProt annotation rules (UniRule and SAAS).
Markup Language (URML) format. It can be used to execute the UniProt annotation rules (UniRule and ARBA).
This project is a work in progress, open for collaboration.
......@@ -90,12 +90,12 @@ This is a simple example, which shows how to use the UniFIRE Docker image to run
```
This command will use as input the file samples/proteins.fasta which is in multi-FASTA format with the header in
the format as described above. It will run the whole UniFIRE workflow to predict functional annotations from UniRule
and SAAS rules. The resulting functional predictions will be written into these files in the current working
and ARBA rules. The resulting functional predictions will be written into these files in the current working
directory:
```
predictions_unirule.out
predictions_unirule-pirsr.out
predictions_saas.out
predictions_arba.out
```
### Runtime
......@@ -133,7 +133,7 @@ $ <Path to UniFIRE parent folder>/build.sh
Depending on the speed of your internet connection, it will take a few minutes to download all dependencies through
maven. You will require in total ~500 MB disk space in the UniFIRE folder and in your local maven cache. The script also
downloads the latest UniRule, UniRule-PIRSR and SAAS rules in URML format and UniRule template
downloads the latest UniRule, UniRule-PIRSR and ARBA rules in URML format and UniRule template
alignments in fact XML format from the EBI FTP into the samples/ folder. Additionally it downloads data necessary to run
UniRule-PIRSR rules from https://proteininformationresource.org/pirsr/pirsr_data_latest.tar.gz and places them
underneath the folder samples/pirsr_data.
......@@ -151,9 +151,10 @@ $ ./distribution/bin/unifire.sh -r samples/unirule-urml-latest.xml -i samples/in
*Note: To be able to predict the UniRule positional annotations, a template file is provided (`samples/unirule-templates-2018_05.xml`) (optional).*
<br/>
**Example with SAAS rules & Fact XML input:**
**Example with ARBA rules & Fact XML input:**
``` bash
$ ./distribution/bin/unifire.sh -r samples/saas-urml-latest.xml -i samples/input_facts.xml -s XML -o output_saas_annotations.csv
$ ./distribution/bin/unifire.sh -r samples/arba-urml-latest.xml -i samples/input_facts.xml -s XML -o output_arba_annotations.csv
```
<br/>
......@@ -195,7 +196,7 @@ usage: unifire -i <INPUT_FILE> -o <OUTPUT_FILE> -r <RULE_URML_FILE> [-f <OUTPUT_
-o,--output <OUTPUT_FILE> Output file (path) containing predictions in the format
specified in the -f option.
-r,--rules <RULE_URML_FILE> Rule base file (path) provided by UniProt (e.g UniRule
or SAAS) (format: URML).
or ARBA) (format: URML).
-f,--output-format <OUTPUT_FORMAT> Output file format. Supported formats are:
- TSV (Tab-Separated Values)
- XML (URML Fact XML)
......
......@@ -41,7 +41,7 @@ popd > /dev/null
echo "Downloading rule urml files..."
for file in saas-urml-latest.xml unirule-urml-latest.xml unirule-templates-latest.xml unirule.pirsr-urml-latest.xml;
for file in arba-urml-latest.xml unirule-urml-latest.xml unirule-templates-latest.xml unirule.pirsr-urml-latest.xml;
do
backup_file ${SCRIPT_DIR}/samples/${file}
wget ${FTP_SRC}/${file} -O ${SCRIPT_DIR}/samples/${file}
......
......@@ -51,7 +51,7 @@ public class UniFireApp {
Options options = new Options();
Option ruleFileOption = Option.builder("r").longOpt("rules").hasArg().argName("RULE_URML_FILE")
.desc("Rule base file (path) provided by UniProt (e.g UniRule or SAAS) (format: URML).")
.desc("Rule base file (path) provided by UniProt (e.g UniRule or ARBA) (format: URML).")
.type(File.class).required().build();
Option inputFileOption = Option.builder("i").longOpt("input").hasArg().argName("INPUT_FILE")
.desc("Input file (path) containing the proteins to annotate and required data, in the format specified by the -s option.")
......
......@@ -40,7 +40,7 @@ outdir=""
workdir=""
cleanworkdir=0
docker_version="2020.3"
predictionfiles="predictions_unirule.out predictions_saas.out predictions_unirule-pirsr.out"
predictionfiles="predictions_unirule.out predictions_arba.out predictions_unirule-pirsr.out"
while getopts "i:o:w:c:v:" optionName
do
......
......@@ -25,13 +25,13 @@ DOWNLOAD_FOLDER="/opt/download"
mkdir -p ${DOWNLOAD_FOLDER}
cd ${DOWNLOAD_FOLDER}
echo "Downloading InterProScan..."
wget -q ftp://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.44-79.0/interproscan-5.44-79.0-64-bit.tar.gz
wget -q ftp://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.45-80.0/interproscan-5.45-80.0-64-bit.tar.gz
echo "Done."
wget -q ftp://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.44-79.0/interproscan-5.44-79.0-64-bit.tar.gz.md5
wget -q ftp://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.45-80.0/interproscan-5.45-80.0-64-bit.tar.gz.md5
ipr_check=`md5sum -c interproscan-5.44-79.0-64-bit.tar.gz.md5`
ipr_check=`md5sum -c interproscan-5.45-80.0-64-bit.tar.gz.md5`
if [[ ${ipr_check} != "interproscan-5.44-79.0-64-bit.tar.gz: OK" ]]
if [[ ${ipr_check} != "interproscan-5.45-80.0-64-bit.tar.gz: OK" ]]
then
exit 11
fi
......@@ -39,7 +39,7 @@ fi
mkdir -p ${ROOT_FOLDER}
cd ${ROOT_FOLDER}
echo "Extracting InterProScan..."
tar -pxzf ${DOWNLOAD_FOLDER}/interproscan-5.44-79.0-64-bit.tar.gz
tar -pxzf ${DOWNLOAD_FOLDER}/interproscan-5.45-80.0-64-bit.tar.gz
echo "Done."
cd ${DOWNLOAD_FOLDER}
......@@ -54,13 +54,13 @@ then
exit 12
fi
cd ${ROOT_FOLDER}/interproscan-5.44-79.0/data
cd ${ROOT_FOLDER}/interproscan-5.45-80.0/data
echo "Extracting Panther data..."
tar -pxzf ${DOWNLOAD_FOLDER}/panther-data-14.1.tar.gz
echo "Done."
# Clean up tar to reduce the size of the image
rm -f ${DOWNLOAD_FOLDER}/interproscan-5.44-79.0-64-bit.tar.gz
rm -f ${DOWNLOAD_FOLDER}/interproscan-5.44-79.0-64-bit.tar.gz.md5
rm -f ${DOWNLOAD_FOLDER}/interproscan-5.45-80.0-64-bit.tar.gz
rm -f ${DOWNLOAD_FOLDER}/interproscan-5.45-80.0-64-bit.tar.gz.md5
rm -f ${DOWNLOAD_FOLDER}/panther-data-14.1.tar.gz
rm -f ${DOWNLOAD_FOLDER}/panther-data-14.1.tar.gz.md5
......@@ -37,9 +37,9 @@ ${UNIFIRE_REPO}/distribution/bin/unifire.sh -r ${UNIFIRE_REPO}/samples/unirule-u
-i ${VOLUME}/proteins_lineage-ipr.xml -t ${UNIFIRE_REPO}/samples/unirule-templates-latest.xml \
-o ${VOLUME}/predictions_unirule.out
${UNIFIRE_REPO}/distribution/bin/unifire.sh -r ${UNIFIRE_REPO}/samples/saas-urml-latest.xml \
${UNIFIRE_REPO}/distribution/bin/unifire.sh -r ${UNIFIRE_REPO}/samples/arba-urml-latest.xml \
-i ${VOLUME}/proteins_lineage-ipr.xml \
-o ${VOLUME}/predictions_saas.out
-o ${VOLUME}/predictions_arba.out
${UNIFIRE_REPO}/distribution/bin/unifire.sh -n 100 -r ${UNIFIRE_REPO}/samples/unirule.pirsr-urml-latest.xml \
-i ${VOLUME}/proteins_lineage-ipr-urml.xml -s XML -t ${UNIFIRE_REPO}/samples/pirsr_data/PIRSR_templates.xml \
......@@ -48,7 +48,7 @@ ${UNIFIRE_REPO}/distribution/bin/unifire.sh -n 100 -r ${UNIFIRE_REPO}/samples/un
# prediction output files must belong to the same user and group as proteins.fasta input file
ownership=`stat -c "%u:%g" ${VOLUME}/proteins.fasta`
for outfile in proteins_lineage.fasta proteins_lineage-ipr.xml proteins_lineage-ipr-urml.xml predictions_unirule.out \
predictions_saas.out predictions_unirule-pirsr.out seq aln
predictions_arba.out predictions_unirule-pirsr.out seq aln
do
chown -R ${ownership} ${VOLUME}/${outfile}
done
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment