Commit 45c90bad authored by Mathias Walzer's avatar Mathias Walzer
Browse files

pushing 1st version after restructure from gitlab

parent a6c32aaa
# Pipeline containers
To have the required containers available, they must be built locally. For distribution license reasons this repository does not contain the containers themselves, only the recipes to build them easily on your machine (requires [singularity](https://sylabs.io/docs/)). Examples can be found in [build_local.sh](build_local.sh).
For the containers found on DockerHub, it is simply:
```
singularity build upstream/openswath:0.1.2.simg docker://openswath/openswath:0.1.2
singularity build upstream/pyprophet_2.1.3.simg docker://pyprophet/pyprophet:2.1.3
```
For a specific container that is not on DockerHub, change into the directory of the chosen container, which contains a `*.sdef` file. From there, it is for example:
```
singularity build downstream/pridereswath-downstream.simg python-r:bioc-py3-tinytex-qc_plus_msstats.sdef
```
__Note:__ After containers are built, the workflow [parameter file](../inputs/configs/pridereswath_v1.1.yaml) needs to be adjusted to reflect the location of the respective containers.
#!/usr/bin/env bash
# Build all singularity containers needed by the pipeline.
# NOTE(review): paths appear to be relative to the repository root — run from there.
# upstream
# These two images exist on DockerHub and are pulled directly.
singularity build upstream/openswath:0.1.2.simg docker://openswath/openswath:0.1.2
singularity build upstream/pyprophet_2.1.3.simg docker://pyprophet/pyprophet:2.1.3
# The remaining upstream containers are built from local *.sdef recipes.
singularity build upstream/sciex_converter_prideswath.simg container/upstream/wiffconverter/sciex_converter_prideswath.sdef
singularity build upstream/pridereswath.simg container/upstream/pridereswath/pridereswath-upstream.sdef
# NOTE(review): the recipe path below contains a colon ("yamato:v1.0.4...") —
# confirm such a file/directory actually exists in container/upstream/.
singularity build upstream/yamato:v1.0.4_pridereswath.simg container/upstream/yamato:v1.0.4_pridereswath.sdef
# downstream
# Alternative route via docker builds (kept for reference):
#sudo docker build -t python-r:bioc-py3-tinytex-qc_plus_msstats .
#sudo singularity build python-r:bioc-py3-tinytex-qc_plus_msstats.simg docker-daemon://python-r:bioc-py3-tinytex-qc_plus_msstats
# direct builds:
singularity build downstream/pridereswath-downstream.simg downstream/python-r:bioc-py3-tinytex-qc_plus_msstats.sdef
# postprocess
# see downstream
# Use library() rather than require(): require() only returns FALSE when a
# package is missing, letting the script continue and fail later with a
# confusing error; library() stops immediately.
library(tidyverse)
library(stringr)
# Median-centre LogIntensities within each run.
#
# Adds a NormLogIntensities column (LogIntensities minus the per-run median)
# and returns the input otherwise unchanged.
#
# @param data data frame with columns originalRUN and LogIntensities.
# @param ...  unused, kept for call-site compatibility with the other *_norm helpers.
# @return `data` with an additional NormLogIntensities column.
run_median_norm <- function(data, ...) {
  data %>%
    dplyr::group_by(originalRUN) %>%
    # Only the per-run median is needed; the original also computed run_sd and
    # run_count, which were dropped again unused — removed here.
    dplyr::mutate(run_mdn = median(LogIntensities)) %>%
    dplyr::mutate(NormLogIntensities = LogIntensities - run_mdn) %>%
    dplyr::ungroup() %>%
    dplyr::select(-run_mdn)
}
# Quantile-normalise LogIntensities across runs.
#
# Builds a Protein x Run matrix, replaces each value with the mean of all
# values sharing its rank across runs, and joins the normalised values back
# onto the input rows as NormLogIntensities. Proteins missing in any run are
# excluded from the normalisation by drop_na() below.
#
# @param data data frame with columns Protein, originalRUN, LogIntensities.
# @return `data` (inner-joined, so only complete proteins) plus NormLogIntensities.
run_quantile_norm <- function(data,... ){
# Wide matrix: one row per Protein, one column per run.
matrix <- data %>% dplyr::select(Protein, originalRUN, LogIntensities) %>%
tidyr::spread(originalRUN, LogIntensities) %>%
column_to_rownames("Protein") %>%
drop_na() # alt. 0 imputation
# substitution function (according to rank)
# NOTE(review): ranks use ties.method="average", which yields fractional
# ranks for ties (e.g. 2.5); indexing m[2.5] truncates to m[2] instead of
# interpolating between m[2] and m[3] — confirm this is acceptable.
index_lookup <- function(i, m){
return(m[i])
}
data_rank <- map_df(matrix,rank,ties.method="average") # 1. rank
data_sorted <- map_df(matrix,sort) # 2. sort
data_mean <- rowMeans(data_sorted) # 3. mean
data_norm <- map_df(data_rank, index_lookup, m=data_mean) # 4. substitute
# Re-attach protein names (map_df drops rownames) and melt to long format.
final <- data_norm %>% #bind_cols(row.names(matrix),.) %>% rename(Protein = ...1) %>%
bind_cols(matrix %>% rownames_to_column("Protein") %>% dplyr::select(Protein), . ) %>%
tidyr::gather(key= "originalRUN", value= "NormLogIntensities", -Protein)
# Join normalised values back onto the original rows via a Protein_run key.
return(data %>% # dplyr::select(-NormLogIntensities) %>% # in case
dplyr::mutate(mergecol = paste(Protein,originalRUN,sep="_")) %>%
dplyr::inner_join(final %>%
dplyr::mutate(mergecol= paste(Protein,originalRUN, sep="_")) %>%
dplyr::select(-originalRUN,-Protein),
by= "mergecol") %>%
dplyr::select(-mergecol)
)
}
# Min-max scale LogIntensities within each run to [0, 1].
#
# Adds NormLogIntensities = (x - run_min) / (run_max - run_min) per run.
# NOTE(review): a run in which all LogIntensities are identical divides by
# zero and yields NaN — confirm such runs cannot occur upstream.
#
# @param data data frame with columns originalRUN and LogIntensities.
# @param ...  unused, kept for call-site compatibility with the other *_norm helpers.
# @return `data` with an additional NormLogIntensities column.
run_minmax_norm <- function(data, ...) {
  data %>%
    dplyr::group_by(originalRUN) %>%
    # run_count was computed but never used in the original; dropped.
    dplyr::mutate(run_min = min(LogIntensities), run_max = max(LogIntensities)) %>%
    dplyr::mutate(NormLogIntensities = (LogIntensities-run_min)/(run_max-run_min)) %>%
    dplyr::ungroup() %>%
    dplyr::select(-run_min, -run_max)
}
# Z-score standardise LogIntensities within each run.
#
# Standardisation is applied to the already log-transformed values:
# https://www.researchgate.net/post/log_transformation_and_standardization_which_should_come_first
#
# @param data data frame with columns originalRUN and LogIntensities.
# @param ...  unused, kept for call-site compatibility with the other *_norm helpers.
# @return `data` with an additional NormLogIntensities column.
run_zscore_norm <- function(data, ...) {
  data %>%
    dplyr::group_by(originalRUN) %>%
    # run_count was computed but never used in the original; dropped.
    dplyr::mutate(run_sd = sd(LogIntensities), run_mean = mean(LogIntensities)) %>%
    dplyr::mutate(NormLogIntensities = (LogIntensities-run_mean)/run_sd) %>%
    dplyr::ungroup() %>%
    dplyr::select(-run_sd, -run_mean)
}
# Per-(Protein, condition-group) feature-missingness bookkeeping.
#
# Adds the following columns (all retained in the output):
#   sumNumMeasuredFeature / sumNumImputedFeature — feature counts summed over
#     each Protein x GROUP_ORIGINAL group
#   feature_missingrate_per_group — imputed share of the group's features (%)
#   feature_missingrate_per_run   — MissingPercentage rescaled to percent
# MissingPercentage itself is dropped from the result.
group_consistency <- function(data, ...){
  data %>%
    dplyr::group_by(Protein, GROUP_ORIGINAL) %>%
    # dplyr::mutate() may reference columns created earlier in the same call,
    # so the two grouped mutates collapse into one.
    dplyr::mutate(
      sumNumMeasuredFeature = sum(NumMeasuredFeature),
      sumNumImputedFeature = sum(NumImputedFeature),
      feature_missingrate_per_group =
        (100/(sumNumMeasuredFeature+sumNumImputedFeature)) * sumNumImputedFeature
    ) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(feature_missingrate_per_run = MissingPercentage * 100) %>%
    dplyr::select(-MissingPercentage)
}
# Tidy protein identifiers: strip a leading "<count>/" prefix and a trailing
# "/", then discard rows whose Protein name ends up empty.
clean_protein_names <- function(data, ...) {
  data %>%
    dplyr::mutate(
      Protein = str_remove(Protein, "^\\d+/"),
      Protein = str_remove(Protein, "/$")
    ) %>%
    dplyr::filter(Protein != "")
}
# Post-process an MSstats dataProcess() result for downstream reporting.
#
# Computes per-group/per-run missing rates, cleans protein names, filters to
# the requested condition groups, applies the chosen normalisation, and joins
# the run annotation.
#
# @param rda_obj   result of MSstats::dataProcess() (uses $RunlevelData).
# @param annot_obj annotation data frame with a Run column (plus Condition etc.).
# @param groups    condition groups to keep; defaults to every group present.
# @param norm      one of "median", "quantile", "zscore", "minmax"; any other
#                  value leaves intensities un-normalised.
# @return tibble with Protein, the *Intensities columns, originalRUN,
#         GROUP_ORIGINAL, SUBJECT_ORIGINAL and the *missingrate columns.
process_msstats_rda <- function(rda_obj, annot_obj, groups, norm="median") {
  if(missing(groups)) {
    groups <- as.character((rda_obj$RunlevelData %>% dplyr::distinct(GROUP_ORIGINAL))$GROUP_ORIGINAL)
  }
  # FIX: operate on the rda_obj/annot_obj arguments. The original referenced
  # the globals 'goldstandard.proposed' and 'annot', silently ignoring its own
  # parameters (and binding a useless 'our' variable inside return()).
  rda_obj$RunlevelData %>%
    group_consistency(annot_obj) %>%
    clean_protein_names() %>%
    dplyr::filter(GROUP_ORIGINAL %in% groups) %>%
    {if(norm == "median") run_median_norm(.) else .} %>%
    {if(norm == "quantile") run_quantile_norm(.) else .} %>%
    {if(norm == "zscore") run_zscore_norm(.) else .} %>%
    {if(norm == "minmax") run_minmax_norm(.) else .} %>%
    dplyr::full_join(annot_obj, by = c("originalRUN" = "Run")) %>%
    dplyr::select(Protein, dplyr::matches("Intensities"), originalRUN,
                  GROUP_ORIGINAL, SUBJECT_ORIGINAL,
                  dplyr::matches("missingrate")) %>%
    dplyr::ungroup()
}
#!/usr/bin/env Rscript
# Command-line driver: convert a TRIC/OpenSwath results table plus an MSstats
# annotation file into a dataProcess() result saved as an .rda file.
library("optparse")

# ---- Argument parsing -------------------------------------------------------
option_list <- list(
  make_option(c("-t", "--tric"), type="character", default=NULL,
              help="The results file from the first OpenSwath analysis pipeline.", metavar="character"),
  make_option(c("-a", "--annot"), type="character", default=NULL,
              help="annotation.txt file for MSstats", metavar="character"),
  make_option(c("-o", "--out"), type="character", default="goldstandard.rda",
              help="output file name [default= %default]", metavar="character")
)
opt_parser <- OptionParser(option_list=option_list)
opt <- parse_args(opt_parser)
if (is.null(opt$tric) || is.null(opt$annot)) {
  print_help(opt_parser)
  stop("Required input arguments must be supplied.", call.=FALSE)
}

# ---- Load inputs ------------------------------------------------------------
source("/scripts/OpenSWATHtoMSstatsFormat.R")
library(MSstats)
print ("Loading tric file...")
raw.openMS <- read.csv(opt$tric,
                       sep= "\t",
                       header = TRUE,
                       stringsAsFactors = FALSE)
print ("Loading annotation file...")
annot_osw <- read.csv(opt$annot,
                      header = TRUE,
                      sep = "\t")

# ---- Convert and run the MSstats analysis -----------------------------------
print ("Parsing annotation to MSstats format...")
input.openms <- OpenSWATHtoMSstatsFormat_new(raw.openMS,
                                             annotation = annot_osw,
                                             removeProtein_with1Feature=TRUE)
print("Run MSstats analysis...")
goldstandard.proposed <- dataProcess(input.openms,
                                     normalization=FALSE,
                                     summaryMethod="TMP",
                                     cutoffCensored="minFeature",
                                     censoredInt="0",
                                     MBimpute=TRUE,
                                     maxQuantileforCensored=0.999)
print("Save the results")
save(goldstandard.proposed, file=opt$out)
---
date: "`r Sys.Date()`"
output:
pdf_document: default
params:
rda: ""
ann: ""
pxdid: "REPLACEME"
---
---
title: '`r paste("DIA PRIDEreanalysis (downstream)", params$pxdid, sep = " - ")`'
---
## DIA analysis with MSstats
```{r include=FALSE}
# Load required libraries
library('tinytex')
require(tidyverse)
require(stringr)
library(MSstats)
library(data.table)
library(knitr)
library(RColorBrewer)
library(heatmap3)
library(ggplot2)
library(patchwork)
source('/scripts/DIA_downstream_datacarpentry.R')
```
```{r, echo=FALSE}
# Annotations are still needed for the plots and tables below.
annot <- read.csv(params$ann,
                  header = TRUE,
                  sep = "\t")
# The .rda from DIA_downstream_process.R stores the result under its original
# name; load() returns that name, get() fetches the object, and a plain
# assignment renames it — clearer than the assign() call used before.
msstats_result <- get(load(file=params$rda))
prepro <- process_msstats_rda(msstats_result, annot)
# Raw (un-normalised) intensities, back-transformed from log2, wide format.
write.table(prepro %>%
              dplyr::mutate(Intensities = 2^LogIntensities) %>%
              dplyr::select(Protein, originalRUN, Intensities) %>%
              tidyr::spread(originalRUN, Intensities),
            paste(params$pxdid,"_baseline_raw_untransformed.tsv",sep=""),
            quote = FALSE,
            sep = "\t",
            row.names = FALSE)
# Median-normalised intensities for proteins with <50% missing features per
# group, back-transformed from log2, wide format; reused by the PCA below.
protein_matrix <- prepro %>%
  dplyr::filter(feature_missingrate_per_group < 50) %>%
  dplyr::mutate(NormIntensities = 2^NormLogIntensities) %>%
  dplyr::select(Protein, originalRUN, NormIntensities) %>%
  tidyr::spread(originalRUN, NormIntensities)
write.table(protein_matrix,
            paste(params$pxdid,"_baseline_mediannorm_untransformed.tsv",sep=""),
            quote = FALSE,
            sep = "\t",
            row.names = FALSE)
```
### Annotation table:
```{r, echo=FALSE}
# Legend table mapping a numeric index (used to label the plots) to each
# run's file name, condition and biological replicate.
table_legend <- data.frame(
  index = seq_along(annot$Run),  # seq_along, not seq(length(...)): safe for 0 rows
  file_names = unlist(tstrsplit(annot$Run, "[.]", keep=1)),
  Condition = annot$Condition,
  BioReplicate = annot$BioReplicate
)
kable(table_legend)
```
### Detection numbers
```{r, echo=FALSE}
# Summary counts of detected proteins and peptides for the report table.
prot_tot_distinct <- prepro %>% dplyr::distinct(Protein) %>% nrow(.)
# NOTE(review): these two filters keep proteins with MORE than 50% missing
# features (> 50), yet the table labels below read "<50% missing" — confirm
# whether the filter direction or the labels are wrong.
prot_tot_l50r <- prepro %>% dplyr::filter(feature_missingrate_per_run > 50) %>% dplyr::distinct(Protein) %>% nrow(.)
prot_tot_l50g <- prepro %>% dplyr::filter(feature_missingrate_per_group > 50) %>% dplyr::distinct(Protein) %>% nrow(.)
pep_tot <- length(levels(msstats_result$ProcessedData$PEPTIDE))
# iRT peptides: unique PEPTIDE entries belonging to the spiked-in
# "1/iRT_protein"; tstrsplit strips the "_<charge>" suffix before counting.
# tryCatch falls back to 0 when no iRT entries are present.
irt_peptides <- unique(as.character(msstats_result$ProcessedData[which(msstats_result$ProcessedData$PROTEIN=="1/iRT_protein"),"PEPTIDE"]))
irt_peptides = tryCatch(length(unique(unlist(tstrsplit(irt_peptides, "_", keep=1)))),
error = function(e) 0)
kable(data.frame(
Analysis = c("Total num of proteins",
"Total num of proteins with <50% missing features per group",
"Total num of proteins with <50% missing feature per run",
"Total num of iRT peptides",
"Total num of peptides"),
Measure = c(prot_tot_distinct,
prot_tot_l50g,
prot_tot_l50r,
irt_peptides,
pep_tot)),
col.names = NA)
```
\newpage
### Normalisation inspection
```{r, echo=FALSE, fig.align="center"}
# Per-run boxplots of raw vs. median-normalised log-intensities, coloured by
# experimental condition. The legend is suppressed; conditions are identified
# via the annotation table above.
shared_theme <- theme(legend.position = "none",
                      axis.text.y = element_text(size = rel(0.5)))
raw_box <- ggplot(prepro,
                  aes(x = LogIntensities , y = originalRUN , fill = GROUP_ORIGINAL) ) +
  geom_boxplot(outlier.size = 0.5) +
  scale_fill_discrete(name="Experimental\nCondition") +
  shared_theme +
  labs(title=paste("Raw LogIntensities", params$pxdid, sep = " "),
       x="LogIntensity", y = "Runs")
norm_box <- ggplot(prepro,
                   aes(x = NormLogIntensities , y = originalRUN , fill = GROUP_ORIGINAL) ) +
  geom_boxplot(outlier.size = 0.5) +
  scale_fill_discrete(name="Experimental\nCondition") +
  shared_theme +
  labs(title=paste("Median normalised LogIntensities", params$pxdid, sep = " "),
       x="LogIntensity", y = "Runs")
# raw_box / norm_box + plot_layout(guides = "collect") & theme(legend.position = "right")
raw_box
norm_box
```
### Clustering
```{r echo=FALSE, fig.align="center", dpi=270 , warning=FALSE}
# PCA of runs on the normalised protein matrix built in the carpentry chunk.
protein_matrix <- protein_matrix %>% column_to_rownames(var = "Protein")
# Proteins absent from some runs are zero-imputed before the PCA.
protein_matrix[is.na(protein_matrix)]=0
# Transpose so runs are the observations and proteins the variables.
pca_analysis<-prcomp(t(protein_matrix))
# Fraction of variance explained per component, for the axis labels.
var_expl <- pca_analysis$sdev^2/sum(pca_analysis$sdev^2)
# Widen the right margin and allow drawing outside the plot region (xpd)
# so the legend fits beside the plot.
par(mar=c(5.1, 4.1, 4.1, 5.1), xpd=TRUE)
plot(pca_analysis$x, type="n",
main="PCA Analysis:",
xlab = paste0("PC1 (", as.character(round(var_expl[1]*100,1)), "%)"),
ylab = paste0("PC2 (", as.character(round(var_expl[2]*100,1)), "%)"))
# Label each run by its index in the annotation table, coloured by condition.
text(pca_analysis$x,
cex = 0.7,
label=seq(length(annot$Run)),
col = as.numeric(as.factor(annot$Condition)))
legend("topright", inset=c(-.2,0), legend =levels(as.factor(annot$Condition)),pch=16, pt.cex=1.5, cex=0.75, bty='n',
col = seq_along(levels(as.factor(annot$Condition))))
```
```{r echo=FALSE, fig.align="center", dpi=270 , warning=FALSE}
# Hierarchically clustered heatmap of between-run correlations, with
# condition side-colours and a legend drawn by heatmap3's legendfun.
par(mar=c(5,5,2,2),xaxs = "i",yaxs = "i",cex.axis=0.4,cex.lab=10)
# One Set1 colour per condition level.
my_group1 <- as.numeric(as.factor(annot$Condition))
Condition <- brewer.pal(9, "Set1")[my_group1]
colSide <-cbind(Condition)
colMain <- colorRampPalette(brewer.pal(10, "Blues"))(25)
# Correlation between runs (columns of the protein matrix).
cor_results <- cor(protein_matrix)
ColSideAnn<-data.frame(Condition=my_group1,stringsAsFactors=TRUE)
heatmap3(main="Hierarchical Cluster Analysis:",
cor_results,
RowSideColors=colSide,
ColSideColors=colSide,
col=colMain,
labRow = seq(length(annot$Run)),
cexRow = 0.4,
labCol = FALSE,
ColSideAnn=ColSideAnn,
ColSideWidth = 10,
legendfun=function() showLegend(legend=levels(as.factor(annot$Condition)),col=brewer.pal(9, "Set1")[seq_along(levels(as.factor(annot$Condition)))], cex=.9, title="Condition"))
```
```{r echo=FALSE, fig.align="center", dpi=270 , warning=FALSE}
# NOTE(review): this chunk repeats the previous heatmap chunk verbatim, only
# without the main= title — confirm that rendering both is intended.
par(mar=c(5,5,2,2),xaxs = "i",yaxs = "i",cex.axis=0.4,cex.lab=10)
my_group1 <- as.numeric(as.factor(annot$Condition))
Condition <- brewer.pal(9, "Set1")[my_group1]
colSide <-cbind(Condition)
colMain <- colorRampPalette(brewer.pal(10, "Blues"))(25)
cor_results <- cor(protein_matrix)
ColSideAnn<-data.frame(Condition=my_group1,stringsAsFactors=TRUE)
heatmap3(cor_results,
RowSideColors=colSide,
ColSideColors=colSide,
col=colMain,
labRow = seq(length(annot$Run)),
cexRow = 0.4,
labCol = FALSE,
ColSideAnn=ColSideAnn,
ColSideWidth = 10,
legendfun=function() showLegend(legend=levels(as.factor(annot$Condition)),col=brewer.pal(9, "Set1")[seq_along(levels(as.factor(annot$Condition)))], cex=.9, title="Condition"))
```
FROM bioconductor/bioconductor_docker:RELEASE_3_12
# bioconductor/bioconductor_docker:RELEASE_3_12 is FROM rocker/rstudio:4.0.3
# (i.e. 2 stages before the rocker image with tex - rocker/verse)
#install some more build tools and libs
# apt lists are removed in the same layer to keep the image small.
# NOTE(review): python-dev pulls the Python 2 headers — confirm it is still
# needed alongside python3-dev on this base image.
RUN apt-get update && apt-get install -y --no-install-recommends --no-install-suggests \
software-properties-common \
git \
vim \
libgomp1 \
zlib1g-dev \
libstdc++6 \
sudo \
build-essential \
libssl-dev \
libcurl4-openssl-dev \
gnupg2 \
python-dev \
python3-dev \
libxml2-dev \
libxslt-dev &&\
rm -rf /var/lib/apt/lists/*
#setup R configs
# Install CRAN packages in grouped calls instead of one RUN (= one image
# layer) per package: fewer layers, faster builds, same resulting image.
# 'graphics' and 'grid' ship with base R, so install.packages() on them is a
# no-op and they are omitted here.
# 'patchwork' is added because the QC Rmd (diamsstats_qc.rmd) library()s it
# but it was not installed anywhere before.
RUN Rscript -e "install.packages(c('Rcpp','tidyverse','dplyr','tibble','tidyr','data.table'))"
RUN Rscript -e "install.packages(c('RColorBrewer','heatmap3','ggplot2','cowplot','ggrepel','patchwork'))"
RUN Rscript -e "install.packages(c('lubridate','chron','scales','gridExtra','devtools'))"
RUN Rscript -e "remotes::install_github('twitter/AnomalyDetection')"
RUN Rscript -e "BiocManager::install('MSstats')"
# NOTE(review): 'DMwR' was archived from CRAN in 2021 — confirm it still
# installs against this snapshot, or pin an archive version.
RUN Rscript -e "install.packages(c('optparse','qcc','ggQC','MSQC','IQCC','yhatr','pracma','anomalize','DMwR','outliers','plotly','htmlwidgets'))"
RUN Rscript -e "install.packages('tinytex')"
#RUN Rscript -e "tinytex::install_tinytex(version = '2020.11')"
# Install TinyTeX system-wide via the upstream installer script, relocate it
# to /opt/TinyTeX, add the LaTeX packages needed for PDF rendering, register
# it with R (tinytex::r_texmf), and make it group-writable for 'staff'.
# NOTE(review): ${CTAN_REPO} is not defined in this Dockerfile — confirm it is
# supplied as an ARG/ENV by the base image, otherwise the TeX Live 2018 patch
# branch below would fetch from an empty URL.
RUN wget -qO- \
"https://github.com/yihui/tinytex/raw/master/tools/install-unx.sh" | \
sh -s - --admin --no-path \
&& mv ~/.TinyTeX /opt/TinyTeX \
&& if /opt/TinyTeX/bin/*/tex -v | grep -q 'TeX Live 2018'; then \
## Patch the Perl modules in the frozen TeX Live 2018 snapshot with the newer
## version available for the installer in tlnet/tlpkg/TeXLive, to include the
## fix described in https://github.com/yihui/tinytex/issues/77#issuecomment-466584510
## as discussed in https://www.preining.info/blog/2019/09/tex-services-at-texlive-info/#comments
wget -P /tmp/ ${CTAN_REPO}/install-tl-unx.tar.gz \
&& tar -xzf /tmp/install-tl-unx.tar.gz -C /tmp/ \
&& cp -Tr /tmp/install-tl-*/tlpkg/TeXLive /opt/TinyTeX/tlpkg/TeXLive \
&& rm -r /tmp/install-tl-*; \
fi \
&& /opt/TinyTeX/bin/*/tlmgr path add \
&& tlmgr install ae inconsolata listings metafont mfware parskip pdfcrop tex \
&& tlmgr path add \
&& Rscript -e "tinytex::r_texmf()" \
&& chown -R root:staff /opt/TinyTeX \
&& chmod -R g+w /opt/TinyTeX \
&& chmod -R g+wx /opt/TinyTeX/bin \
&& echo "PATH=${PATH}" >> /usr/local/lib/R/etc/Renviron
# (A second, redundant install.packages('tinytex') used to sit here; the R
# package is already installed before the TinyTeX setup above, so it was
# removed.)
# Python tooling in one layer, preserving the original order: upgrade pip and
# setuptools first, then pin pytest to <=5.0.1 (presumably for a downstream
# compatibility constraint — TODO confirm), then install the analysis stack.
RUN pip3 install pip --upgrade && \
    pip3 install setuptools --upgrade && \
    pip3 install --force-reinstall 'pytest<=5.0.1' && \
    pip3 install numpy pandas mypy pronto pytest jupyter rpy2 biopython flask
# Analysis scripts baked into the image at the paths the pipeline expects.
COPY OpenSWATHtoMSstatsFormat.R /scripts/OpenSWATHtoMSstatsFormat.R
COPY SWATHDIA_postprocess.R /scripts/SWATHDIA_postprocess.R
COPY diamsstats_qc.rmd /scripts/diamsstats_qc.rmd
OpenSWATHtoMSstatsFormat_new <- function(input,
annotation = NULL,
filter_with_mscore = TRUE,
mscore_cutoff = 0.01,
useUniquePeptide = TRUE,
fewMeasurements="remove",
removeProtein_with1Feature = FALSE,
summaryforMultipleRows=max){
library(dplyr)
library(tidyr)
if (is.null(fewMeasurements)) {
stop('** Please select \'remove\' or \'keep\' for \'fewMeasurements\'.')
}
if (!is.element(fewMeasurements, c('remove', 'keep'))) {
stop('** Please select \'remove\' or \'keep\' for \'fewMeasurements\'.')
}
if (is.null(annotation)) {
stop('** Please prepare \'annotation\' as one of input.')
} else {
## check annotation
required.annotation <- c('Condition', 'BioReplicate', 'Run')
if (!all(required.annotation %in% colnames(annotation))) {
missedAnnotation <- which(!(required.annotation %in% colnames(annotation)))
stop(paste("**", toString(required.annotation[missedAnnotation]),
"is not provided in Annotation. Please check the annotation file."))
} else {
annotinfo <- annotation
}
}
## check annotation information
## get annotation
## Each Run should has unique information about condition and bioreplicate
check.annot <- xtabs(~Run, annotinfo)
if ( any(check.annot > 1) ) {
stop('** Please check annotation. Each MS run can\'t have multiple conditions or BioReplicates.' )
}
## Check correct option or input
requiredinput.general <- c('ProteinName', 'FullPeptideName', 'Charge', 'Sequence',
'decoy', 'm_score',
'aggr_Fragment_Annotation', 'aggr_Peak_Area',
'filename')
################################
## 1. check general input and use only required columns.
################################
if (!all(requiredinput.general %in% colnames(input))) {
misssing.col <- requiredinput.general[!requiredinput.general %in% colnames(input)]
stop(paste0("** Please check the required input. The required input needs : ",
toString(missing.col)))
} else {
input <- input[, colnames(input) %in% requiredinput.general]
}
##############################
## 2. remove the decoys
##############################
if (length(unique(input$decoy)) == 2) {
## decoy=1 means the decoys.
input <- input[input$decoy == 0, ]
input <- input[, -which(colnames(input) %in% 'decoy')]
message("** Remove the decoys.")
}
##############################
## 3. filter by mscore
##############################
## mscore
if (filter_with_mscore) {
if (!is.element(c('m_score'), colnames(input))) {
stop('** m_score column is needed in order to filter out by m_scoe. Please add m_score column in the input.')
} else {
## when mscore > mscore_cutoff, replace with zero for intensity
input <- input[!is.na(input$m_score) & input$m_score <= mscore_cutoff, ]
message(paste0('** Features with great than ', mscore_cutoff, ' in m_score are removed.'))
}
input <- input[, -which(colnames(input) %in% 'm_score')]
}
##############################
## 4. Make required long format - disaggregate : one row for each transition
##############################
## The columns "aggr_Fragment_Annotation" : separate by ';' and "aggr_Peak_Area" : separate by ';'
## are disaggregated into the new columns "FragmentIon" and "Intensity".
input <- separate_rows(input, "aggr_Fragment_Annotation", "aggr_Peak_Area", sep="[;]")