diff --git a/DESCRIPTION b/DESCRIPTION index bd9b971..3be7a76 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: DAtest Title: Comparing Differential Abundance methods -Version: 2.7.11 +Version: 2.7.12 Authors@R: person("Jakob", "Russel", email = "russel2620@gmail.com", role = c("aut", "cre")) Description: What the title says. Depends: R (>= 3.2.5) @@ -9,4 +9,4 @@ Suggests: lsmeans, eulerr, pscl, samr, statmod License: GPL (>= 3) Encoding: UTF-8 LazyData: true -RoxygenNote: 6.0.1 +RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index dec25df..e501839 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,8 +7,9 @@ S3method(summary,DA) S3method(summary,DAPower) export(DA.TukeyHSD) export(DA.adx) -export(DA.anc) export(DA.anova) +export(DA.aoa) +export(DA.aoc) export(DA.aov) export(DA.bay) export(DA.drop1) @@ -22,11 +23,15 @@ export(DA.fri) export(DA.kru) export(DA.lao) export(DA.lao2) +export(DA.lia) +export(DA.lic) export(DA.lim) export(DA.lli) export(DA.lli2) export(DA.llm) export(DA.llm2) +export(DA.lma) +export(DA.lmc) export(DA.lrm) export(DA.lsmeans) export(DA.ltt) @@ -43,6 +48,8 @@ export(DA.qua) export(DA.rai) export(DA.sam) export(DA.spe) +export(DA.tta) +export(DA.ttc) export(DA.ttt) export(DA.vli) export(DA.wil) @@ -53,7 +60,9 @@ export(DA.zzz) export(add.tax.DA) export(allDA) export(featurePlot) -export(groupSig) +export(gm_mean) +export(norm_alr) +export(norm_clr) export(powerDA) export(preDA) export(prune.tests.DA) diff --git a/R/DA.anc.R b/R/DA.anc.R deleted file mode 100644 index 06007ab..0000000 --- a/R/DA.anc.R +++ /dev/null @@ -1,52 +0,0 @@ -#' ANCOM -#' -#' Implementation of ANCOM for \code{DAtest} -#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples -#' @param predictor The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation -#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation -#' @param allResults If TRUE will return raw results from the \code{ANCOM} function -#' @param ... Additional arguments for the \code{ANCOM} function -#' @export - -DA.anc <- function(data, predictor, paired = NULL, allResults = FALSE, ...){ - - suppressMessages(library(ancom.R, quietly = TRUE)) - - # Extract from phyloseq - if(class(data) == "phyloseq"){ - DAdata <- DA.phyloseq(data, predictor, paired) - count_table <- DAdata$count_table - predictor <- DAdata$predictor - paired <- DAdata$paired - } else { - count_table <- data - } - - # Ready data - input <- as.data.frame(t(count_table)) - input$Group <- predictor - if(!is.null(paired)) input$ID <- paired - - # Run test - if(is.null(paired)){ - res <- ANCOM(input, ...) - } else { - res <- ANCOM(input, repeated = TRUE, ...) - } - - # Extract results - df <- data.frame(Feature = rownames(count_table), - W = res[[1]], - Detected = factor("No",levels=c("No","Yes"))) - df[df$Feature %in% res[[2]],"Detected"] <- "Yes" - df$Method <- "ANCOM (anc)" - - if(class(data) == "phyloseq") df <- add.tax.DA(data, df) - - if(allResults){ - return(res) - } else { - return(df) - } - -} diff --git a/R/DA.aoa.R b/R/DA.aoa.R new file mode 100644 index 0000000..5954ebc --- /dev/null +++ b/R/DA.aoa.R @@ -0,0 +1,67 @@ +#' ANOVA - Multiplicative zero-correction and additive log-ratio normalization. +#' +#' Apply ANOVA on multiple features with one \code{predictor}. +#' +#' Note: Last feature in the data is used as reference for the log-ratio transformation. +#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples +#' @param predictor The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param covars Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} +#' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details +#' @param delta Numeric. Pseudocount for zero-correction. Default 1 +#' @param allResults If TRUE will return raw results from the \code{aov} function +#' @param ... Additional arguments for the \code{aov} functions +#' @export + +DA.aoa <- function(data, predictor, covars = NULL, p.adj = "fdr", delta = 1, allResults = FALSE, ...){ + + # Extract from phyloseq + if(class(data) == "phyloseq"){ + DAdata <- DA.phyloseq(data, predictor, paired = NULL, covars) + count_table <- DAdata$count_table + predictor <- DAdata$predictor + covars <- DAdata$covars + } else { + count_table <- data + } + if(!is.null(covars)){ + for(i in 1:length(covars)){ + assign(names(covars)[i], covars[[i]]) + } + } + + # Define model + if(is.null(covars)){ + form <- paste("x ~ predictor") + } else { + form <- paste("x ~ ",paste(names(covars), collapse="+"),"+ predictor",sep = "") + } + + # Define function + ao <- function(x){ + tryCatch(as.numeric(summary(aov(as.formula(form), ...))[[1]][(length(covars)+1),5]), error = function(e){NA}) + } + + # Zero-correction + count_table <- apply(count_table, 2, function(y) sapply(y,function(x) ifelse(x==0,delta,(1-(sum(y==0)*delta)/sum(y))*x))) + if(any(count_table <= 0)) stop("count_table should only contain positive values") + + # ALR transformation + count_table <- norm_alr(count_table) + + # Run tests + if(allResults){ + ao <- function(x){ + tryCatch(aov(as.formula(form), ...), error = function(e){NA}) + } + return(apply(count_table,1,ao)) + } else { + res <- data.frame(pval = apply(count_table,1,ao)) + res$pval.adj <- p.adjust(res$pval, method = p.adj) + res$Feature <- rownames(res) + res$Method <- "ANOVA - ALR (aoa)" + if(class(data) == "phyloseq") res <- add.tax.DA(data, res) + return(res) + } +} + + diff --git a/R/DA.aoc.R b/R/DA.aoc.R new file mode 100644 index 0000000..c113a37 --- /dev/null +++ b/R/DA.aoc.R @@ -0,0 +1,65 @@ +#' ANOVA - Multiplicative zero-correction and additive log-ratio normalization. +#' +#' Apply ANOVA on multiple features with one \code{predictor}. +#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples +#' @param predictor The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param covars Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} +#' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details +#' @param delta Numeric. Pseudocount for zero-correction. Default 1 +#' @param allResults If TRUE will return raw results from the \code{aov} function +#' @param ... Additional arguments for the \code{aov} functions +#' @export + +DA.aoc <- function(data, predictor, covars = NULL, p.adj = "fdr", delta = 1, allResults = FALSE, ...){ + + # Extract from phyloseq + if(class(data) == "phyloseq"){ + DAdata <- DA.phyloseq(data, predictor, paired = NULL, covars) + count_table <- DAdata$count_table + predictor <- DAdata$predictor + covars <- DAdata$covars + } else { + count_table <- data + } + if(!is.null(covars)){ + for(i in 1:length(covars)){ + assign(names(covars)[i], covars[[i]]) + } + } + + # Define model + if(is.null(covars)){ + form <- paste("x ~ predictor") + } else { + form <- paste("x ~ ",paste(names(covars), collapse="+"),"+ predictor",sep = "") + } + + # Define function + ao <- function(x){ + tryCatch(as.numeric(summary(aov(as.formula(form), ...))[[1]][(length(covars)+1),5]), error = function(e){NA}) + } + + # Zero-correction + count_table <- apply(count_table, 2, function(y) sapply(y,function(x) ifelse(x==0,delta,(1-(sum(y==0)*delta)/sum(y))*x))) + if(any(count_table <= 0)) stop("count_table should only contain positive values") + + # ALR transformation + count_table <- norm_clr(count_table) + + # Run tests + if(allResults){ + ao <- function(x){ + tryCatch(aov(as.formula(form), ...), error = function(e){NA}) + } + return(apply(count_table,1,ao)) + } else { + res <- data.frame(pval = apply(count_table,1,ao)) + res$pval.adj <- p.adjust(res$pval, method = p.adj) + res$Feature <- rownames(res) + res$Method <- "ANOVA - CLR (aoc)" + if(class(data) == "phyloseq") res <- add.tax.DA(data, res) + return(res) + } +} + + diff --git a/R/DA.bay.R b/R/DA.bay.R index 55fecd7..833e120 100644 --- a/R/DA.bay.R +++ b/R/DA.bay.R @@ -3,12 +3,11 @@ #' Implementation of baySeq for \code{DAtest} #' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples #' @param predictor The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation -#' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details #' @param allResults If TRUE will return raw results from the \code{getLikelihoods} function #' @param ... Additional arguments to the \code{getPriors.NB} and \code{getLikelihoods} functions #' @export -DA.bay <- function(data, predictor, p.adj = "fdr", allResults = FALSE, ...){ +DA.bay <- function(data, predictor, allResults = FALSE, ...){ suppressMessages(library(baySeq)) @@ -37,12 +36,9 @@ DA.bay <- function(data, predictor, p.adj = "fdr", allResults = FALSE, ...){ # Extract results tc <- topCounts(CD, group = "DE", number=nrow(count_table)) - tc <- tc[,c(1,rev(ncol(tc)-0:4))] + tc <- tc[,c(1,rev(ncol(tc)-0:3))] - if(is.null(tc$ordering)) - output_df <- data.frame(Feature = as.character(tc$annotation), pval = 1 - tc$Likelihood, pval.adj = p.adjust(1 - tc$Likelihood, method = p.adj)) - if(!is.null(tc$ordering)) - output_df <- data.frame(Feature = as.character(tc$annotation), pval = 1 - tc$Likelihood, pval.adj = p.adjust(1 - tc$Likelihood, method = p.adj), ordering = tc$ordering) + output_df <- data.frame(Feature = as.character(tc$name), pval = (1 - tc$likes), pval.adj = tc$FDR.DE, ordering = tc$DE) output_df$Method <- "baySeq (bay)" diff --git a/R/DA.kru.R b/R/DA.kru.R index 974aa5c..9d03f17 100644 --- a/R/DA.kru.R +++ b/R/DA.kru.R @@ -24,7 +24,7 @@ DA.kru <- function(data, predictor, relative = TRUE, p.adj = "fdr", allResults = # Define function kru <- function(x){ - tryCatch(kruskal.test(as.numeric(x) ~ predictor, ...)$p.value, error = function(e){NA}) + tryCatch(kruskal.test(as.numeric(x) ~ predictor, ...), error = function(e){NA}) } # Relative abundance @@ -35,10 +35,12 @@ DA.kru <- function(data, predictor, relative = TRUE, p.adj = "fdr", allResults = } # Run tests + tests <- apply(count.rel,1,kru) + if(allResults){ - return(apply(count.rel,1,kru)) + return(tests) } else { - res <- data.frame(pval = apply(count.rel,1,kru)) + res <- data.frame(pval = sapply(tests, function(x) x$p.value)) res$pval.adj <- p.adjust(res$pval, method = p.adj) res$Feature <- rownames(res) res$Method <- "Kruskal-Wallis (kru)" diff --git a/R/DA.lia.R b/R/DA.lia.R new file mode 100644 index 0000000..63a94be --- /dev/null +++ b/R/DA.lia.R @@ -0,0 +1,98 @@ +#' LIMMA - Multiplicative zero-correction and additive log-ratio normalization. +#' +#' Note: Last feature in the data is used as reference for the log-ratio transformation. +#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples +#' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param covars Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} +#' @param out.all If TRUE will output results from F-tests, if FALSE t-statistic results from 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise +#' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details +#' @param delta Numeric. Pseudocount zero-correction. Default 1 +#' @param coeff Integer. The p-value and log2FoldChange will be associated with this coefficient. Default 2, i.e. the 2. level of the \code{predictor}. +#' @param allResults If TRUE will return raw results from the \code{eBayes} function +#' @param ... Additional arguments for the \code{eBayes} and \code{lmFit} functions +#' @export + +DA.lia <- function(data, predictor, paired = NULL, covars = NULL, out.all = NULL, p.adj = "fdr", delta = 1, coeff = 2, allResults = FALSE, ...){ + + suppressMessages(library(limma)) + if(!is.null(paired)) suppressMessages(library(statmod)) + + # Extract from phyloseq + if(class(data) == "phyloseq"){ + DAdata <- DA.phyloseq(data, predictor, paired, covars) + count_table <- DAdata$count_table + predictor <- DAdata$predictor + paired <- DAdata$paired + covars <- DAdata$covars + } else { + count_table <- data + } + if(!is.null(covars)){ + for(i in 1:length(covars)){ + assign(names(covars)[i], covars[[i]]) + } + } + + # out.all + if(is.null(out.all)){ + if(length(unique(predictor)) == 2) out.all <- FALSE + if(length(unique(predictor)) > 2) out.all <- TRUE + if(is.numeric(predictor)) out.all <- FALSE + } + + # Zero-correction + count_table <- apply(count_table, 2, function(y) sapply(y,function(x) ifelse(x==0,delta,(1-(sum(y==0)*delta)/sum(y))*x))) + if(any(count_table <= 0)) stop("count_table should only contain positive values") + + # ALR transformation + count_table <- as.data.frame(norm_alr(count_table)) + + # Arguments + limma.args <- list(...) + lmFit.args <- limma.args[names(limma.args) %in% names(formals(lmFit))] + eBayes.args <- limma.args[names(limma.args) %in% names(formals(eBayes))] + + if(is.null(covars)){ + form <- paste("~ predictor") + } else { + form <- paste("~ predictor+",paste(names(covars), collapse="+"),sep = "") + } + design <- model.matrix(as.formula(form)) + + # Lienar fit + if(is.null(paired)){ + fit <- do.call(lmFit,c(list(count_table, design),lmFit.args)) + } else { + dupcor <- duplicateCorrelation(count_table, design, block = paired) + fit <- do.call(lmFit,c(list(count_table, design, block = paired, correlation = dupcor$cor),lmFit.args)) + } + + # Empirical bayes + fit.eb <- do.call(eBayes, c(list(fit),eBayes.args)) + + # Extract results + if(is.numeric(predictor[1])){ + res <- topTable(fit.eb, number = nrow(count_table), adjust.method = p.adj, coef = 2) + colnames(res)[4:5] <- c("pval","pval.adj") + } else { + if(out.all){ + res <- topTable(fit.eb, number = nrow(count_table), adjust.method = p.adj, coef = 2:length(levels(as.factor(predictor)))) + colnames(res)[length(levels(as.factor(predictor)))+2:3] <- c("pval","pval.adj") + } else { + res <- topTable(fit.eb, number = nrow(count_table), adjust.method = p.adj, coef = coeff) + colnames(res)[4:5] <- c("pval","pval.adj") + res$ordering <- NA + res[!is.na(res$logFC) & res$logFC > 0,"ordering"] <- paste0(levels(as.factor(predictor))[coeff],">",levels(as.factor(predictor))[1]) + res[!is.na(res$logFC) & res$logFC < 0,"ordering"] <- paste0(levels(as.factor(predictor))[1],">",levels(as.factor(predictor))[coeff]) + } + } + + res$Feature <- rownames(res) + res$Method <- "LIMMA - ALR (lia)" + + if(class(data) == "phyloseq") res <- add.tax.DA(data, res) + + if(allResults) return(fit.eb) else return(res) +} + diff --git a/R/DA.lic.R b/R/DA.lic.R new file mode 100644 index 0000000..e2faf5a --- /dev/null +++ b/R/DA.lic.R @@ -0,0 +1,98 @@ +#' LIMMA - Multiplicative zero-correction and center log-ratio normalization. +#' +#' Note: Last feature in the data is used as reference for the log-ratio transformation. +#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples +#' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param covars Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} +#' @param out.all If TRUE will output results from F-tests, if FALSE t-statistic results from 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise +#' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details +#' @param delta Numeric. Pseudocount zero-correction. Default 1 +#' @param coeff Integer. The p-value and log2FoldChange will be associated with this coefficient. Default 2, i.e. the 2. level of the \code{predictor}. +#' @param allResults If TRUE will return raw results from the \code{eBayes} function +#' @param ... Additional arguments for the \code{eBayes} and \code{lmFit} functions +#' @export + +DA.lic <- function(data, predictor, paired = NULL, covars = NULL, out.all = NULL, p.adj = "fdr", delta = 1, coeff = 2, allResults = FALSE, ...){ + + suppressMessages(library(limma)) + if(!is.null(paired)) suppressMessages(library(statmod)) + + # Extract from phyloseq + if(class(data) == "phyloseq"){ + DAdata <- DA.phyloseq(data, predictor, paired, covars) + count_table <- DAdata$count_table + predictor <- DAdata$predictor + paired <- DAdata$paired + covars <- DAdata$covars + } else { + count_table <- data + } + if(!is.null(covars)){ + for(i in 1:length(covars)){ + assign(names(covars)[i], covars[[i]]) + } + } + + # out.all + if(is.null(out.all)){ + if(length(unique(predictor)) == 2) out.all <- FALSE + if(length(unique(predictor)) > 2) out.all <- TRUE + if(is.numeric(predictor)) out.all <- FALSE + } + + # Zero-correction + count_table <- apply(count_table, 2, function(y) sapply(y,function(x) ifelse(x==0,delta,(1-(sum(y==0)*delta)/sum(y))*x))) + if(any(count_table <= 0)) stop("count_table should only contain positive values") + + # ALR transformation + count_table <- as.data.frame(norm_clr(count_table)) + + # Arguments + limma.args <- list(...) + lmFit.args <- limma.args[names(limma.args) %in% names(formals(lmFit))] + eBayes.args <- limma.args[names(limma.args) %in% names(formals(eBayes))] + + if(is.null(covars)){ + form <- paste("~ predictor") + } else { + form <- paste("~ predictor+",paste(names(covars), collapse="+"),sep = "") + } + design <- model.matrix(as.formula(form)) + + # Lienar fit + if(is.null(paired)){ + fit <- do.call(lmFit,c(list(count_table, design),lmFit.args)) + } else { + dupcor <- duplicateCorrelation(count_table, design, block = paired) + fit <- do.call(lmFit,c(list(count_table, design, block = paired, correlation = dupcor$cor),lmFit.args)) + } + + # Empirical bayes + fit.eb <- do.call(eBayes, c(list(fit),eBayes.args)) + + # Extract results + if(is.numeric(predictor[1])){ + res <- topTable(fit.eb, number = nrow(count_table), adjust.method = p.adj, coef = 2) + colnames(res)[4:5] <- c("pval","pval.adj") + } else { + if(out.all){ + res <- topTable(fit.eb, number = nrow(count_table), adjust.method = p.adj, coef = 2:length(levels(as.factor(predictor)))) + colnames(res)[length(levels(as.factor(predictor)))+2:3] <- c("pval","pval.adj") + } else { + res <- topTable(fit.eb, number = nrow(count_table), adjust.method = p.adj, coef = coeff) + colnames(res)[4:5] <- c("pval","pval.adj") + res$ordering <- NA + res[!is.na(res$logFC) & res$logFC > 0,"ordering"] <- paste0(levels(as.factor(predictor))[coeff],">",levels(as.factor(predictor))[1]) + res[!is.na(res$logFC) & res$logFC < 0,"ordering"] <- paste0(levels(as.factor(predictor))[1],">",levels(as.factor(predictor))[coeff]) + } + } + + res$Feature <- rownames(res) + res$Method <- "LIMMA - CLR (lic)" + + if(class(data) == "phyloseq") res <- add.tax.DA(data, res) + + if(allResults) return(fit.eb) else return(res) +} + diff --git a/R/DA.lim.R b/R/DA.lim.R index 770c8a0..0dc1797 100644 --- a/R/DA.lim.R +++ b/R/DA.lim.R @@ -13,7 +13,7 @@ #' @export DA.lim <- function(data, predictor, paired = NULL, covars = NULL, relative = TRUE, out.all = NULL, p.adj = "fdr", coeff = 2, allResults = FALSE, ...){ - + suppressMessages(library(limma)) if(!is.null(paired)) suppressMessages(library(statmod)) diff --git a/R/DA.llm.R b/R/DA.llm.R index d2259e6..e62fe00 100644 --- a/R/DA.llm.R +++ b/R/DA.llm.R @@ -10,14 +10,12 @@ #' @param out.all If TRUE will output results and p-values from \code{anova}. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise #' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details #' @param delta Numeric. Pseudocount for the log transformation. Default 1 -#' @param coeff Integer. The p-value and log2FoldChange will be associated with this coefficient. Default 2, i.e. the 2. level of the \code{predictor}. -#' @param coeff.ref Integer. Reference level of the \code{predictor}. Will only affect the log2FC and ordering columns on the output. Default the intercept, = 1 #' @param allResults If TRUE will return raw results from the \code{lm}/\code{lme} function #' @param ... Additional arguments for the \code{lm}/\code{lme} functions #' @import nlme #' @export -DA.llm <- function(data, predictor, paired = NULL, covars = NULL, relative = TRUE, out.all = NULL, p.adj = "fdr", delta = 1, coeff = 2, coeff.ref = 1, allResults = FALSE, ...){ +DA.llm <- function(data, predictor, paired = NULL, covars = NULL, relative = TRUE, out.all = NULL, p.adj = "fdr", delta = 1, allResults = FALSE, ...){ # Extract from phyloseq if(class(data) == "phyloseq"){ @@ -34,10 +32,7 @@ DA.llm <- function(data, predictor, paired = NULL, covars = NULL, relative = TRU assign(names(covars)[i], covars[[i]]) } } - - if(coeff == coeff.ref) stop("coeff and coeff.ref cannot be the same") - if(!coeff %in% 1:length(unique(predictor)) | !coeff.ref %in% 1:length(unique(predictor))) stop(paste("coeff and coeff.ref should be integers between 1 and",length(unique(predictor)))) - + # out.all if(is.null(out.all)){ if(length(unique(predictor)) == 2) out.all <- FALSE @@ -145,13 +140,6 @@ DA.llm <- function(data, predictor, paired = NULL, covars = NULL, relative = TRU } else { res <- as.data.frame(t(as.data.frame(apply(count_table,1,lmr)))) colnames(res)[ncol(res)] <- "pval" - res$log2FC <- log2((res[,coeff.ref]+res[,coeff]) / res[,coeff.ref]) - res[res[,coeff.ref] < 0 & !is.na(res[,coeff.ref]), "log2FC"] <- NA - if(!is.numeric(predictor)){ - res$ordering <- NA - res[!is.na(res[,coeff]) & res[,coeff] > 0,"ordering"] <- paste0(levels(as.factor(predictor))[coeff],">",levels(as.factor(predictor))[coeff.ref]) - res[!is.na(res[,coeff]) & res[,coeff] < 0,"ordering"] <- paste0(levels(as.factor(predictor))[coeff.ref],">",levels(as.factor(predictor))[coeff]) - } } res$pval.adj <- p.adjust(res$pval, method = p.adj) diff --git a/R/DA.llm2.R b/R/DA.llm2.R index 642cbf0..2b34c5d 100644 --- a/R/DA.llm2.R +++ b/R/DA.llm2.R @@ -9,14 +9,12 @@ #' @param out.all If TRUE will output results and p-values from \code{anova}. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise #' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details #' @param delta Numeric. Pseudocount for the log transformation. Default 0.001 -#' @param coeff Integer. The p-value and log2FoldChange will be associated with this coefficient. Default 2, i.e. the 2. level of the \code{predictor}. -#' @param coeff.ref Integer. Reference level of the \code{predictor}. Will only affect the log2FC and ordering columns on the output. Default the intercept, = 1 #' @param allResults If TRUE will return raw results from the \code{lm}/\code{lme} function #' @param ... Additional arguments for the \code{lm}/\code{lme} functions #' @import nlme #' @export -DA.llm2 <- function(data, predictor, paired = NULL, covars = NULL, out.all = NULL, p.adj = "fdr", delta = 0.001, coeff = 2, coeff.ref = 1, allResults = FALSE, ...){ +DA.llm2 <- function(data, predictor, paired = NULL, covars = NULL, out.all = NULL, p.adj = "fdr", delta = 0.001, allResults = FALSE, ...){ # Extract from phyloseq if(class(data) == "phyloseq"){ @@ -34,9 +32,6 @@ DA.llm2 <- function(data, predictor, paired = NULL, covars = NULL, out.all = NUL } } - if(coeff == coeff.ref) stop("coeff and coeff.ref cannot be the same") - if(!coeff %in% 1:length(unique(predictor)) | !coeff.ref %in% 1:length(unique(predictor))) stop(paste("coeff and coeff.ref should be integers between 1 and",length(unique(predictor)))) - # out.all if(is.null(out.all)){ if(length(unique(predictor)) == 2) out.all <- FALSE @@ -144,13 +139,6 @@ DA.llm2 <- function(data, predictor, paired = NULL, covars = NULL, out.all = NUL } else { res <- as.data.frame(t(as.data.frame(apply(count.rel,1,lmr)))) colnames(res)[ncol(res)] <- "pval" - res$log2FC <- log2((res[,coeff.ref]+res[,coeff]) / res[,coeff.ref]) - res[res[,coeff.ref] < 0 & !is.na(res[,coeff.ref]), "log2FC"] <- NA - if(!is.numeric(predictor)){ - res$ordering <- NA - res[!is.na(res[,coeff]) & res[,coeff] > 0,"ordering"] <- paste0(levels(as.factor(predictor))[coeff],">",levels(as.factor(predictor))[coeff.ref]) - res[!is.na(res[,coeff]) & res[,coeff] < 0,"ordering"] <- paste0(levels(as.factor(predictor))[coeff.ref],">",levels(as.factor(predictor))[coeff]) - } } res$pval.adj <- p.adjust(res$pval, method = p.adj) diff --git a/R/DA.lma.R b/R/DA.lma.R new file mode 100644 index 0000000..4f02f94 --- /dev/null +++ b/R/DA.lma.R @@ -0,0 +1,155 @@ +#' Linear regression - Multiplicative zero-correction and additive log-ratio normalization. +#' +#' Apply linear regression for multiple features with one \code{predictor}. +#' Mixed-effect model is used when a \code{paired} argument is included, with the \code{paired} variable as a random intercept. +#' Note: Last feature in the data is used as reference for the log-ratio transformation. +#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples +#' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param covars Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} +#' @param out.all If TRUE will output results and p-values from \code{anova}. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise +#' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details +#' @param delta Numeric. Pseudocount for zero-correction. Default 1 +#' @param allResults If TRUE will return raw results from the \code{lm}/\code{lme} function +#' @param ... Additional arguments for the \code{lm}/\code{lme} functions +#' @import nlme +#' @export + +DA.lma <- function(data, predictor, paired = NULL, covars = NULL, out.all = NULL, p.adj = "fdr", delta = 1, allResults = FALSE, ...){ + + # Extract from phyloseq + if(class(data) == "phyloseq"){ + DAdata <- DA.phyloseq(data, predictor, paired, covars) + count_table <- DAdata$count_table + predictor <- DAdata$predictor + paired <- DAdata$paired + covars <- DAdata$covars + } else { + count_table <- data + } + if(!is.null(covars)){ + for(i in 1:length(covars)){ + assign(names(covars)[i], covars[[i]]) + } + } + + # out.all + if(is.null(out.all)){ + if(length(unique(predictor)) == 2) out.all <- FALSE + if(length(unique(predictor)) > 2) out.all <- TRUE + if(is.numeric(predictor)) out.all <- FALSE + } + + count_table <- as.data.frame.matrix(count_table) + # Zero-correction + count_table <- apply(count_table, 2, function(y) sapply(y,function(x) ifelse(x==0,delta,(1-(sum(y==0)*delta)/sum(y))*x))) + if(any(count_table <= 0)) stop("count_table should only contain positive values") + + # ALR transformation + count_table <- norm_alr(count_table) + + # Define design + if(is.null(covars)){ + form <- paste("x ~ predictor") + } else { + form <- paste("x ~ predictor+",paste(names(covars), collapse="+"),sep = "") + } + + # Define function + if(is.null(paired)){ + lmr <- function(x){ + fit <- NULL + tryCatch( + fit <- lm(as.formula(form), ...), + error = function(e) fit <- NULL) + if(!is.null(fit)) { + if(nrow(coef(summary(fit))) > 1) { + pval <- coef(summary(fit))[coeff,4] + ests <- coef(summary(fit))[,1] + c(ests,pval) + } else NA + } else NA + } + } else { + lmr <- function(x){ + fit <- NULL + tryCatch( + fit <- lme(as.formula(form), random = ~1|paired, ...), + error = function(e) fit <- NULL) + if(!is.null(fit)) { + if(nrow(coef(summary(fit))) > 1) { + pval <- coef(summary(fit))[coeff,5] + ests <- coef(summary(fit))[,1] + c(ests,pval) + } else NA + } else NA + } + } + + ## for out.all TRUE + if(out.all){ + if(is.null(paired)){ + lmr <- function(x){ + fit <- NULL + tryCatch( + fit <- lm(as.formula(form),...), + error = function(e) fit <- NULL) + if(!is.null(fit)){ + ests <- coef(summary(fit))[,1] + ano <- anova(fit)[1,] + c(ano,ests) + } + } + } else { + lmr <- function(x){ + fit <- NULL + tryCatch( + fit <- lme(as.formula(form), random = ~1|paired, ...), + error = function(e) fit <- NULL) + if(!is.null(fit)){ + ests <- coef(summary(fit))[,1] + ano <- anova(fit)[2,] + c(ano,ests) + } + } + } + } + + # Run tests + if(allResults){ + if(is.null(paired)){ + lmr <- function(x){ + fit <- NULL + tryCatch(fit <- lm(as.formula(form), ...), error = function(e) fit <- NULL) + } + } else { + lmr <- function(x){ + fit <- NULL + tryCatch( + fit <- lme(as.formula(form), random = ~1|paired, ...), error = function(e) fit <- NULL) + } + } + return(apply(count_table,1,lmr)) + } else { + if(out.all){ + if(is.null(paired)){ + res <- as.data.frame(do.call(rbind,apply(count_table,1,lmr))) + colnames(res)[1:5] <- c("Df","Sum Sq","Mean Sq","F value","pval") + } else { + res <- as.data.frame(do.call(rbind,apply(count_table,1,lmr))) + colnames(res)[1:4] <- c("numDF","denDF","F-value","pval") + } + res <- as.data.frame(lapply(res, unlist)) + } else { + res <- as.data.frame(t(as.data.frame(apply(count_table,1,lmr)))) + colnames(res)[ncol(res)] <- "pval" + } + + res$pval.adj <- p.adjust(res$pval, method = p.adj) + res$Feature <- rownames(res) + res$Method <- "Linear model - ALR (lma)" + if(class(data) == "phyloseq") res <- add.tax.DA(data, res) + return(res) + } + +} diff --git a/R/DA.lmc.R b/R/DA.lmc.R new file mode 100644 index 0000000..6aecb13 --- /dev/null +++ b/R/DA.lmc.R @@ -0,0 +1,154 @@ +#' Linear regression - Multiplicative zero-correction and center log-ratio normalization. +#' +#' Apply linear regression for multiple features with one \code{predictor}. +#' Mixed-effect model is used when a \code{paired} argument is included, with the \code{paired} variable as a random intercept. +#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples +#' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param covars Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} +#' @param out.all If TRUE will output results and p-values from \code{anova}. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise +#' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details +#' @param delta Numeric. Pseudocount for zero-correction. Default 1 +#' @param allResults If TRUE will return raw results from the \code{lm}/\code{lme} function +#' @param ... Additional arguments for the \code{lm}/\code{lme} functions +#' @import nlme +#' @export + +DA.lmc <- function(data, predictor, paired = NULL, covars = NULL, out.all = NULL, p.adj = "fdr", delta = 1, allResults = FALSE, ...){ + + # Extract from phyloseq + if(class(data) == "phyloseq"){ + DAdata <- DA.phyloseq(data, predictor, paired, covars) + count_table <- DAdata$count_table + predictor <- DAdata$predictor + paired <- DAdata$paired + covars <- DAdata$covars + } else { + count_table <- data + } + if(!is.null(covars)){ + for(i in 1:length(covars)){ + assign(names(covars)[i], covars[[i]]) + } + } + + # out.all + if(is.null(out.all)){ + if(length(unique(predictor)) == 2) out.all <- FALSE + if(length(unique(predictor)) > 2) out.all <- TRUE + if(is.numeric(predictor)) out.all <- FALSE + } + + count_table <- as.data.frame.matrix(count_table) + # Zero-correction + count_table <- apply(count_table, 2, function(y) sapply(y,function(x) ifelse(x==0,delta,(1-(sum(y==0)*delta)/sum(y))*x))) + if(any(count_table <= 0)) stop("count_table should only contain positive values") + + # ALR transformation + count_table <- norm_clr(count_table) + + # Define design + if(is.null(covars)){ + form <- paste("x ~ predictor") + } else { + form <- paste("x ~ predictor+",paste(names(covars), collapse="+"),sep = "") + } + + # Define function + if(is.null(paired)){ + lmr <- function(x){ + fit <- NULL + tryCatch( + fit <- lm(as.formula(form), ...), + error = function(e) fit <- NULL) + if(!is.null(fit)) { + if(nrow(coef(summary(fit))) > 1) { + pval <- coef(summary(fit))[coeff,4] + ests <- coef(summary(fit))[,1] + c(ests,pval) + } else NA + } else NA + } + } else { + lmr <- function(x){ + fit <- NULL + tryCatch( + fit <- lme(as.formula(form), random = ~1|paired, ...), + error = function(e) fit <- NULL) + if(!is.null(fit)) { + if(nrow(coef(summary(fit))) > 1) { + pval <- coef(summary(fit))[coeff,5] + ests <- coef(summary(fit))[,1] + c(ests,pval) + } else NA + } else NA + } + } + + ## for out.all TRUE + if(out.all){ + if(is.null(paired)){ + lmr <- function(x){ + fit <- NULL + tryCatch( + fit <- lm(as.formula(form),...), + error = function(e) fit <- NULL) + if(!is.null(fit)){ + ests <- coef(summary(fit))[,1] + ano <- anova(fit)[1,] + c(ano,ests) + } + } + } else { + lmr <- function(x){ + fit <- NULL + tryCatch( + fit <- lme(as.formula(form), random = ~1|paired, ...), + error = function(e) fit <- NULL) + if(!is.null(fit)){ + ests <- coef(summary(fit))[,1] + ano <- anova(fit)[2,] + c(ano,ests) + } + } + } + } + + # Run tests + if(allResults){ + if(is.null(paired)){ + lmr <- function(x){ + fit <- NULL + tryCatch(fit <- lm(as.formula(form), ...), error = function(e) fit <- NULL) + } + } else { + lmr <- function(x){ + fit <- NULL + tryCatch( + fit <- lme(as.formula(form), random = ~1|paired, ...), error = function(e) fit <- NULL) + } + } + return(apply(count_table,1,lmr)) + } else { + if(out.all){ + if(is.null(paired)){ + res <- as.data.frame(do.call(rbind,apply(count_table,1,lmr))) + colnames(res)[1:5] <- c("Df","Sum Sq","Mean Sq","F value","pval") + } else { + res <- as.data.frame(do.call(rbind,apply(count_table,1,lmr))) + colnames(res)[1:4] <- c("numDF","denDF","F-value","pval") + } + res <- as.data.frame(lapply(res, unlist)) + } else { + res <- as.data.frame(t(as.data.frame(apply(count_table,1,lmr)))) + colnames(res)[ncol(res)] <- "pval" + } + + res$pval.adj <- p.adjust(res$pval, method = p.adj) + res$Feature <- rownames(res) + res$Method <- "Linear model - CLR (lmc)" + if(class(data) == "phyloseq") res <- add.tax.DA(data, res) + return(res) + } + +} diff --git a/R/DA.lrm.R b/R/DA.lrm.R index afe6bae..5458819 100644 --- a/R/DA.lrm.R +++ b/R/DA.lrm.R @@ -9,14 +9,12 @@ #' @param relative Logical. Should \code{data} be normalized to relative abundances. Default TRUE #' @param out.all If TRUE will output results and p-values from \code{anova}. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise #' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details -#' @param coeff Integer. The p-value and log2FoldChange will be associated with this coefficient. Default 2, i.e. the 2. level of the \code{predictor}. -#' @param coeff.ref Integer. Reference level of the \code{predictor}. Will only affect the log2FC and ordering columns on the output. Default the intercept, = 1 #' @param allResults If TRUE will return raw results from the \code{lm}/\code{lme} function #' @param ... Additional arguments for the \code{lm}/\code{lme} functions #' @import nlme #' @export -DA.lrm <- function(data, predictor, paired = NULL, covars = NULL, relative = TRUE, out.all = NULL, p.adj = "fdr", coeff = 2, coeff.ref = 1, allResults = FALSE, ...){ +DA.lrm <- function(data, predictor, paired = NULL, covars = NULL, relative = TRUE, out.all = NULL, p.adj = "fdr", allResults = FALSE, ...){ # Extract from phyloseq if(class(data) == "phyloseq"){ @@ -34,9 +32,6 @@ DA.lrm <- function(data, predictor, paired = NULL, covars = NULL, relative = TRU } } - if(coeff == coeff.ref) stop("coeff and coeff.ref cannot be the same") - if(!coeff %in% 1:length(unique(predictor)) | !coeff.ref %in% 1:length(unique(predictor))) stop(paste("coeff and coeff.ref should be integers between 1 and",length(unique(predictor)))) - # out.all if(is.null(out.all)){ if(length(unique(predictor)) == 2) out.all <- FALSE @@ -143,13 +138,6 @@ DA.lrm <- function(data, predictor, paired = NULL, covars = NULL, relative = TRU } else { res <- as.data.frame(t(as.data.frame(apply(count_table,1,lmr)))) colnames(res)[ncol(res)] <- "pval" - res$log2FC <- log2((res[,coeff.ref]+res[,coeff]) / res[,coeff.ref]) - res[res[,coeff.ref] < 0 & !is.na(res[,coeff.ref]), "log2FC"] <- NA - if(!is.numeric(predictor)){ - res$ordering <- NA - res[!is.na(res[,coeff]) & res[,coeff] > 0,"ordering"] <- paste0(levels(as.factor(predictor))[coeff],">",levels(as.factor(predictor))[coeff.ref]) - res[!is.na(res[,coeff]) & res[,coeff] < 0,"ordering"] <- paste0(levels(as.factor(predictor))[coeff.ref],">",levels(as.factor(predictor))[coeff]) - } } res$pval.adj <- p.adjust(res$pval, method = p.adj) diff --git a/R/DA.tta.R b/R/DA.tta.R new file mode 100644 index 0000000..10ca17f --- /dev/null +++ b/R/DA.tta.R @@ -0,0 +1,82 @@ +#' Welch t-test - Multiplicative zero-correction and additive log-ratio normalization. +#' +#' Apply welch t-test to multiple features with one \code{predictor} +#' +#' Note: Last feature in the data is used as reference for the log-ratio transformation. +#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples +#' @param predictor The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if data is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details +#' @param delta Numeric. Pseudocount for zero-correction. +#' @param testStat Function. Function for calculating fold change. Should take two vectors as arguments. Default is a simple difference: \code{mean(case abundances)-mean(control abundances)} +#' @param testStat.pair Function. Function for calculating fold change. Should take two vectors as arguments. Default is a simple difference: \code{mean(case abundances-control abundances)} +#' @param allResults If TRUE will return raw results from the \code{t.test} function +#' @param ... Additional arguments for the \code{t.test} function +#' @export + +DA.tta <- function(data, predictor, paired = NULL, p.adj = "fdr", delta = 1, testStat = function(case,control){mean(case)-mean(control)}, testStat.pair = function(case,control){mean(case-control)}, allResults = FALSE, ...){ + + # Extract from phyloseq + if(class(data) == "phyloseq"){ + DAdata <- DA.phyloseq(data, predictor, paired) + count_table <- DAdata$count_table + predictor <- DAdata$predictor + paired <- DAdata$paired + } else { + count_table <- data + } + + # Define function + tt <- function(x){ + tryCatch(t.test(x ~ predictor, ...)$p.value, error = function(e){NA}) + } + + # Order data and define function for paired analysis + if(!is.null(paired)){ + count_table <- count_table[,order(paired)] + predictor <- predictor[order(paired)] + testStat <- testStat.pair + tt <- function(x){ + tryCatch(t.test(x ~ predictor, paired = TRUE, ...)$p.value, error = function(e){NA}) + } + } + + # Zero-correction + count_table <- apply(count_table, 2, function(y) sapply(y,function(x) ifelse(x==0,delta,(1-(sum(y==0)*delta)/sum(y))*x))) + if(any(count_table <= 0)) stop("count_table should only contain positive values") + + # ALR transformation + count_table <- norm_alr(count_table) + + # Run tests + if(allResults){ + if(is.null(paired)){ + tt <- function(x){ + tryCatch(t.test(x ~ predictor, ...), error = function(e){NA}) + } + } else { + tt <- function(x){ + tryCatch(t.test(x ~ predictor, paired = TRUE, ...), error = function(e){NA}) + } + } + return(apply(count_table,1,tt)) + } else { + res <- data.frame(pval = apply(count_table,1,tt)) + res$pval.adj <- p.adjust(res$pval, method = p.adj) + # Teststat + predictor.num <- as.numeric(as.factor(predictor))-1 + testfun <- function(x){ + case <- x[predictor.num==1] + control <- x[predictor.num==0] + testStat(case,control) + } + res$log2FC <- apply(count_table,1,testfun) + res$ordering <- NA + res[!is.na(res$log2FC) & res$log2FC > 0,"ordering"] <- paste0(levels(as.factor(predictor))[2],">",levels(as.factor(predictor))[1]) + res[!is.na(res$log2FC) & res$log2FC < 0,"ordering"] <- paste0(levels(as.factor(predictor))[1],">",levels(as.factor(predictor))[2]) + res$Feature <- rownames(res) + res$Method <- "t-test - ALR (tta)" + if(class(data) == "phyloseq") res <- add.tax.DA(data, res) + return(res) + } +} diff --git a/R/DA.ttc.R b/R/DA.ttc.R new file mode 100644 index 0000000..89013ca --- /dev/null +++ b/R/DA.ttc.R @@ -0,0 +1,80 @@ +#' Welch t-test - Multiplicative zero-correction and center log-ratio normalization. +#' +#' Apply welch t-test to multiple features with one \code{predictor} +#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples +#' @param predictor The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if data is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation +#' @param p.adj Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details +#' @param delta Numeric. Pseudocount for zero-correction. +#' @param testStat Function. Function for calculating fold change. Should take two vectors as arguments. Default is a simple difference: \code{mean(case abundances)-mean(control abundances)} +#' @param testStat.pair Function. Function for calculating fold change. Should take two vectors as arguments. Default is a simple difference: \code{mean(case abundances-control abundances)} +#' @param allResults If TRUE will return raw results from the \code{t.test} function +#' @param ... Additional arguments for the \code{t.test} function +#' @export + +DA.ttc <- function(data, predictor, paired = NULL, p.adj = "fdr", delta = 1, testStat = function(case,control){mean(case)-mean(control)}, testStat.pair = function(case,control){mean(case-control)}, allResults = FALSE, ...){ + + # Extract from phyloseq + if(class(data) == "phyloseq"){ + DAdata <- DA.phyloseq(data, predictor, paired) + count_table <- DAdata$count_table + predictor <- DAdata$predictor + paired <- DAdata$paired + } else { + count_table <- data + } + + # Define function + tt <- function(x){ + tryCatch(t.test(x ~ predictor, ...)$p.value, error = function(e){NA}) + } + + # Order data and define function for paired analysis + if(!is.null(paired)){ + count_table <- count_table[,order(paired)] + predictor <- predictor[order(paired)] + testStat <- testStat.pair + tt <- function(x){ + tryCatch(t.test(x ~ predictor, paired = TRUE, ...)$p.value, error = function(e){NA}) + } + } + + # Zero-correction + count_table <- apply(count_table, 2, function(y) sapply(y,function(x) ifelse(x==0,delta,(1-(sum(y==0)*delta)/sum(y))*x))) + if(any(count_table <= 0)) stop("count_table should only contain positive values") + + # CLR transformation + count_table <- norm_clr(count_table) + + # Run tests + if(allResults){ + if(is.null(paired)){ + tt <- function(x){ + tryCatch(t.test(x ~ predictor, ...), error = function(e){NA}) + } + } else { + tt <- function(x){ + tryCatch(t.test(x ~ predictor, paired = TRUE, ...), error = function(e){NA}) + } + } + return(apply(count_table,1,tt)) + } else { + res <- data.frame(pval = apply(count_table,1,tt)) + res$pval.adj <- p.adjust(res$pval, method = p.adj) + # Teststat + predictor.num <- as.numeric(as.factor(predictor))-1 + testfun <- function(x){ + case <- x[predictor.num==1] + control <- x[predictor.num==0] + testStat(case,control) + } + res$log2FC <- apply(count_table,1,testfun) + res$ordering <- NA + res[!is.na(res$log2FC) & res$log2FC > 0,"ordering"] <- paste0(levels(as.factor(predictor))[2],">",levels(as.factor(predictor))[1]) + res[!is.na(res$log2FC) & res$log2FC < 0,"ordering"] <- paste0(levels(as.factor(predictor))[1],">",levels(as.factor(predictor))[2]) + res$Feature <- rownames(res) + res$Method <- "t-test - CLR (ttc)" + if(class(data) == "phyloseq") res <- add.tax.DA(data, res) + return(res) + } +} diff --git a/R/DA.vli.R b/R/DA.vli.R index d3cc3e3..929239a 100644 --- a/R/DA.vli.R +++ b/R/DA.vli.R @@ -16,7 +16,7 @@ DA.vli <- function(data, predictor, paired = NULL, covars = NULL, out.all = NULL suppressMessages(library(limma)) if(!is.null(paired)) suppressMessages(library(statmod)) - + # Extract from phyloseq if(class(data) == "phyloseq"){ DAdata <- DA.phyloseq(data, predictor, paired, covars) diff --git a/R/allDA.R b/R/allDA.R index b0fbda8..5eb0c0b 100644 --- a/R/allDA.R +++ b/R/allDA.R @@ -3,10 +3,10 @@ #' Run many differential abundance and expression tests at a time, to easily compare their results #' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames #' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses -#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. Only for "poi", "per", "ttt", "ltt", "ltt2", "neb", "wil", "erq", "ds2", "ds2x", "lrm", "llm", "llm2", "lim", "lli", "lli2", "zig", "anc", "mva" and "fri" +#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. #' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} -#' @param tests Character. Which tests to include. Default all (Except ANCOM and mvabund, see below for details) -#' @param relative Logical. If TRUE (default) abundances are made relative for "ttt", "ltt", "wil", "per", "aov", "lao", "kru", "lim", "lli", "lrm", "llm", "spe" and "pea", and there is an offset of \code{log(LibrarySize)} for "mva", "neb", "poi", "qpo", "zpo" and "znb" +#' @param tests Character. Which tests to include. Default all +#' @param relative Logical. TRUE (default) for compositional data. FALSE for absoloute abundances or pre-normalized data. #' @param cores Integer. Number of cores to use for parallel computing. Default one less than available #' @param rng.seed Numeric. Seed for reproducibility. Default 123 #' @param p.adj Character. Method for p-value adjustment. See \code{p.adjust} for details. Default "fdr" @@ -14,99 +14,10 @@ #' @param out.all If TRUE models will output results and p-values from \code{anova}/\code{drop1}. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise #' @param alpha q-value threshold for calling significance. Default 0.1 #' @param core.check If TRUE (default) will make an interactive check that the amount of cores specified are desired. Only if \code{cores>20}. This is to ensure that the function doesn't automatically overloads a server with workers. -#' @details Currently implemented methods: -#' \itemize{ -#' \item per - Permutation test with user defined test statistic -#' \item bay - baySeq -#' \item adx - ALDEx t-test and wilcoxon -#' \item wil - Wilcoxon Rank Sum on relative abundances -#' \item ttt - Welch t.test on relative abundances -#' \item ltt - Welch t.test, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances -#' \item ltt2 - Welch t.test, but with relative abundances transformed with \code{log(relative abundance + delta2)} -#' \item neb - Negative binomial GLM with log of library size as offset -#' \item erq - EdgeR - Quasi likelihood - TMM normalization -#' \item ere - EdgeR - Exact test - TMM normalization -#' \item erq2 - EdgeR - Quasi likelihood - RLE normalization -#' \item ere2 - EdgeR - Exact test - RLE normalization -#' \item msf - MetagenomeSeq feature model -#' \item zig - MetagenomeSeq zero-inflated gaussian -#' \item ds2 - DESeq2 -#' \item ds2x - DESeq2 with manual geometric means -#' \item lim - LIMMA. Moderated linear models based on emperical bayes -#' \item lli - LIMMA, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances -#' \item lli2 - LIMMA, but with relative abundances transformed with \code{log(relative abundance + delta2)} -#' \item kru - Kruskal-Wallis on relative abundances -#' \item aov - ANOVA on relative abundances -#' \item lao - ANOVA, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances -#' \item lao2 - ANOVA, but with relative abundances transformed with \code{log(relative abundance + delta2)} -#' \item lrm - Linear regression on relative abundances -#' \item llm - Linear regression, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances -#' \item llm2 - Linear regression, but with relative abundances transformed with \code{log(relative abundance + delta2)} -#' \item rai - RAIDA -#' \item spe - Spearman correlation -#' \item pea - Pearson correlation -#' \item poi - Poisson GLM with log of library size as offset -#' \item qpo - Quasi-Poisson GLM with log of library size as offset -#' \item vli - Limma with voom -#' \item zpo - Zero-inflated Poisson GLM -#' \item znb - Zero-inflated Negative Binomial GLM -#' \item fri - Friedman Rank Sum test -#' \item qua - Quade test -#' \item anc - ANCOM (by default not included, as it is very slow) -#' \item sam - SAMSeq -#' \item mva - mvabund (by default not included, as it is very slow) -#' \item zzz - A user-defined method (See \code{?DA.zzz}) -#' } -#' -#' Additional arguments can be passed to the internal functions with the \code{args} argument. -#' It should be structured as a list with elements named by the tests: -#' E.g. passing to the \code{DA.per} function that it should only run 1000 iterations: \code{args = list(per=list(noOfIterations=1000))}. -#' Include that the log t.test should use a pseudocount of 0.1: \code{args = list(per=list(noOfIterations=1000), ltt=list(delta=0.1))}. -#' Additional arguments are simply seperated by commas. -#' -#' Below is an overview of which functions get the arguments that are passed to a specific test -#' \itemize{ -#' \item per - Passed to \code{DA.per} -#' \item bay - Passed to \code{getPriors}, \code{getLikelihoods} and \code{DA.bay} -#' \item adx - Passed to \code{aldex} and \code{DA.adx} -#' \item wil - Passed to \code{wilcox.test} and \code{DA.wil} -#' \item ttt - Passed to \code{t.test} and \code{DA.ttt} -#' \item ltt - Passed to \code{t.test} and \code{DA.ltt} -#' \item ltt2 - Passed to \code{t.test} and \code{DA.ltt2} -#' \item neb - Passed to \code{glm.nb}, \code{glmer.nb} and \code{DA.neb} -#' \item erq(2) - Passed to \code{calcNormFactors}, \code{estimateDisp}, \code{glmQLFit}, \code{glmQLFTest} and \code{DA.erq} -#' \item ere(2) - Passed to \code{calcNormFactors}, \code{estimateCommonDisp}, \code{estimateTagwiseDisp}, \code{exactTest} and \code{DA.ere} -#' \item msf - Passed to \code{fitFeatureModel} and \code{DA.msf} -#' \item zig - Passed to \code{fitZig} and \code{DA.zig} -#' \item ds2(x) - Passed to \code{DESeq} and \code{DA.ds2} -#' \item lim - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lim} -#' \item lli - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lli} -#' \item lli2 - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lli2} -#' \item kru - Passed to \code{kruskal.test} and \code{DA.kru} -#' \item aov - Passed to \code{aov} and \code{DA.aov} -#' \item lao - Passed to \code{aov} and \code{DA.lao} -#' \item lao2 - Passed to \code{aov} and \code{DA.lao2} -#' \item lrm - Passed to \code{lm}, \code{lme} and \code{DA.lrm} -#' \item llm - Passed to \code{lm}, \code{lme} and \code{DA.llm} -#' \item llm2 - Passed to \code{lm}, \code{lme} and \code{DA.llm2} -#' \item rai - Passed to \code{raida} and \code{DA.rai} -#' \item spe - Passed to \code{cor.test} and \code{DA.spe} -#' \item pea - Passed to \code{cor.test} and \code{DA.pea} -#' \item poi - Passed to \code{glm}, \code{glmer} and \code{DA.poi} -#' \item qpo - Passed to \code{glm} and \code{DA.qpo} -#' \item vli - Passed to \code{voom}, \code{eBayes}, \code{lmFit} and \code{DA.vli} -#' \item zpo - Passed to \code{zeroinfl} and \code{DA.zpo} -#' \item znb - Passed to \code{zeroinfl} and \code{DA.znb} -#' \item fri - Passed to \code{friedman.test} and \code{DA.fri} -#' \item qua - Passed to \code{quade.test} and \code{DA.qua} -#' \item anc - Passed to \code{ANCOM} and \code{DA.anc} -#' \item sam - Passed to \code{SAMseq} and \code{DA.sam} -#' \item mva - Passed to \code{manyglm} and \code{summary.manyglm} -#' } #' @return A list of results: #' \itemize{ #' \item raw - A data.frame with raw p-values from all methods -#' \item adj - A data.frame with adjusted p-values from all methods (detection/no-detection from anc and sam) +#' \item adj - A data.frame with adjusted p-values from all methods (detection/no-detection from sam) #' \item est - A data.frame with estimates/fold.changes from all relevant methods #' \item details - A dataframe with details from the run #' \item results - A complete list of output from all the methods. Example: Get wilcoxon results from 2. run as such: \code{$results[[2]]["wil"]} @@ -114,7 +25,7 @@ #' #' @export -allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb","per","bay","adx","sam","qua","fri","znb","zpo","vli","qpo","poi","pea","spe","wil","ttt","ltt","ltt2","erq","ere","erq2","ere2","msf","zig","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","rai"), relative = TRUE, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE){ +allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("mva","neb","per","bay","adx","sam","qua","fri","znb","zpo","vli","qpo","poi","pea","spe","wil","ttt","ltt","ltt2","erq","ere","erq2","ere2","msf","zig","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","rai","tta","ttc","aoa","aoc","lma","lmc","lia","lic"), relative = TRUE, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE){ stopifnot(exists("data"),exists("predictor")) # Check for servers @@ -143,16 +54,22 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" if(ncol(count_table) != length(predictor)) stop("Number of samples in count_table does not match length of predictor") if(length(unique(predictor)) < 2) stop("predictor should have at least two levels") + # Remove Features not present in any samples + if(sum(rowSums(count_table) == 0) != 0) message(paste(sum(rowSums(count_table) == 0),"empty features removed")) + count_table <- count_table[rowSums(count_table) > 0,] + # Prune tests argument - decimal <- FALSE + decimal <- zeroes <- FALSE if(!isTRUE(all(unlist(count_table) == floor(unlist(count_table))))) decimal <- TRUE + if(any(count_table == 0)) zeroes <- TRUE tests <- unique(tests) - if(!"zzz" %in% tests) tests <- prune.tests.DA(tests, predictor, paired, covars, relative, decimal) + if(!"zzz" %in% tests) tests <- prune.tests.DA(tests, predictor, paired, covars, relative, decimal, zeroes) if(length(tests) == 0) stop("No tests to run!") - # Remove Features not present in any samples - if(sum(rowSums(count_table) == 0) != 0) message(paste(sum(rowSums(count_table) == 0),"empty features removed")) - count_table <- count_table[rowSums(count_table) > 0,] + # Set seed + message(paste("Seed is set to",rng.seed)) + set.seed(rng.seed) + message(paste("Running on",cores,"cores")) # predictor if(any(is.na(predictor))) warning("Predictor contains NAs!") @@ -166,6 +83,10 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" if(length(levels(as.factor(predictor))) > length(unique(predictor))) stop("predictor has more levels than unique values!") message(paste("predictor is assumed to be a categorical variable with",length(unique(predictor)),"levels:",paste(levels(as.factor(predictor)),collapse = ", "))) } + if(!is.null(paired)){ + message(paste("The paired variable has",length(unique(paired)),"levels")) + if(length(unique(paired)) < 5) warning("The paired variable has less than 5 levels. Mixed-effect models are excluded") + } # out.all if(is.null(out.all)){ @@ -186,11 +107,8 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" } } - # Set seed - message(paste("Seed is set to",rng.seed)) - set.seed(rng.seed) - # Run tests + cat(paste("Running",length(tests),"methods...\n")) # Progress bar pb <- txtProgressBar(max = length(tests), style = 3) progress <- function(n) setTxtProgressBar(pb, n) @@ -235,6 +153,8 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" wil = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired, relative, p.adj),wil.DAargs)), ttt = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired, relative, p.adj),ttt.DAargs)), ltt = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,relative, p.adj),ltt.DAargs)), + tta = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired, p.adj),tta.DAargs)), + ttc = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired, p.adj),ttc.DAargs)), ltt2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired, p.adj),ltt2.DAargs)), neb = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,relative,out.all, p.adj),neb.DAargs)), erq = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars, out.all, p.adj),erq.DAargs)), @@ -246,17 +166,23 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" ds2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,out.all, p.adj),ds2.DAargs)), ds2x = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,out.all, p.adj),ds2x.DAargs)), per = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired, relative, p.adj),per.DAargs)), - bay = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor, p.adj),bay.DAargs)), + bay = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor),bay.DAargs)), adx = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor),adx.DAargs)), lim = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,relative,out.all, p.adj),lim.DAargs)), lli = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,relative,out.all, p.adj),lli.DAargs)), + lia = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,out.all, p.adj),lia.DAargs)), + lic = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,out.all, p.adj),lic.DAargs)), lli2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,out.all, p.adj),lli2.DAargs)), kru = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor, relative, p.adj),kru.DAargs)), aov = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,covars, relative, p.adj),aov.DAargs)), lao = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,covars,relative, p.adj),lao.DAargs)), + aoa = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,covars, p.adj),aoa.DAargs)), + aoc = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,covars, p.adj),aoc.DAargs)), lao2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,covars, p.adj),lao2.DAargs)), lrm = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars, relative,out.all, p.adj),lrm.DAargs)), llm = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,relative,out.all, p.adj),llm.DAargs)), + lma = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,out.all, p.adj),lma.DAargs)), + lmc = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,out.all, p.adj),lmc.DAargs)), llm2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,covars,out.all, p.adj),llm2.DAargs)), rai = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor, p.adj),rai.DAargs)), spe = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,relative, p.adj),spe.DAargs)), @@ -268,12 +194,11 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" znb = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,covars,relative,out.all, p.adj),znb.DAargs)), fri = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,relative,p.adj),fri.DAargs)), qua = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,relative,p.adj),qua.DAargs)), - anc = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,sig = alpha),anc.DAargs)), sam = do.call(get(noquote(paste0("DA.",i))),c(list(count_table,predictor,paired,fdr.output = alpha),sam.DAargs))), error = function(e) NULL) - if(!is.null(res.sub) & !i %in% c("sam","anc","adx")){ + if(!is.null(res.sub) & !i %in% c("sam","adx")){ res.sub[is.na(res.sub$pval),"pval"] <- 1 res.sub[is.na(res.sub$pval.adj),"pval.adj"] <- 1 } @@ -350,10 +275,6 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" } colnames(df.adj)[ncol(df.adj)] <- "sam" } - if("anc" %in% names(results)) { - df.adj <- merge(df.adj, results$anc[,c("Feature","Detected")], by = "Feature") - colnames(df.adj)[ncol(df.adj)] <- "anc" - } if(class(data) == "phyloseq") df.adj <- add.tax.DA(data, df.adj) } else { df.adj <- NULL @@ -366,9 +287,8 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" qpo = "log2FC", poi = "log2FC", neb = "log2FC", - lrm = "log2FC", - llm = "log2FC", - llm2 = "log2FC", + lia = "log2FC", + lic = "log2FC", vli = "logFC", lim = "logFC", lli = "logFC", @@ -383,6 +303,8 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" ttt = "log2FC", ltt = "log2FC", ltt2 = "log2FC", + tta = "log2FC", + ttc = "log2FC", erq = c("logFC",paste0("logFC.predictor",levels(as.factor(predictor))[2])), ere = "logFC", erq2 = c("logFC",paste0("logFC.predictor",levels(as.factor(predictor))[2])), @@ -458,10 +380,10 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" Relative = relative, OutAll = out.all) rownames(det) <- "" - output.details <- as.data.frame(t(det)) + det <- as.data.frame(t(det)) colnames(det) <- "" - return(list(raw = df.raw, adj = df.adj, est = df.est,details = det, results = newresults)) + return(list(raw = df.raw, adj = df.adj, est = df.est, details = det, results = newresults)) } diff --git a/R/groupSig.R b/R/groupSig.R deleted file mode 100644 index 96a1168..0000000 --- a/R/groupSig.R +++ /dev/null @@ -1,175 +0,0 @@ -#' Are some groups/taxa overrepresented among significant features -#' -#' Test if some groups of features are overpresented among significant features. -#' The groups can be anything; for OTU data e.g. genera/family/order/class/phylum, for transciptomics e.g. KEGG pathway. -#' -#' OR in output is odds ratio from fisher's exact test. If OR is above 1 it means that the group is overrepresented among significant features. -#' @param results Data.frame with results from a \code{DAtest} function -#' @param group.df Data.frame with columns defining the groups. rownames should name the features matching the \code{Feature} column in \code{results}. E.g. \code{tax_table} from a \code{phyloseq} object -#' @param group.cols Numeric vector defining which column(s) contain(s) the groups in \code{group.df}. Default first column. -#' @param split If TRUE will split tests in positive and negative effect sizes if possible. Default TRUE -#' @param alpha Threshold for significance calling. Default 0.05 -#' @param p.adj Method for p-value adjustment. Default "fdr" -#' @param alternative What to test for. "greater" (default) is testing only overrepresentation (OR > 1), "less" only underrepresentation (OR < 1), and "two.sided" tests over- and under-representation (OR != 1) -#' @return A data.frame with odds ratios (OR), p-values, adjusted p-values, groups, name of groups, and direction of effect if split = TRUE -#' @export -groupSig <- function(results, group.df, group.cols = 1, split = TRUE, alpha = 0.05, p.adj = "fdr", alternative = "greater"){ - - stopifnot(exists("results"),exists("group.df")) - - # Name of estimate for different methods - est.name <- list(sam = c("ordering","log2FC"), - znb = c("ordering","log2FC"), - zpo = c("ordering","log2FC"), - qpo = c("ordering","log2FC"), - poi = c("ordering","log2FC"), - neb = c("ordering","log2FC"), - lrm = c("ordering","log2FC"), - llm = c("ordering","log2FC"), - llm2 = c("ordering","log2FC"), - vli = c("ordering","logFC"), - lim = c("ordering","logFC"), - lli = c("ordering","logFC"), - lli2 = c("ordering","logFC"), - pea = "cor", - spe = "rho", - per = c("ordering","log2FC"), - bay = "ordering", - adx.t = c("ordering","effect"), - adx.w = c("ordering","effect"), - wil = c("ordering","log2FC"), - ttt = c("ordering","log2FC"), - ltt = c("ordering","log2FC"), - ltt2 = c("ordering","log2FC"), - erq = c("ordering","log2FC"), - ere = c("ordering","log2FC"), - erq2 = c("ordering","log2FC"), - ere2 = c("ordering","log2FC"), - msf = c("ordering","log2FC"), - zig = c("ordering","log2FC"), - ds2 = c("ordering","log2FoldChange"), - ds2x = c("ordering","log2FoldChange"), - rai = c("ordering","log2FC"), - mva = c("ordering","log2FC")) - - # The method - method <- unique(gsub("\\)","",gsub(".*\\(","",results$Method))) - - # Subset group dataframe - group.df <- as.data.frame(group.df)[,group.cols, drop = FALSE] - - # Fix for anc - if(method == "anc"){ - results$pval.adj <- 1 - results[results$Detected == "Yes","pval.adj"] <- 0 - } - - # Fix for sam - if(method == "sam"){ - if("Sig" %in% colnames(results)){ - results$pval.adj <- 1 - results[results$Sig == "Yes","pval.adj"] <- 0 - } else { - results$pval.adj <- 1 - results[results$Sig.up == "Yes","pval.adj"] <- 0 - results[results$Sig.lo == "Yes","pval.adj"] <- 0 - } - } - - # Significant - feat.sig <- na.omit(results[results$pval.adj < alpha,"Feature"]) - if(length(feat.sig) == 0) stop("No significant features") - group.df$Sig <- 0 - group.df[rownames(group.df) %in% feat.sig,"Sig"] <- 1 - if(all(group.df$Sig == 1)) stop("All features are significant!") - - # The effect size name - split.name <- tryCatch(est.name[[method]], error = function(e) NULL) - if(is.null(split.name)) split.name <- "DAerror" - if(length(split.name) == 2){ - if("ordering" %in% colnames(results)) split.name <- split.name[1] else split.name <- split.name[2] - } - - # For splitting in positive and negative - if(method %in% names(est.name) & split.name %in% colnames(results) & split){ - - # Find positive and negatives - if(split.name != "ordering"){ - pos.feat <- results[results[split.name] > 0,"Feature"] - neg.feat <- results[results[split.name] < 0,"Feature"] - } else { - pos.feat <- results[results$ordering == unique(results$ordering)[1],"Feature"] - neg.feat <- results[results$ordering == unique(results$ordering)[2],"Feature"] - if(length(unique(results$ordering)) == 3) other.feat <- na.omit(results[results$ordering == unique(results$ordering)[3],"Feature"]) - } - pos.feat <- na.omit(pos.feat) - neg.feat <- na.omit(neg.feat) - - # Put effect size in data.frame - group.df$Effect <- NA - if(split.name != "ordering"){ - group.df[rownames(group.df) %in% pos.feat,"Effect"] <- "Positive" - group.df[rownames(group.df) %in% neg.feat,"Effect"] <- "Negative" - } else { - group.df[rownames(group.df) %in% pos.feat,"Effect"] <- as.character(unique(results$ordering)[1]) - group.df[rownames(group.df) %in% neg.feat,"Effect"] <- as.character(unique(results$ordering)[2]) - if(length(unique(results$ordering)) == 3) group.df[rownames(group.df) %in% other.feat,"Effect"] <- as.character(unique(results$ordering)[3]) - } - group.df <- na.omit(group.df) - - # Run the tests - allres <- list() - for(j in 1:(ncol(group.df)-2)){ - subres <- data.frame(Group = NA, - OR = NA, - pval = NA) - group.df[,j] <- paste0(group.df[,j],"_DAtest_",group.df$Effect) - # Table - tab <- table(group.df[,j],group.df$Sig) - for(i in 1:nrow(tab)){ - # The test - fit <- fisher.test(rbind(colSums(tab[-i,]),tab[i,]), alternative = alternative) - subres[i,"Group"] <- rownames(tab[i,,drop = FALSE]) - subres[i,"OR"] <- fit$estimate - subres[i,"pval"] <- fit$p.value - } - subres$GroupName <- colnames(group.df)[j] - subres$Effect <- gsub(".*_DAtest_","",subres$Group) - subres$Group <- gsub("_DAtest_.*","",subres$Group) - allres[[j]] <- subres - } - final <- do.call(rbind,allres) - final <- final[,c(4,1,5,2,3)] - - } else { - message("Note: Results not splitted in positive and negativ effects") - allres <- list() - for(j in 1:(ncol(group.df)-1)){ - subres <- data.frame(Group = NA, - OR = NA, - pval = NA) - # Table - tab <- table(group.df[,j],group.df$Sig) - for(i in 1:nrow(tab)){ - # Collapse table - fit <- fisher.test(rbind(colSums(tab[-i,]),tab[i,]), alternative = alternative) - subres[i,"Group"] <- rownames(tab[i,,drop = FALSE]) - subres[i,"OR"] <- fit$estimate - subres[i,"pval"] <- fit$p.value - } - subres$GroupName <- colnames(group.df)[j] - allres[[j]] <- subres - } - final <- do.call(rbind,allres) - final <- final[,c(4,1,2,3)] - } - - final$pval.adj <- p.adjust(final$pval, method = p.adj) - return(final) -} - - - - - - diff --git a/R/norm.R b/R/norm.R new file mode 100644 index 0000000..00a47d5 --- /dev/null +++ b/R/norm.R @@ -0,0 +1,20 @@ +#' @export +gm_mean = function(x){ + if(any(x < 0, na.rm = TRUE)){ + stop("Negative values not allowed") + } + exp(mean(log(x))) +} + +#' @export +norm_clr <- function(x){ + gm <- apply(x, 2, function(y) gm_mean(y)) + return(t(log(t(x)/gm))) +} + +#' @export +norm_alr <- function(x, ref = nrow(x)){ + return(apply(x, 2, function(y) log(y/y[ref]))[-ref,]) +} + + diff --git a/R/plot.DA.R b/R/plot.DA.R index bf070cd..c948167 100644 --- a/R/plot.DA.R +++ b/R/plot.DA.R @@ -1,7 +1,7 @@ #' Plotting results from \code{testDA} #' #' @param x The output from the \code{testDA} function -#' @param sort Sort methods by median \code{c("AUC","FDR","Spike.detect.rate","Score")} +#' @param sort Sort methods by median \code{c("AUC","FDR","Power","Score")} #' @param p Logical. Should the p-value distribution be plotted (only p-values from non-spiked features) #' @param bins Integer. Number of bins in p-value histograms #' @param ... Additional arguments for \code{ggdraw} @@ -34,13 +34,13 @@ plot.DA <- function(x, sort = "Score", p = FALSE, bins = 20, ...){ ## Find medians auc.median <- aggregate(AUC ~ Method, data = x$table, FUN = function(x) round(median(x),3)) fdr.median <- aggregate(FDR ~ Method, data = x$table, FUN = function(x) round(median(x),3)) - sdr.median <- aggregate(Spike.detect.rate ~ Method, data = x$table, FUN = function(x) round(median(x),3)) + sdr.median <- aggregate(Power ~ Method, data = x$table, FUN = function(x) round(median(x),3)) ## Merge df <- merge(merge(auc.median,fdr.median, by = "Method"),sdr.median, by = "Method") # Score - df$Score <- round((df$AUC-0.5) * df$Spike.detect.rate - df$FDR,3) + df$Score <- round((df$AUC-0.5) * df$Power - df$FDR,3) # Sort the reults if(sort == "AUC") { @@ -49,8 +49,8 @@ plot.DA <- function(x, sort = "Score", p = FALSE, bins = 20, ...){ if(sort == "FDR") { x$table$Method <- factor(x$table$Method, levels = df[order(df$FDR, decreasing = FALSE),"Method"]) } - if(sort == "Spike.detect.rate") { - x$table$Method <- factor(x$table$Method, levels = df[order(df$Spike.detect.rate, decreasing = TRUE),"Method"]) + if(sort == "Power") { + x$table$Method <- factor(x$table$Method, levels = df[order(df$Power, decreasing = TRUE),"Method"]) } if(sort == "Score") { x$table$Method <- factor(x$table$Method, levels = df[order(df$Score, decreasing = TRUE),"Method"]) @@ -79,11 +79,11 @@ plot.DA <- function(x, sort = "Score", p = FALSE, bins = 20, ...){ xlab(NULL) + scale_y_continuous(labels=function(x) sprintf("%.2f", x)) - p3 <- ggplot(x$table, aes(Method, Spike.detect.rate)) + + p3 <- ggplot(x$table, aes(Method, Power)) + theme_bw() + geom_point() + stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,geom = "crossbar",colour="red",width=0.75) + - ylab("Spike detect rate") + + ylab("Power") + theme(axis.text.x = element_blank(), panel.grid.minor = element_blank()) + xlab(NULL) + @@ -99,4 +99,4 @@ plot.DA <- function(x, sort = "Score", p = FALSE, bins = 20, ...){ } -} \ No newline at end of file +} diff --git a/R/plot.DAPower.R b/R/plot.DAPower.R index 4950c67..65ef122 100644 --- a/R/plot.DAPower.R +++ b/R/plot.DAPower.R @@ -17,7 +17,7 @@ plot.DAPower <- function(x, ...){ coord_cartesian(ylim = c(0,1)) + geom_point() + geom_smooth(method = "loess", colour = "red", alpha = 0.4) + - ylab("Spike Detect Rate") + + ylab("Power") + xlab("Log2 Effect Size") + theme(panel.grid.minor = element_blank()) diff --git a/R/posthocs.R b/R/posthocs.R index 5a37fbd..9a98790 100644 --- a/R/posthocs.R +++ b/R/posthocs.R @@ -1,6 +1,6 @@ #' Run \code{drop1} on all features from \code{DAtest} results with \code{allResults = TRUE} #' -#' Works on "zpo", "znb", "qpo", "neb", "poi". Non-paired "lrm", "llm", "llm2" +#' Works on "zpo", "znb", "qpo", "neb", "poi". Non-paired "lrm", "llm", "llm2", "lma", "lmc" #' @param results Output from a \code{DA."test"} function with \code{allResults = TRUE} #' @param test Which test to use to calculate p-values. See \code{drop1} for details. Default "Chisq" #' @param p.adj P-value adjustment method. See \code{p.adjust} for details. Default "fdr" @@ -9,7 +9,7 @@ DA.drop1 <- function(results, test = "Chisq", p.adj = "fdr", ...){ # Check input - if(is.data.frame(results) | !is.list(results)) stop("results should be the output from DA.zpo, DA.znb, DA.qpo, DA.neb, DA.poi, DA.lrm, DA.llm or DA.llm2 with allResults=TRUE") + if(is.data.frame(results) | !is.list(results)) stop("results should be the output from DA.zpo, DA.znb, DA.qpo, DA.neb, DA.poi, DA.lrm, DA.lma, DA.lmc, DA.llm or DA.llm2 with allResults=TRUE") # Class k <- 1 @@ -20,7 +20,7 @@ DA.drop1 <- function(results, test = "Chisq", p.adj = "fdr", ...){ # Check class if(any("lme" %in% xclass)) stop("drop1 does not work on mixed-effect linear models. Use DA.anova") - if(!any(c("lm","glm","zeroinfl","negbin","glmerMod") %in% xclass)) stop("results should be the output from DA.zpo, DA.znb, DA.qpo, DA.neb, DA.poi, DA.lrm, DA.llm or DA.llm2 with allResults=TRUE") + if(!any(c("lm","glm","zeroinfl","negbin","glmerMod") %in% xclass)) stop("results should be the output from DA.zpo, DA.znb, DA.qpo, DA.neb, DA.poi, DA.lrm, DA.llm, DA.lma, DA.lmc, or DA.llm2 with allResults=TRUE") # Run tests xres <- lapply(results, function(x) tryCatch(drop1(x, test = test, ...),error = function(e) NA)) @@ -126,7 +126,7 @@ DA.drop1 <- function(results, test = "Chisq", p.adj = "fdr", ...){ #' Run \code{anova} on all features from \code{DAtest} results with \code{allResults = TRUE} #' -#' Works on "lrm", "llm", "llm2". Non-paired "neb" +#' Works on "lrm", "llm", "llm2", "lma", "lmc". Non-paired "neb" #' @param results Output from a \code{DA."test"} function with \code{allResults = TRUE} #' @param p.adj P-value adjustment method. See \code{p.adjust for details}. Default "fdr" #' @param ... Additional arguments for \code{anova} function @@ -134,7 +134,7 @@ DA.drop1 <- function(results, test = "Chisq", p.adj = "fdr", ...){ DA.anova <- function(results, p.adj = "fdr", ...){ # Check input - if(is.data.frame(results) | !is.list(results)) stop("results should be the output from DA.lrm, DA.llm, DA.llm2 or DA.neb with allResults=TRUE") + if(is.data.frame(results) | !is.list(results)) stop("results should be the output from DA.lrm, DA.lma, DA.lmc, DA.llm, DA.llm2 or DA.neb with allResults=TRUE") # Class k <- 1 @@ -145,7 +145,7 @@ DA.anova <- function(results, p.adj = "fdr", ...){ # Check class if(any("glmerMod" %in% xclass)) stop("anova does not work on mixed-effect negative binomial models. Use DA.drop1") - if(!any(c("lm","nebgin","lme") %in% xclass)) stop("results should be the output from DA.lrm, DA.llm, DA.llm2 or DA.neb with allResults=TRUE") + if(!any(c("lm","nebgin","lme") %in% xclass)) stop("results should be the output from DA.lrm, DA.lma, DA.lmc, DA.llm, DA.llm2 or DA.neb with allResults=TRUE") # Run tests if(all(xclass == "lme")){ @@ -183,7 +183,7 @@ DA.anova <- function(results, p.adj = "fdr", ...){ #' Run \code{TukeyHSD} on all features from \code{DAtest} results with \code{allResults = TRUE} #' -#' Works on "aov", "lao", "lao2" +#' Works on "aov", "lao", "lao2", "aoc", "aoa" #' @param results Output from a \code{DA."test"} function with \code{allResults = TRUE} #' @param variable Which variable to test. Default predictor. Alternatively, the name of a covar #' @param p.adj P-value adjustment method. See \code{p.adjust} for details @@ -192,7 +192,7 @@ DA.anova <- function(results, p.adj = "fdr", ...){ DA.TukeyHSD <- function(results, variable = "predictor", p.adj = "fdr", ...){ # Check input - if(is.data.frame(results) | !is.list(results)) stop("results should be the output from DA.aov, DA.lao or DA.lao2 with allResults=TRUE") + if(is.data.frame(results) | !is.list(results)) stop("results should be the output from DA.aov, DA.aoa, DA.aoc, DA.lao or DA.lao2 with allResults=TRUE") # Class k <- 1 @@ -202,7 +202,7 @@ DA.TukeyHSD <- function(results, variable = "predictor", p.adj = "fdr", ...){ xclass <- class(results[[k]]) # Check class and results - if(xclass[1] != "aov") stop("results should be the output from DA.aov, DA.lao or DA.lao2 with allResults=TRUE") + if(xclass[1] != "aov") stop("results should be the output from DA.aov, DA.aoa, DA.aoc, DA.lao or DA.lao2 with allResults=TRUE") if(!variable %in% attr(results[[k]]$terms,"term.labels")) stop(paste(variable,"not found in the models.")) # Run test @@ -224,20 +224,20 @@ DA.TukeyHSD <- function(results, variable = "predictor", p.adj = "fdr", ...){ #' Run \code{lsmeans} on all features from \code{DAtest} results with \code{allResults = TRUE} #' -#' Pairwise testing on predictor and covars. Works on "poi", "neb", "lrm", "llm", "llm2", "qpo", "znb", "zpo". +#' Pairwise testing on predictor and covars. Works on "poi", "neb", "lrm", "lma", "lmc", "llm", "llm2", "qpo", "znb", "zpo". #' #' Require the \code{lsmeans} package #' @param results Output from a \code{DA."test"} function with \code{allResults = TRUE} #' @param variable Which variable to test. Default predictor. Alternatively, the name of a covar -#' @param predictor If results come from a paired "lrm", "llm" or "llm2" supply the original predictor variable in the form of as a vector -#' @param covars If results come from a paired "lrm", "llm" or "llm2" supply the original covars in the form of a named list +#' @param predictor If results come from a paired "lrm", "llm", "lma", "lmc" or "llm2" supply the original predictor variable in the form of as a vector +#' @param covars If results come from a paired "lrm", "lma", "lmc", "llm" or "llm2" supply the original covars in the form of a named list #' @param p.adj P-value adjustment method. See \code{p.adjust} for details #' @param ... Additional arguments for \code{lsmeans} function #' @export DA.lsmeans <- function(results, variable = "predictor", predictor = NULL, covars = NULL, p.adj = "fdr", ...){ # Check input - if(is.data.frame(results) | !is.list(results)) stop("results should be the output from DA.poi, DA.neb, DA.lrm, DA.llm, DA.llm2, DA.qpo, DA.znb or DA.zpo with allResults=TRUE") + if(is.data.frame(results) | !is.list(results)) stop("results should be the output from DA.poi, DA.neb, DA.lrm, DA.lma, DA.lmc, DA.llm, DA.llm2, DA.qpo, DA.znb or DA.zpo with allResults=TRUE") library(lsmeans) @@ -249,11 +249,11 @@ DA.lsmeans <- function(results, variable = "predictor", predictor = NULL, covars xclass <- class(results[[k]]) # Check class and extract covars if necessary - if(!any(c("lm","lme","glm","zeroinfl","negbin","glmerMod") %in% xclass)) stop("results should be the output from DA.zpo, DA.znb, DA.qpo, DA.neb, DA.poi, DA.lrm, DA.llm or DA.llm2 with allResults=TRUE") + if(!any(c("lm","lme","glm","zeroinfl","negbin","glmerMod") %in% xclass)) stop("results should be the output from DA.zpo, DA.znb, DA.qpo, DA.neb, DA.poi, DA.lrm, DA.lma, DA.lmc, DA.llm or DA.llm2 with allResults=TRUE") if(class(results[[k]])[1] == "lme"){ form <<- as.formula(paste("x ~",paste(attr(results[[1]]$terms,"term.labels"), collapse = "+"))) - if(is.null(predictor)) stop("predictor has to be supplied for a paired lrm, llm and llm2") + if(is.null(predictor)) stop("predictor has to be supplied for a paired lrm, lma, lmc, llm and llm2") if(!is.null(covars)){ for(i in 1:length(covars)){ assign(names(covars)[i],covars[[i]]) diff --git a/R/powerDA.R b/R/powerDA.R index 72b3258..280942b 100644 --- a/R/powerDA.R +++ b/R/powerDA.R @@ -3,20 +3,20 @@ #' Estimating (empirical) statistical power for a specific differential abundance and expression method on a specific dataset #' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames #' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses -#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. Only for "anc", "poi", "per", "ttt", "ltt", "ltt2", "neb", "wil", "erq", "ds2", "ds2x", "lrm", "llm", "llm2", "lim", "lli", "lli2", "mva", and "zig" +#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. #' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} #' @param test Character. Which test to include. See \code{testDA} for details on the implemented tests. #' @param effectSizes Numeric. The effect sizes for the spike-ins. Default \code{c(2,4,8,16,32)} #' @param alpha.p p-value threshold for false positive rates. Default 0.05 -#' @param alpha.q q-value threshold for determining significance for \code{empirical power}. Default 0.1. This will change \code{fdr.output} for "sam" and \code{sig} for "anc". +#' @param alpha.q q-value threshold for determining significance for \code{empirical power}. Default 0.1. This will change \code{fdr.output} for "sam". #' @param p.adj Character. Method for p-value adjustment. See \code{p.adjust} for details. Default "fdr" #' @param R Integer. Number of times to run the tests. Default 5 -#' @param relative Logical. If TRUE (default) abundances are made relative for "ttt", "ltt", "wil", "per", "aov", "lao", "kru", "lim", "lli", "lrm", "llm", "spe" and "pea", and there is an offset of \code{log(LibrarySize)} for "mva", "neb", "poi", "qpo", "zpo" and "znb" +#' @param relative Logical. TRUE (default) for compositional data, FALSE for absolute abundances or pre-normalized data. #' @param k Vector of length 3. Number of Features to spike in each tertile (lower, mid, upper). E.g. \code{k=c(5,10,15)}: 5 features spiked in low abundance tertile, 10 features spiked in mid abundance tertile and 15 features spiked in high abundance tertile. Default NULL, which will spike 2 percent of the total amount of features in each tertile (a total of 6 percent), but minimum c(5,5,5) #' @param cores Integer. Number of cores to use for parallel computing. Default one less than available. Set to 1 for sequential computing. #' @param rng.seed Numeric. Seed for reproducibility. Default 123 #' @param args List. A list with lists of arguments passed to the different methods. See details for more. -#' @param out.all If TRUE linear models will output results and p-values from \code{anova}/\code{drop1}, ds2/ds2x will run LRT and not Wald test, erq and erq2 will produce one p-value for the predictor, and lim, lli, lli2, lim, vli will run F-tests. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class predictors and FALSE otherwise +#' @param out.all If TRUE linear models will output results and p-values from \code{anova}/\code{drop1}, ds2/ds2x will run LRT and not Wald test, erq and erq2 will produce one p-value for the predictor, and limma will run F-tests. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class predictors and FALSE otherwise #' @param core.check If TRUE will make an interactive check that the amount of cores specified are desired. Only if \code{cores>20}. This is to ensure that the function doesn't automatically overloads a server with workers. #' @param verbose If TRUE will print informative messages #' @details Currently implemented methods: see \code{testDA} @@ -69,6 +69,7 @@ powerDA <- function(data, predictor, paired = NULL, covars = NULL, test = NULL, # Set seed set.seed(rng.seed) if(verbose) message(paste("Seed is set to",rng.seed)) + if(verbose) message(paste("Running on",cores,"cores")) # Create some random seeds for each run seeds <- rpois(R, lambda = (1:R)*1e6) @@ -106,6 +107,9 @@ powerDA <- function(data, predictor, paired = NULL, covars = NULL, test = NULL, if(length(levels(as.factor(predictor))) > length(unique(predictor))) stop("predictor has more levels than unique values!") if(verbose) message(paste("predictor is assumed to be a categorical variable with",length(unique(predictor)),"levels:",paste(levels(as.factor(predictor)),collapse = ", "))) } + if(!is.null(paired)){ + if(verbose) message(paste("The paired variable has",length(unique(paired)),"levels")) + } # out.all if(is.null(out.all)){ @@ -195,6 +199,8 @@ powerDA <- function(data, predictor, paired = NULL, covars = NULL, test = NULL, wil = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired, relative,p.adj),wil.DAargs)), ttt = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired, relative,p.adj),ttt.DAargs)), ltt = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,relative,p.adj),ltt.DAargs)), + ttc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,p.adj),ttc.DAargs)), + tta = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,p.adj),tta.DAargs)), ltt2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,p.adj),ltt2.DAargs)), neb = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,relative,out.all,p.adj),neb.DAargs)), erq = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,out.all,p.adj),erq.DAargs)), @@ -206,16 +212,22 @@ powerDA <- function(data, predictor, paired = NULL, covars = NULL, test = NULL, ds2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,out.all,p.adj),ds2.DAargs)), ds2x = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,out.all,p.adj),ds2x.DAargs)), per = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired, relative,p.adj),per.DAargs)), - bay = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,p.adj),bay.DAargs)), + bay = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]]),bay.DAargs)), adx = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]]),adx.DAargs)), lim = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,relative,out.all,p.adj),lim.DAargs)), + lic = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,out.all,p.adj),lic.DAargs)), + lia = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,out.all,p.adj),lia.DAargs)), lli = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,relative,out.all,p.adj),lli.DAargs)), lli2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,out.all,p.adj),lli2.DAargs)), kru = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],relative,p.adj),kru.DAargs)), + aoa = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],covars,p.adj),aoa.DAargs)), + aoc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],covars,p.adj),aoc.DAargs)), aov = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],covars,relative,p.adj),aov.DAargs)), lao = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],covars,relative,p.adj),lao.DAargs)), lao2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],covars,p.adj),lao2.DAargs)), lrm = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars, relative,out.all,p.adj),lrm.DAargs)), + lmc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,out.all,p.adj),lmc.DAargs)), + lma = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,out.all,p.adj),lma.DAargs)), llm = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,relative,out.all,p.adj),llm.DAargs)), llm2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,covars,out.all,p.adj),llm2.DAargs)), rai = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],p.adj),rai.DAargs)), @@ -228,12 +240,11 @@ powerDA <- function(data, predictor, paired = NULL, covars = NULL, test = NULL, znb = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],covars,relative,out.all,p.adj),znb.DAargs)), fri = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,relative,p.adj),fri.DAargs)), qua = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,relative,p.adj),qua.DAargs)), - anc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,sig = alpha.q),anc.DAargs)), sam = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[what.run]],rands[[run.no]],paired,fdr.output = alpha.q),sam.DAargs))), error = function(e) NULL) - if(!i %in% c("anc","sam","adx")){ + if(!i %in% c("sam","adx")){ res.sub[is.na(res.sub$pval),"pval"] <- 1 res.sub[is.na(res.sub$pval.adj),"pval.adj"] <- 1 } @@ -264,14 +275,6 @@ powerDA <- function(data, predictor, paired = NULL, covars = NULL, test = NULL, res.sub <- results[names(results) == r][[1]] - # Make pseudo-pval for ANCOM - if(test == "anc"){ - suppressWarnings(min.d <- min(res.sub[res.sub$Detected == "Yes","W"])) - if(min.d == Inf) min.d <- max(res.sub$W)+1 - res.sub$pval <- 1/(res.sub$W+1) * alpha.p/(1/(min.d+1)) - res.sub$pval.adj <- 1/(res.sub$W+1) * alpha.q/(1/(min.d+1)) - } - # Make pseudo-pval for SAMseq if(test == "sam"){ res.sub$pval <- 1/rank(res.sub$Score) @@ -331,7 +334,7 @@ powerDA <- function(data, predictor, paired = NULL, covars = NULL, test = NULL, AUC = auc, FPR = fpr, FDR = fdr, - SDR = sdr) + Power = sdr) rownames(df.combined) <- NULL return(df.combined) diff --git a/R/prune.tests.DA.R b/R/prune.tests.DA.R index 7e10c6d..5279dba 100644 --- a/R/prune.tests.DA.R +++ b/R/prune.tests.DA.R @@ -6,9 +6,10 @@ #' @param covars Named list with covariables #' @param relative Include tests that work with relative abundances (TRUE) or only absolute abundances (FALSE) #' @param decimal Exclude tests that do not work with decimals (TRUE) +#' @param zeroes If FALSE will exclude tests that include zero-inflation #' @export -prune.tests.DA <- function(tests, predictor, paired, covars, relative, decimal){ +prune.tests.DA <- function(tests, predictor, paired, covars, relative, decimal, zeroes){ # Prune test argument if packages are not installed if(!"baySeq" %in% rownames(installed.packages())) tests <- tests[tests != "bay"] @@ -16,21 +17,25 @@ prune.tests.DA <- function(tests, predictor, paired, covars, relative, decimal){ if(!"edgeR" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("ere","erq","ere2","erq2")] if(!"metagenomeSeq" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("msf","zig")] if(!"DESeq2" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("ds2","ds2x")] - if(!"limma" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("lim","lli","lli2","vli")] - if(!"statmod" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("lim","lli","lli2","vli")] + if(!"limma" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("lim","lli","lli2","vli","lia","lic")] + if(!"statmod" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("lim","lli","lli2","vli","lia","lic")] if(!"RAIDA" %in% rownames(installed.packages())) tests <- tests[tests != "rai"] if(!"pscl" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("zpo","znb")] - if(!"ancom.R" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("anc")] if(!"samr" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("sam")] - if(!"mvabund" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("mvabund")] + if(!"mvabund" %in% rownames(installed.packages())) tests <- tests[!tests %in% c("mva")] # Exclude tests that do not work with a paired argument if(!is.null(paired)){ - tests <- tests[!tests %in% c("qpo","zpo","znb","bay","adx","ere","ere2","msf","aov","lao","lao2","kru","rai","spe","pea")] + tests <- tests[!tests %in% c("qpo","zpo","znb","bay","adx","ere","ere2","msf","aov","lao","lao2","aoa","aoc","kru","rai","spe","pea")] # Exclude tests that only work with one value for each combination of predictor and paired arguments if(!all(table(paired,predictor) == 1)){ - tests <- tests[!tests %in% c("ttt","ltt","ltt2","wil","per","fri","qua","sam")] + tests <- tests[!tests %in% c("ttt","ltt","ltt2","wil","per","fri","qua","sam","tta","ttc")] } + # Exclude if too few levels + if(length(unique(levels(paired))) < 5){ + tests <- tests[!tests %in% c("lrm","llm","llm2","lim","lli","lli2","vli","neb","poi","zig","lma","lmc","lia","lic")] + } + } else { # Exclude if there is no paired tests <- tests[!tests %in% c("fri","qua")] @@ -38,19 +43,19 @@ prune.tests.DA <- function(tests, predictor, paired, covars, relative, decimal){ # Only include some tests if there are more than two levels in predictor if(length(levels(as.factor(predictor))) > 2){ - tests <- tests[tests %in% c("mva","sam","anc","qua","fri","znb","zpo","vli","qpo","poi","neb","erq","erq2","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","spe","pea","zig")] + tests <- tests[tests %in% c("bay","sam","qua","fri","znb","zpo","vli","qpo","poi","neb","erq","erq2","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","aoa","aoc","kru","lrm","llm","llm2","spe","pea","zig","lma","lmc","lia","lic")] # Exclude if only works for two-class paired if(!is.null(paired)){ tests <- tests[!tests %in% c("sam")] } } else { # Excluded tests if levels in predictor is exactly 2 - tests <- tests[!tests %in% c("aov","lao","lao2","kru","spe","pea","fri","qua","lrm","llm","llm2")] + tests <- tests[!tests %in% c("aov","lao","lao2","aoa","aoc","kru","spe","pea","fri","qua","lrm","llm","llm2","lma","lmc")] } # Only include specific tests if predictor is numeric if(is.numeric(predictor)){ - tests <- tests[tests %in% c("mva","sam","znb","zpo","vli","qpo","poi","neb","erq","erq2","lim","lli","lli2","lrm","llm","llm2","spe","pea")] + tests <- tests[tests %in% c("mva","sam","znb","zpo","vli","qpo","poi","neb","erq","erq2","lim","lli","lli2","lrm","llm","llm2","spe","pea","lma","lmc","lia","lic")] } else { # Exclude if not numeric tests <- tests[!tests %in% c("spe","pea")] @@ -58,7 +63,7 @@ prune.tests.DA <- function(tests, predictor, paired, covars, relative, decimal){ # Exclude if relative is false if(relative == FALSE){ - tests <- tests[!tests %in% c("sam","anc","vli","ltt2","erq","ere","ere2","erq2","msf","zig","bay","ds2","ds2x","adx","lli2","lao2","llm2","rai")] + tests <- tests[!tests %in% c("sam","vli","ltt2","erq","ere","ere2","erq2","msf","zig","bay","ds2","ds2x","adx","lli2","lao2","aoa","aoc","llm2","rai","tta","ttc","lma","lmc","lia","lic")] } else { # Exclude if relative is TRUE tests <- tests[!tests %in% c("lrm","lim")] @@ -71,7 +76,12 @@ prune.tests.DA <- function(tests, predictor, paired, covars, relative, decimal){ # Only include if covars are present if(!is.null(covars)){ - tests <- tests[tests %in% c("mva","znb","zpo","vli","qpo","poi","ds2","ds2x","neb","erq","erq2","zig","lrm","llm","llm2","lim","lli","lli2","aov","lao","lao2")] + tests <- tests[tests %in% c("mva","znb","zpo","vli","qpo","poi","ds2","ds2x","neb","erq","erq2","zig","lrm","llm","llm2","lim","lli","lli2","aov","lao","lao2","aoa","aoc","lma","lmc","lia","lic")] + } + + # Exclude if no zeroes + if(!zeroes){ + tests <- tests[!tests %in% c("znb","zpo")] } return(tests) diff --git a/R/runtimeDA.R b/R/runtimeDA.R index d998f2a..aa4d1cb 100644 --- a/R/runtimeDA.R +++ b/R/runtimeDA.R @@ -1,13 +1,13 @@ #' Estimate runtime of \code{testDA} on large datasets #' -#' Estimate the runtime of \code{testDA} from running on a subset of the features. Intended for datasets with at least 5000 features. +#' Estimate the runtime of \code{testDA} from running on a subset of the features. Intended for datasets with at least 2000 features. #' #' Outputs the estimated times for running each method 1 time. With cores=1 the runtime will be the sum of them all. With more cores the actual runtime will decrease asymptotically towards the slowest test #' #' Runtime of all methods are expected to scale linearly with the number of features, except "anc" and "bay" which are modelled with a 2. order polynomial. #' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames #' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses -#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. Only for "anc", "poi", "per", "ttt", "ltt", "ltt2", "neb", "wil", "erq", "ds2", "lrm", "llm", "llm2", "lim", "lli", "lli2" and "zig" +#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. Only for "poi", "per", "ttt", "ltt", "ltt2", "neb", "wil", "erq", "ds2", "lrm", "llm", "llm2", "lim", "lli", "lli2" and "zig" #' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} #' @param subsamples Vector with numbers of features to subsample to estimate runtime for fast methods #' @param subsamples.slow Vector with numbers of features to subsample to estimate runtime for slow methods @@ -19,8 +19,8 @@ #' @importFrom parallel detectCores #' @export runtimeDA <- function(data, predictor, paired = NULL, covars = NULL, subsamples = c(500,1000,1500,2000), subsamples.slow = c(100,150,200,250), - tests = c("sam", "qua", "fri", "vli", "qpo", "pea", "wil", "ttt", "ltt", "ltt2","ere", "ere2", "msf", "zig", "lim", "lli", "lli2", "aov", "lao", "lao2", "kru", "lrm", "llm", "llm2", "spe"), - tests.slow = c("neb", "bay", "per", "zpo", "znb", "rai", "adx", "ds2", "ds2x", "poi", "erq", "erq2"), cores = (detectCores()-1), ...){ + tests = c("sam", "qua", "fri", "vli", "qpo", "pea", "wil", "ttt", "ltt", "ltt2","ere", "ere2", "msf", "zig", "lim", "lli", "lli2", "aov", "lao", "lao2", "kru", "lrm", "llm", "llm2", "spe", "aoa", "aoc", "aoi", "tta", "ttc", "tti", "lma", "lmc", "lmi", "lia", "lic", "lii"), + tests.slow = c("mva", "neb", "bay", "per", "zpo", "znb", "rai", "adx", "ds2", "ds2x", "poi", "erq", "erq2"), cores = (detectCores()-1), ...){ stopifnot(exists("data"),exists("predictor")) @@ -125,7 +125,7 @@ runtimeDA <- function(data, predictor, paired = NULL, covars = NULL, subsamples for(i in all.tests){ extra.sub <- runtimes[runtimes$Test == i,] - if(i %in% c("anc","bay")){ + if(i %in% c("bay")){ fit <- lm(Minutes ~ poly(SubSamp,2), data = as.data.frame(extra.sub)) } else { fit <- lm(Minutes ~ SubSamp, data = as.data.frame(extra.sub)) diff --git a/R/summary.DA.R b/R/summary.DA.R index fb3b104..b16f1f1 100644 --- a/R/summary.DA.R +++ b/R/summary.DA.R @@ -1,56 +1,65 @@ #' Summary of results from \code{testDA} #' #' @param object The output from the \code{testDA} function -#' @param sort Sort methods by \code{c("AUC","FPR","Spike.detect.rate","Score")} +#' @param sort Sort methods by \code{c("AUC","FPR","Power","Score")} #' @param boot If TRUE will use bootstrap for confidence limits of the Score, else will compute the limits from the original table. Recommended to be TRUE unless \code{R >= 100} in \code{testDA} #' @param prob Confidence limits for Score. Default \code{90\%} = \code{c(0.05,0.095)} #' @param N Number of bootstraps. Default 1000 #' @param boot.seed Random seed for reproducibility of bootstraps +#' @param decimals Precision in output. Default 2 #' @param ... Additional arguments for \code{print} #' @import stats #' @import methods #' @import utils #' @export -summary.DA <- function(object, sort = "Score", boot = TRUE, prob = c(0.05,0.95), N = 1000, boot.seed = 1, ...){ +summary.DA <- function(object, sort = "Score", boot = TRUE, prob = c(0.05,0.95), N = 1000, boot.seed = 1, decimals = 2, ...){ # Find medians - output.summary.auc <- aggregate(AUC ~ Method, data = object$table, FUN = function(x) round(median(x),3)) - output.summary.fpr <- aggregate(FPR ~ Method, data = object$table, FUN = function(x) round(median(x),3)) - output.summary.sdr <- aggregate(Spike.detect.rate ~ Method, data = object$table, FUN = function(x) round(median(x),3)) - output.summary.fdr <- aggregate(FDR ~ Method, data = object$table, FUN = function(x) round(median(x),3)) + output.summary.auc <- aggregate(AUC ~ Method, data = object$table, FUN = median) + output.summary.fpr <- aggregate(FPR ~ Method, data = object$table, FUN = median) + output.summary.sdr <- aggregate(Power ~ Method, data = object$table, FUN = median) + output.summary.fdr <- aggregate(FDR ~ Method, data = object$table, FUN = median) # Merge df <- merge(merge(merge(output.summary.auc,output.summary.fpr, by = "Method", all = TRUE),output.summary.fdr, by = "Method"),output.summary.sdr, by = "Method") # Score - df$Score <- round((df$AUC-0.5) * df$Spike.detect.rate - df$FDR,3) + df$Score <- (df$AUC-0.5) * df$Power - df$FDR # Interval - object$table$Score <- (object$table$AUC-0.5) * object$table$Spike.detect.rate - object$table$FDR + object$table$Score <- (object$table$AUC-0.5) * object$table$Power - object$table$FDR if(boot){ set.seed(boot.seed) boots <- lapply(unique(object$table$Method), function(x) object$table[object$table$Method == x,][ sample(rownames(object$table[object$table$Method == x,]),N,replace = TRUE), ]) - boot.score <- lapply(boots,function(y) aggregate(Score ~ Method, data = y, FUN = function(x) round(quantile(x,probs = prob),3))) + boot.score <- lapply(boots,function(y) aggregate(Score ~ Method, data = y, FUN = function(x) quantile(x,probs = prob))) score.cl <- do.call(rbind,boot.score) } else { - score.cl <- aggregate(Score ~ Method, data = object$table, FUN = function(x) round(quantile(x,probs = prob),3)) + score.cl <- aggregate(Score ~ Method, data = object$table, FUN = function(x) quantile(x,probs = prob)) } df <- merge(df, as.matrix(score.cl), by = "Method") + df <- df[order(df$Score,df[,7],df[,8], decreasing = TRUE),] + if(df[1,"Score"] == 0) warning("Best Score is equal to zero!\nYou might want to re-run with a higher effectSize or pruned dataset (see preDA)") + + df <- cbind(data.frame(Method = df[,1]),apply(df[,2:ncol(df)],2,function(x) round(as.numeric(x), decimals))) + + mat <- c(FALSE) + for(i in 1:nrow(df)){ + mat[i] <- df[i,8] >= df[1,7] + } + df$` ` <- " " + df[mat,]$` ` <- "*" # Sort if(sort == "AUC") df <- df[order(df$AUC, decreasing = TRUE),] if(sort == "FPR") df <- df[order(df$FPR, decreasing = FALSE),] - if(sort == "Spike.detect.rate") df <- df[order(df$Spike.detect.rate, decreasing = TRUE),] - if(sort == "Score") { - df <- df[order(df$Score,df[,7],df[,8], decreasing = TRUE),] - if(df[1,"Score"] <= 0) warning("Best Score is less or equal to zero!\nYou might want to rerun with a higher effectSize or pruned dataset (see preDA)") - } - + if(sort == "Power") df <- df[order(df$Power, decreasing = TRUE),] + print(df, row.names = FALSE, ...) } + diff --git a/R/summary.DAPower.R b/R/summary.DAPower.R index 07e02ee..4ba3fa1 100644 --- a/R/summary.DAPower.R +++ b/R/summary.DAPower.R @@ -9,17 +9,17 @@ summary.DAPower <- function(x, ...){ if(x$Method[1] == "SAMseq (sam)"){ # Find medians - output.summary.sdr <- aggregate(SDR ~ EffectSize, data = x, FUN = function(y) round(median(y),3)) + output.summary.sdr <- aggregate(Power ~ EffectSize, data = x, FUN = function(y) round(median(y),3)) output.summary.auc <- aggregate(AUC ~ EffectSize, data = x, FUN = function(y) round(median(y),3)) output.summary.fdr <- aggregate(FDR ~ EffectSize, data = x, FUN = function(y) round(median(y),3)) # Merge df <- merge(merge(output.summary.sdr,output.summary.auc, by = "EffectSize"),output.summary.fdr, by = "EffectSize") - colnames(df) <- c("EffectSize","Spike.detect.rate","AUC","FDR") + colnames(df) <- c("EffectSize","Power","AUC","FDR") } else { if(x$Method[1] %in% c("ALDEx2 t-test (adx)","ALDEx2 wilcox (adx)")){ # Find medians - output.summary.sdr <- aggregate(SDR ~ Method + EffectSize, data = x, FUN = function(y) round(median(y),3)) + output.summary.sdr <- aggregate(Power ~ Method + EffectSize, data = x, FUN = function(y) round(median(y),3)) output.summary.auc <- aggregate(AUC ~ Method + EffectSize, data = x, FUN = function(y) round(median(y),3)) output.summary.fpr <- aggregate(FPR ~ Method + EffectSize, data = x, FUN = function(y) round(median(y),3)) output.summary.fdr <- aggregate(FDR ~ Method + EffectSize, data = x, FUN = function(y) round(median(y),3)) @@ -27,17 +27,17 @@ summary.DAPower <- function(x, ...){ # Merge df <- cbind(output.summary.sdr,output.summary.auc[,3],output.summary.fpr[,3],output.summary.fdr[,3]) df <- as.data.frame(df[order(df$Method),]) - colnames(df) <- c("Method","EffectSize","Spike.detect.rate","AUC","FPR","FDR") + colnames(df) <- c("Method","EffectSize","Power","AUC","FPR","FDR") } else { # Find medians - output.summary.sdr <- aggregate(SDR ~ EffectSize, data = x, FUN = function(y) round(median(y),3)) + output.summary.sdr <- aggregate(Power ~ EffectSize, data = x, FUN = function(y) round(median(y),3)) output.summary.auc <- aggregate(AUC ~ EffectSize, data = x, FUN = function(y) round(median(y),3)) output.summary.fpr <- aggregate(FPR ~ EffectSize, data = x, FUN = function(y) round(median(y),3)) output.summary.fdr <- aggregate(FDR ~ EffectSize, data = x, FUN = function(y) round(median(y),3)) # Merge df <- merge(merge(merge(output.summary.sdr,output.summary.auc, by = "EffectSize"),output.summary.fpr, by = "EffectSize"),output.summary.fdr, by = "EffectSize") - colnames(df) <- c("EffectSize","Spike.detect.rate","AUC","FPR","FDR") + colnames(df) <- c("EffectSize","Power","AUC","FPR","FDR") } } diff --git a/R/testDA.R b/R/testDA.R index 9062a7a..f963330 100644 --- a/R/testDA.R +++ b/R/testDA.R @@ -1,115 +1,23 @@ -#' Comparing differential abundance/expression methods by FPR and AUC +#' Comparing differential abundance/expression methods by Empirical power and False Discovery Rate #' -#' Calculating false positive rates and AUC (Area Under the Receiver Operating Characteristic (ROC) Curve) for various differential abundance and expression methods +#' Calculating Power, False Discovery Rates, False Positive Rates and AUC (Area Under the Receiver Operating Characteristic (ROC) Curve) for various differential abundance and expression methods #' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames #' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses -#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. Only for "anc", "poi", "per", "ttt", "ltt", "ltt2", "neb", "wil", "erq", "ds2", "ds2x", "lrm", "llm", "llm2", "lim", "lli", "lli2", "mva", and "zig" +#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. #' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} -#' @param R Integer. Number of times to run the tests. Default 10 -#' @param tests Character. Which tests to include. Default all (Except ANCOM and mvabund, see below for details) -#' @param relative Logical. If TRUE (default) abundances are made relative for "ttt", "ltt", "wil", "per", "aov", "lao", "kru", "lim", "lli", "lrm", "llm", "spe" and "pea", and there is an offset of \code{log(LibrarySize)} for "mva", "neb", "poi", "qpo", "zpo" and "znb" +#' @param R Integer. Number of times to run the tests. Default 20 +#' @param tests Character. Which tests to include. Default all +#' @param relative Logical. TRUE (default) for compositional data. FALSE for absolute abundances or pre-normalized data. #' @param effectSize Numeric. The effect size for the spike-ins. Default 5 #' @param k Vector of length 3. Number of Features to spike in each tertile (lower, mid, upper). E.g. \code{k=c(5,10,15)}: 5 features spiked in low abundance tertile, 10 features spiked in mid abundance tertile and 15 features spiked in high abundance tertile. Default NULL, which will spike 2 percent of the total amount of features in each tertile (a total of 6 percent), but minimum c(5,5,5) #' @param cores Integer. Number of cores to use for parallel computing. Default one less than available. Set to 1 for sequential computing. #' @param rng.seed Numeric. Seed for reproducibility. Default 123 #' @param p.adj Character. Method for p-value adjustment. See \code{p.adjust} for details. Default "fdr" #' @param args List. A list with lists of arguments passed to the different methods. See details for more. -#' @param out.all If TRUE linear models will output results and p-values from \code{anova}/\code{drop1}, ds2/ds2x will run LRT and not Wald test, erq and erq2 will produce one p-value for the predictor, and lim, lli, lli2, lim, vli will run F-tests. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class predictors and FALSE otherwise -#' @param alpha q-value threshold for determining significance for \code{Spike.detect.rate}. Default 0.1 +#' @param out.all If TRUE linear models will output results and p-values from \code{anova}/\code{drop1}, ds2/ds2x will run LRT and not Wald test, erq and erq2 will produce one p-value for the predictor, and limma will run F-tests. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class predictors and FALSE otherwise +#' @param alpha q-value threshold for determining significance for \code{Power}. Default 0.1 #' @param core.check If TRUE will make an interactive check that the amount of cores specified are desired. Only if \code{cores>20}. This is to ensure that the function doesn't automatically overloads a server with workers. #' @param verbose If TRUE will print informative messages -#' @details Currently implemented methods: -#' \itemize{ -#' \item per - Permutation test with user defined test statistic -#' \item bay - baySeq -#' \item adx - ALDEx t-test and wilcoxon -#' \item wil - Wilcoxon Rank Sum on relative abundances -#' \item ttt - Welch t.test on relative abundances -#' \item ltt - Welch t.test, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances -#' \item ltt2 - Welch t.test, but with relative abundances transformed with \code{log(relative abundance + delta2)} -#' \item neb - Negative binomial GLM with log of library size as offset -#' \item erq - EdgeR - Quasi likelihood - TMM normalization -#' \item ere - EdgeR - Exact test - TMM normalization -#' \item erq2 - EdgeR - Quasi likelihood - RLE normalization -#' \item ere2 - EdgeR - Exact test - RLE normalization -#' \item msf - MetagenomeSeq feature model -#' \item zig - MetagenomeSeq zero-inflated gaussian -#' \item ds2 - DESeq2 -#' \item ds2x - DESeq2 with manual geometric means -#' \item lim - LIMMA. Moderated linear models based on emperical bayes -#' \item lli - LIMMA, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances -#' \item lli2 - LIMMA, but with relative abundances transformed with \code{log(relative abundance + delta2)} -#' \item kru - Kruskal-Wallis on relative abundances -#' \item aov - ANOVA on relative abundances -#' \item lao - ANOVA, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances -#' \item lao2 - ANOVA, but with relative abundances transformed with \code{log(relative abundance + delta2)} -#' \item lrm - Linear regression on relative abundances -#' \item llm - Linear regression, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances -#' \item llm2 - Linear regression, but with relative abundances transformed with \code{log(relative abundance + delta2)} -#' \item rai - RAIDA -#' \item spe - Spearman correlation -#' \item pea - Pearson correlation -#' \item poi - Poisson GLM with log of library size as offset -#' \item qpo - Quasi-Poisson GLM with log of library size as offset -#' \item vli - Limma with voom -#' \item zpo - Zero-inflated Poisson GLM -#' \item znb - Zero-inflated Negative Binomial GLM -#' \item fri - Friedman Rank Sum test -#' \item qua - Quade test -#' \item anc - ANCOM (by default not included, as it is very slow) -#' \item sam - SAMSeq -#' \item mva - mvabund (by default not included, as it is very slow) -#' \item zzz - A user-defined method (See \code{?DA.zzz}) -#' } -#' "neb" can be slow if there is a paired argument. -#' -#' "anc" and "mva" are slow compared to the other methods. -#' -#' Additional arguments can be passed to the internal functions with the \code{args} argument. -#' It should be structured as a list with elements named by the tests: -#' E.g. passing to the \code{DA.per} function that it should only run 1000 iterations: \code{args = list(per=list(noOfIterations=1000))}. -#' Include that the log t.test should use a pseudocount of 0.1: \code{args = list(per=list(noOfIterations=1000), ltt=list(delta=0.1))}. -#' Additional arguments are simply seperated by commas. -#' -#' Below is an overview of which functions get the arguments that are passed to a specific test -#' \itemize{ -#' \item per - Passed to \code{DA.per} -#' \item bay - Passed to \code{getPriors}, \code{getLikelihoods} and \code{DA.bay} -#' \item adx - Passed to \code{aldex} and \code{DA.adx} -#' \item wil - Passed to \code{wilcox.test} and \code{DA.wil} -#' \item ttt - Passed to \code{t.test} and \code{DA.ttt} -#' \item ltt - Passed to \code{t.test} and \code{DA.ltt} -#' \item ltt2 - Passed to \code{t.test} and \code{DA.ltt2} -#' \item neb - Passed to \code{glm.nb}, \code{glmer.nb} and \code{DA.neb} -#' \item erq(2) - Passed to \code{calcNormFactors}, \code{estimateDisp}, \code{glmQLFit}, \code{glmQLFTest} and \code{DA.erq} -#' \item ere(2) - Passed to \code{calcNormFactors}, \code{estimateCommonDisp}, \code{estimateTagwiseDisp}, \code{exactTest} and \code{DA.ere} -#' \item msf - Passed to \code{fitFeatureModel} and \code{DA.msf} -#' \item zig - Passed to \code{fitZig} and \code{DA.zig} -#' \item ds2(x) - Passed to \code{DESeq} and \code{DA.ds2} -#' \item lim - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lim} -#' \item lli - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lli} -#' \item lli2 - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lli2} -#' \item kru - Passed to \code{kruskal.test} and \code{DA.kru} -#' \item aov - Passed to \code{aov} and \code{DA.aov} -#' \item lao - Passed to \code{aov} and \code{DA.lao} -#' \item lao2 - Passed to \code{aov} and \code{DA.lao2} -#' \item lrm - Passed to \code{lm}, \code{lme} and \code{DA.lrm} -#' \item llm - Passed to \code{lm}, \code{lme} and \code{DA.llm} -#' \item llm2 - Passed to \code{lm}, \code{lme} and \code{DA.llm2} -#' \item rai - Passed to \code{raida} and \code{DA.rai} -#' \item spe - Passed to \code{cor.test} and \code{DA.spe} -#' \item pea - Passed to \code{cor.test} and \code{DA.pea} -#' \item poi - Passed to \code{glm}, \code{glmer} and \code{DA.poi} -#' \item qpo - Passed to \code{glm} and \code{DA.qpo} -#' \item vli - Passed to \code{voom}, \code{eBayes}, \code{lmFit} and \code{DA.vli} -#' \item zpo - Passed to \code{zeroinfl} and \code{DA.zpo} -#' \item znb - Passed to \code{zeroinfl} and \code{DA.znb} -#' \item fri - Passed to \code{friedman.test} and \code{DA.fri} -#' \item qua - Passed to \code{quade.test} and \code{DA.qua} -#' \item anc - Passed to \code{ANCOM} and \code{DA.anc} -#' \item sam - Passed to \code{SAMseq} and \code{DA.sam} -#' \item mva - Passed to \code{manyglm} and \code{summary.manyglm} -#' } #' @return An object of class \code{DA}, which contains a list of results: #' \itemize{ #' \item table - FPR, AUC and spike detection rate for each run @@ -123,7 +31,7 @@ #' @importFrom pROC roc #' @export -testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests = c("neb","rai","per","bay","adx","sam","qua","fri","zpo","znb","vli","qpo","poi","pea","wil","ttt","ltt","ltt2","erq","erq2","ere","ere2","msf","zig","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","spe"), relative = TRUE, effectSize = 5, k = NULL, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){ +testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests = c("mva","neb","rai","per","bay","adx","sam","qua","fri","zpo","znb","vli","qpo","poi","pea","wil","ttt","ltt","ltt2","erq","erq2","ere","ere2","msf","zig","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","spe","tta","ttc","aoa","aoc","lma","lmc","lia","lic"), relative = TRUE, effectSize = 5, k = NULL, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){ stopifnot(exists("data"),exists("predictor")) # Check for servers @@ -162,12 +70,18 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests if(sum(colSums(count_table) == 0) > 0) stop("Some samples are empty") if(ncol(count_table) != length(predictor)) stop("Number of samples in count_table does not match length of predictor") if(length(unique(predictor)) < 2) stop("predictor should have at least two levels") - + + # Remove Features not present in any samples + if(sum(rowSums(count_table) == 0) != 0) message(paste(sum(rowSums(count_table) == 0),"empty features removed")) + count_table <- count_table[rowSums(count_table) > 0,] + if(nrow(count_table) <= 15) warning("Dataset contains very few features") + # Prune tests argument - decimal <- FALSE + decimal <- zeroes <- FALSE if(!isTRUE(all(unlist(count_table) == floor(unlist(count_table))))) decimal <- TRUE + if(any(count_table == 0)) zeroes <- TRUE tests <- unique(tests) - if(!"zzz" %in% tests) tests <- prune.tests.DA(tests, predictor, paired, covars, relative, decimal) + if(!"zzz" %in% tests) tests <- prune.tests.DA(tests, predictor, paired, covars, relative, decimal, zeroes) tests.par <- paste0(unlist(lapply(1:R, function(x) rep(x,length(tests)))),"_",rep(tests,R)) if(length(tests) == 0) stop("No tests to run!") @@ -175,24 +89,16 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests if(verbose){ if("neb" %in% tests & !is.null(paired)){ message("As 'neb' is included and a 'paired' variable is supplied, this might take a long time") - } else { - if("anc" %in% tests){ - message("As 'anc' is included, this might take some time") - } } } # Set seed set.seed(rng.seed) if(verbose) message(paste("Seed is set to",rng.seed)) + if(verbose) message(paste("Running on",cores,"cores")) # Create some random seeds for each run - seeds <- rpois(R, lambda = (1:R)*1e6) - - # Remove Features not present in any samples - if(sum(rowSums(count_table) == 0) != 0) message(paste(sum(rowSums(count_table) == 0),"empty features removed")) - count_table <- count_table[rowSums(count_table) > 0,] - if(nrow(count_table) <= 15) warning("Dataset contains very few features") + seeds <- rng.seed+1:R # Spike vs no features if(is.null(k)){ @@ -222,6 +128,10 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests if(length(levels(as.factor(predictor))) > length(unique(predictor))) stop("predictor has more levels than unique values!") if(verbose) message(paste("predictor is assumed to be a categorical variable with",length(unique(predictor)),"levels:",paste(levels(as.factor(predictor)),collapse = ", "))) } + if(!is.null(paired)){ + if(verbose) message(paste("The paired variable has",length(unique(paired)),"levels")) + if(length(unique(paired)) < 5) warning("The paired variable has less than 5 levels. Mixed-effect models are excluded") + } # out.all if(is.null(out.all)){ @@ -242,6 +152,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests } } + if(verbose) cat("Spiking...\n") # Shuffle predictor if(is.null(paired)){ rands <- lapply(1:R,function(x) sample(predictor)) @@ -254,6 +165,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests count_tables <- lapply(1:R,function(x) spikeds[[x]][[1]]) ### Run tests + if(verbose) cat(paste("Testing", length(tests),"methods", R,"times each...\n")) # Progress bar pb <- txtProgressBar(max = length(tests.par), style = 3) progress <- function(n) setTxtProgressBar(pb, n) @@ -305,6 +217,8 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests wil = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, relative, p.adj),wil.DAargs)), ttt = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, relative, p.adj),ttt.DAargs)), ltt = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,relative, p.adj),ltt.DAargs)), + tta = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, p.adj),tta.DAargs)), + ttc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, p.adj),ttc.DAargs)), ltt2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, p.adj),ltt2.DAargs)), neb = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,relative,out.all, p.adj),neb.DAargs)), erq = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj),erq.DAargs)), @@ -316,17 +230,23 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests ds2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj),ds2.DAargs)), ds2x = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj),ds2x.DAargs)), per = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, relative, p.adj),per.DAargs)), - bay = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired, p.adj),bay.DAargs)), + bay = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]]),bay.DAargs)), adx = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]]),adx.DAargs)), lim = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,relative,out.all, p.adj),lim.DAargs)), lli = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,relative,out.all, p.adj),lli.DAargs)), + lia = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj),lia.DAargs)), + lic = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj),lic.DAargs)), lli2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj),lli2.DAargs)), kru = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]], relative, p.adj),kru.DAargs)), aov = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars, relative, p.adj),aov.DAargs)), lao = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars,relative, p.adj),lao.DAargs)), + aoa = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars, p.adj),aoa.DAargs)), + aoc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars, p.adj),aoc.DAargs)), lao2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars, p.adj),lao2.DAargs)), lrm = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars, relative,out.all, p.adj),lrm.DAargs)), llm = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,relative,out.all, p.adj),llm.DAargs)), + lma = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj),lma.DAargs)), + lmc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj),lmc.DAargs)), llm2 = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,covars,out.all, p.adj),llm2.DAargs)), rai = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]], p.adj),rai.DAargs)), spe = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],relative, p.adj),spe.DAargs)), @@ -338,12 +258,11 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests znb = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],covars,relative,out.all, p.adj),znb.DAargs)), fri = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,relative, p.adj),fri.DAargs)), qua = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,relative, p.adj),qua.DAargs)), - anc = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,sig = alpha),anc.DAargs)), sam = do.call(get(noquote(paste0("DA.",i))),c(list(count_tables[[run.no]],rands[[run.no]],paired,fdr.output = alpha),sam.DAargs))), error = function(e) NULL) - if(!is.null(res.sub) & (!i %in% c("anc","sam","adx"))){ + if(!is.null(res.sub) & (!i %in% c("sam","adx"))){ res.sub[is.na(res.sub$pval),"pval"] <- 1 res.sub[is.na(res.sub$pval.adj),"pval.adj"] <- 1 } @@ -402,20 +321,6 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests names(res.sub) <- c(res.names,paste0(r,"_","adx.t"),paste0(r,"_","adx.w")) } - # Make pseudo-pval for ANCOM - if("anc" %in% gsub(".*_","",names(res.sub))){ - anc <- as.data.frame(res.sub[paste0(r,"_","anc")]) - colnames(anc) <- gsub(".*_anc.","",colnames(anc)) - suppressWarnings(min.d <- min(anc[anc$Detected == "Yes","W"])) - if(min.d == Inf) min.d <- max(anc$W)+1 - anc$pval <- 1/(anc$W+1) * 0.05/(1/(min.d+1)) - anc$pval.adj <- anc$pval - res.sub[paste0(r,"_","anc")] <- NULL - res.names <- names(res.sub) - res.sub <- c(res.sub,list(anc)) - names(res.sub) <- c(res.names,paste0(r,"_","anc")) - } - # Make pseudo-pval for SAMseq if("sam" %in% gsub(".*_","",names(res.sub))){ samdf <- as.data.frame(res.sub[paste0(r,"_","sam")]) @@ -501,20 +406,15 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests AUC = aucs, FPR = fprs, FDR = fdrs, - Spike.detect.rate = sdrs, + Power = sdrs, Run = r) rownames(df.combined) <- NULL - # SAMseq and ANCOM FPRs not possible + # SAMseq FPRs not possible if("sam" %in% newnames){ df.combined[df.combined$Method == "SAMseq (sam)","FPR"] <- NA } - if("anc" %in% newnames){ - df.combined[df.combined$Method == "ANCOM (anc)","FPR"] <- NA - } - - return(list(df.combined,res.sub)) - + return(list(df.combined,res.sub)) } output.results <- do.call(rbind,lapply(final.results, function(x) x[[1]])) diff --git a/R/vennDA.R b/R/vennDA.R index d5c0a4c..890c971 100644 --- a/R/vennDA.R +++ b/R/vennDA.R @@ -31,8 +31,8 @@ vennDA <- function(x, tests = NULL, alpha = 0.1, split = FALSE, output = FALSE, featurelist <- list() for(i in seq_along(plottests)){ sub <- x$adj[,c("Feature",plottests[i])] - if(!plottests[i] %in% c("sam","anc")) featurelist[[i]] <- sub[sub[,2] < alpha,"Feature"] - if(plottests[i] %in% c("sam","anc")) featurelist[[i]] <- sub[sub[,2] != "No","Feature"] + if(!plottests[i] %in% c("sam")) featurelist[[i]] <- sub[sub[,2] < alpha,"Feature"] + if(plottests[i] %in% c("sam")) featurelist[[i]] <- sub[sub[,2] != "No","Feature"] } # Split in negative and positive significant @@ -49,7 +49,7 @@ vennDA <- function(x, tests = NULL, alpha = 0.1, split = FALSE, output = FALSE, featurelist.pos[[i]] <- featurelist[[i]][featurelist[[i]] %in% sub.p] featurelist.neg[[i]] <- featurelist[[i]][featurelist[[i]] %in% sub.n] } - if(plottests[i] %in% c("mva","sam","znb","zpo","poi","qpo","neb","lrm","llm","llm2","lim","lli","lli2","vli","pea","spe","per","adx.t","adx.w","wil","ttt","ltt","ltt2","ere","ere2","erq","erq2","ds2","ds2x","msf","zig","rai")){ + if(plottests[i] %in% c("mva","sam","znb","zpo","poi","qpo","neb","lim","lli","lli2","vli","lia","lic","pea","spe","per","adx.t","adx.w","wil","ttt","ltt","ltt2","tta","ttc","ere","ere2","erq","erq2","ds2","ds2x","msf","zig","rai")){ if(is.null(ncol(subs))){ featurelist.pos[[i]] <- featurelist[[i]] featurelist.neg[[i]] <- featurelist[[i]] @@ -60,8 +60,8 @@ vennDA <- function(x, tests = NULL, alpha = 0.1, split = FALSE, output = FALSE, featurelist.neg[[i]] <- featurelist[[i]][featurelist[[i]] %in% sub.n] } } - # If not estimate/logFC provided throw all significant in both positive and negative list - if(!plottests[i] %in% c("mva","sam","bay","znb","zpo","poi","qpo","neb","lrm","llm","llm2","lim","lli","lli2","vli","pea","spe","per","adx.t","adx.w","wil","ttt","ltt","ltt2","ere","ere2","erq","erq2","ds2","ds2x","msf","zig","rai")){ + # If no estimate/logFC provided throw all significant in both positive and negative list + if(!plottests[i] %in% c("mva","sam","bay","znb","zpo","poi","qpo","neb","lim","lli","lli2","vli","lia","lic","pea","spe","per","adx.t","adx.w","wil","ttt","ltt","ltt2","tta","ttc","ere","ere2","erq","erq2","ds2","ds2x","msf","zig","rai")){ featurelist.pos[[i]] <- featurelist[[i]] featurelist.neg[[i]] <- featurelist[[i]] } @@ -108,12 +108,12 @@ vennDA <- function(x, tests = NULL, alpha = 0.1, split = FALSE, output = FALSE, # Remove the duplicate ones created earlier for methods without estimates/logFC if(split){ for(i in seq_along(plottests)){ - if(!plottests[i] %in% c("mva","sam","bay","znb","zpo","poi","qpo","neb","lrm","llm","llm2","lim","lli","lli2","vli","pea","spe","per","adx.t","adx.w","wil","ttt","ltt","ltt2","ere","ere2","erq","erq2","ds2","ds2x","msf","zig","rai")){ + if(!plottests[i] %in% c("mva","sam","bay","znb","zpo","poi","qpo","neb","lim","lli","lli2","vli","lia","lic","pea","spe","per","adx.t","adx.w","wil","ttt","ltt","ltt2","tta","ttc","ere","ere2","erq","erq2","ds2","ds2x","msf","zig","rai")){ venndf <- venndf[venndf$vennname != paste0(plottests[i],"_Negative"),] venndf$vennname <- as.character(venndf$vennname) venndf[venndf$vennname == paste0(plottests[i],"_Positive"),"vennname"] <- plottests[i] } - if(plottests[i] %in% c("znb","zpo","poi","qpo","neb","lrm","llm","llm2") & !plottests[i] %in% gsub("_.*","",colnames(x$est))){ + if(plottests[i] %in% c("znb","zpo","poi","qpo","neb") & !plottests[i] %in% gsub("_.*","",colnames(x$est))){ venndf <- venndf[venndf$vennname != paste0(plottests[i],"_Negative"),] venndf$vennname <- as.character(venndf$vennname) venndf[venndf$vennname == paste0(plottests[i],"_Positive"),"vennname"] <- plottests[i] diff --git a/R/zzz.R b/R/zzz.R index 0aca9ce..536d98f 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,5 +1,5 @@ .onLoad <- function(libname, pkgname){ - message("DAtest version 2.7.11") + message("DAtest version 2.7.12") } diff --git a/README.md b/README.md index 460822d..75e90a6 100644 --- a/README.md +++ b/README.md @@ -301,7 +301,7 @@ p-values are expected to be below 0.05. We therefore want an FPR at 0.05 or lower. FDR indicates the proportion of significant features (after multiple -correction) that we're not spiked and therefore shouldn't be +correction) that were not spiked and therefore shouldn't be significant. This should be as low as possible. AUC is estimated by ranking all features by their respective p-value, diff --git a/man/DA.TukeyHSD.Rd b/man/DA.TukeyHSD.Rd index e6d43d5..c291287 100644 --- a/man/DA.TukeyHSD.Rd +++ b/man/DA.TukeyHSD.Rd @@ -16,5 +16,5 @@ DA.TukeyHSD(results, variable = "predictor", p.adj = "fdr", ...) \item{...}{Additional arguments for \code{TukeyHSD} function} } \description{ -Works on "aov", "lao", "lao2" +Works on "aov", "lao", "lao2", "aoc", "aoa" } diff --git a/man/DA.anc.Rd b/man/DA.anc.Rd deleted file mode 100644 index 7e761ee..0000000 --- a/man/DA.anc.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DA.anc.R -\name{DA.anc} -\alias{DA.anc} -\title{ANCOM} -\usage{ -DA.anc(data, predictor, paired = NULL, allResults = FALSE, ...) -} -\arguments{ -\item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} - -\item{predictor}{The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} - -\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} - -\item{allResults}{If TRUE will return raw results from the \code{ANCOM} function} - -\item{...}{Additional arguments for the \code{ANCOM} function} -} -\description{ -Implementation of ANCOM for \code{DAtest} -} diff --git a/man/DA.anova.Rd b/man/DA.anova.Rd index cbde597..51f96a0 100644 --- a/man/DA.anova.Rd +++ b/man/DA.anova.Rd @@ -14,5 +14,5 @@ DA.anova(results, p.adj = "fdr", ...) \item{...}{Additional arguments for \code{anova} function} } \description{ -Works on "lrm", "llm", "llm2". Non-paired "neb" +Works on "lrm", "llm", "llm2", "lma", "lmc". Non-paired "neb" } diff --git a/man/DA.aoa.Rd b/man/DA.aoa.Rd new file mode 100644 index 0000000..661ccca --- /dev/null +++ b/man/DA.aoa.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DA.aoa.R +\name{DA.aoa} +\alias{DA.aoa} +\title{ANOVA - Multiplicative zero-correction and additive log-ratio normalization.} +\usage{ +DA.aoa(data, predictor, covars = NULL, p.adj = "fdr", delta = 1, + allResults = FALSE, ...) +} +\arguments{ +\item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} + +\item{predictor}{The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{covars}{Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}} + +\item{p.adj}{Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details} + +\item{delta}{Numeric. Pseudocount for zero-correction. Default 1} + +\item{allResults}{If TRUE will return raw results from the \code{aov} function} + +\item{...}{Additional arguments for the \code{aov} functions} +} +\description{ +Apply ANOVA on multiple features with one \code{predictor}. +} +\details{ +Note: Last feature in the data is used as reference for the log-ratio transformation. +} diff --git a/man/DA.aoc.Rd b/man/DA.aoc.Rd new file mode 100644 index 0000000..c7f6842 --- /dev/null +++ b/man/DA.aoc.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DA.aoc.R +\name{DA.aoc} +\alias{DA.aoc} +\title{ANOVA - Multiplicative zero-correction and additive log-ratio normalization.} +\usage{ +DA.aoc(data, predictor, covars = NULL, p.adj = "fdr", delta = 1, + allResults = FALSE, ...) +} +\arguments{ +\item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} + +\item{predictor}{The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{covars}{Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}} + +\item{p.adj}{Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details} + +\item{delta}{Numeric. Pseudocount for zero-correction. Default 1} + +\item{allResults}{If TRUE will return raw results from the \code{aov} function} + +\item{...}{Additional arguments for the \code{aov} functions} +} +\description{ +Apply ANOVA on multiple features with one \code{predictor}. +} diff --git a/man/DA.aov.Rd b/man/DA.aov.Rd index 0c45e4d..87d4519 100644 --- a/man/DA.aov.Rd +++ b/man/DA.aov.Rd @@ -4,8 +4,8 @@ \alias{DA.aov} \title{ANOVA} \usage{ -DA.aov(data, predictor, covars = NULL, relative = TRUE, p.adj = "fdr", - allResults = FALSE, ...) +DA.aov(data, predictor, covars = NULL, relative = TRUE, + p.adj = "fdr", allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.bay.Rd b/man/DA.bay.Rd index 5ab69ed..949750b 100644 --- a/man/DA.bay.Rd +++ b/man/DA.bay.Rd @@ -4,15 +4,13 @@ \alias{DA.bay} \title{baySeq} \usage{ -DA.bay(data, predictor, p.adj = "fdr", allResults = FALSE, ...) +DA.bay(data, predictor, allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} \item{predictor}{The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} -\item{p.adj}{Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details} - \item{allResults}{If TRUE will return raw results from the \code{getLikelihoods} function} \item{...}{Additional arguments to the \code{getPriors.NB} and \code{getLikelihoods} functions} diff --git a/man/DA.drop1.Rd b/man/DA.drop1.Rd index fbe110b..1115f5b 100644 --- a/man/DA.drop1.Rd +++ b/man/DA.drop1.Rd @@ -16,5 +16,5 @@ DA.drop1(results, test = "Chisq", p.adj = "fdr", ...) \item{...}{Additional arguments for \code{drop1} function} } \description{ -Works on "zpo", "znb", "qpo", "neb", "poi". Non-paired "lrm", "llm", "llm2" +Works on "zpo", "znb", "qpo", "neb", "poi". Non-paired "lrm", "llm", "llm2", "lma", "lmc" } diff --git a/man/DA.ds2x.Rd b/man/DA.ds2x.Rd index a735b20..dff02f7 100644 --- a/man/DA.ds2x.Rd +++ b/man/DA.ds2x.Rd @@ -4,8 +4,9 @@ \alias{DA.ds2x} \title{DESeq2} \usage{ -DA.ds2x(data, predictor, paired = NULL, covars = NULL, out.all = NULL, - p.adj = "fdr", coeff = 2, coeff.ref = 1, allResults = FALSE, ...) +DA.ds2x(data, predictor, paired = NULL, covars = NULL, + out.all = NULL, p.adj = "fdr", coeff = 2, coeff.ref = 1, + allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.erq2.Rd b/man/DA.erq2.Rd index dec7573..b62ca9b 100644 --- a/man/DA.erq2.Rd +++ b/man/DA.erq2.Rd @@ -4,8 +4,9 @@ \alias{DA.erq2} \title{EdgeR quasi-likelihood - RLE normalization} \usage{ -DA.erq2(data, predictor, paired = NULL, covars = NULL, out.all = NULL, - p.adj = "fdr", coeff = 2, allResults = FALSE, ...) +DA.erq2(data, predictor, paired = NULL, covars = NULL, + out.all = NULL, p.adj = "fdr", coeff = 2, allResults = FALSE, + ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.fri.Rd b/man/DA.fri.Rd index 902fb04..aa46889 100644 --- a/man/DA.fri.Rd +++ b/man/DA.fri.Rd @@ -4,8 +4,8 @@ \alias{DA.fri} \title{Friedman Rank Sum test} \usage{ -DA.fri(data, predictor, paired = NULL, relative = TRUE, p.adj = "fdr", - allResults = FALSE, ...) +DA.fri(data, predictor, paired = NULL, relative = TRUE, + p.adj = "fdr", allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.lao.Rd b/man/DA.lao.Rd index e210b53..227de8b 100644 --- a/man/DA.lao.Rd +++ b/man/DA.lao.Rd @@ -4,8 +4,8 @@ \alias{DA.lao} \title{ANOVA} \usage{ -DA.lao(data, predictor, covars = NULL, relative = TRUE, p.adj = "fdr", - delta = 1, allResults = FALSE, ...) +DA.lao(data, predictor, covars = NULL, relative = TRUE, + p.adj = "fdr", delta = 1, allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.lia.Rd b/man/DA.lia.Rd new file mode 100644 index 0000000..6dfd46a --- /dev/null +++ b/man/DA.lia.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DA.lia.R +\name{DA.lia} +\alias{DA.lia} +\title{LIMMA - Multiplicative zero-correction and additive log-ratio normalization.} +\usage{ +DA.lia(data, predictor, paired = NULL, covars = NULL, out.all = NULL, + p.adj = "fdr", delta = 1, coeff = 2, allResults = FALSE, ...) +} +\arguments{ +\item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} + +\item{predictor}{The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{covars}{Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}} + +\item{out.all}{If TRUE will output results from F-tests, if FALSE t-statistic results from 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise} + +\item{p.adj}{Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details} + +\item{delta}{Numeric. Pseudocount zero-correction. Default 1} + +\item{coeff}{Integer. The p-value and log2FoldChange will be associated with this coefficient. Default 2, i.e. the 2. level of the \code{predictor}.} + +\item{allResults}{If TRUE will return raw results from the \code{eBayes} function} + +\item{...}{Additional arguments for the \code{eBayes} and \code{lmFit} functions} +} +\description{ +Note: Last feature in the data is used as reference for the log-ratio transformation. +} diff --git a/man/DA.lic.Rd b/man/DA.lic.Rd new file mode 100644 index 0000000..4e5fcd5 --- /dev/null +++ b/man/DA.lic.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DA.lic.R +\name{DA.lic} +\alias{DA.lic} +\title{LIMMA - Multiplicative zero-correction and center log-ratio normalization.} +\usage{ +DA.lic(data, predictor, paired = NULL, covars = NULL, out.all = NULL, + p.adj = "fdr", delta = 1, coeff = 2, allResults = FALSE, ...) +} +\arguments{ +\item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} + +\item{predictor}{The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{covars}{Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}} + +\item{out.all}{If TRUE will output results from F-tests, if FALSE t-statistic results from 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise} + +\item{p.adj}{Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details} + +\item{delta}{Numeric. Pseudocount zero-correction. Default 1} + +\item{coeff}{Integer. The p-value and log2FoldChange will be associated with this coefficient. Default 2, i.e. the 2. level of the \code{predictor}.} + +\item{allResults}{If TRUE will return raw results from the \code{eBayes} function} + +\item{...}{Additional arguments for the \code{eBayes} and \code{lmFit} functions} +} +\description{ +Note: Last feature in the data is used as reference for the log-ratio transformation. +} diff --git a/man/DA.lim.Rd b/man/DA.lim.Rd index 21e76c2..28354de 100644 --- a/man/DA.lim.Rd +++ b/man/DA.lim.Rd @@ -4,8 +4,9 @@ \alias{DA.lim} \title{LIMMA} \usage{ -DA.lim(data, predictor, paired = NULL, covars = NULL, relative = TRUE, - out.all = NULL, p.adj = "fdr", coeff = 2, allResults = FALSE, ...) +DA.lim(data, predictor, paired = NULL, covars = NULL, + relative = TRUE, out.all = NULL, p.adj = "fdr", coeff = 2, + allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.lli.Rd b/man/DA.lli.Rd index 9e7e4c3..78e8e84 100644 --- a/man/DA.lli.Rd +++ b/man/DA.lli.Rd @@ -4,9 +4,9 @@ \alias{DA.lli} \title{LIMMA} \usage{ -DA.lli(data, predictor, paired = NULL, covars = NULL, relative = TRUE, - out.all = NULL, p.adj = "fdr", delta = 1, coeff = 2, - allResults = FALSE, ...) +DA.lli(data, predictor, paired = NULL, covars = NULL, + relative = TRUE, out.all = NULL, p.adj = "fdr", delta = 1, + coeff = 2, allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.lli2.Rd b/man/DA.lli2.Rd index 9835f2d..17ac916 100644 --- a/man/DA.lli2.Rd +++ b/man/DA.lli2.Rd @@ -4,8 +4,9 @@ \alias{DA.lli2} \title{LIMMA} \usage{ -DA.lli2(data, predictor, paired = NULL, covars = NULL, out.all = NULL, - p.adj = "fdr", delta = 0.001, coeff = 2, allResults = FALSE, ...) +DA.lli2(data, predictor, paired = NULL, covars = NULL, + out.all = NULL, p.adj = "fdr", delta = 0.001, coeff = 2, + allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.llm.Rd b/man/DA.llm.Rd index c57bb20..a0da332 100644 --- a/man/DA.llm.Rd +++ b/man/DA.llm.Rd @@ -4,8 +4,8 @@ \alias{DA.llm} \title{Log linear regression} \usage{ -DA.llm(data, predictor, paired = NULL, covars = NULL, relative = TRUE, - out.all = NULL, p.adj = "fdr", delta = 1, coeff = 2, coeff.ref = 1, +DA.llm(data, predictor, paired = NULL, covars = NULL, + relative = TRUE, out.all = NULL, p.adj = "fdr", delta = 1, allResults = FALSE, ...) } \arguments{ @@ -25,10 +25,6 @@ DA.llm(data, predictor, paired = NULL, covars = NULL, relative = TRUE, \item{delta}{Numeric. Pseudocount for the log transformation. Default 1} -\item{coeff}{Integer. The p-value and log2FoldChange will be associated with this coefficient. Default 2, i.e. the 2. level of the \code{predictor}.} - -\item{coeff.ref}{Integer. Reference level of the \code{predictor}. Will only affect the log2FC and ordering columns on the output. Default the intercept, = 1} - \item{allResults}{If TRUE will return raw results from the \code{lm}/\code{lme} function} \item{...}{Additional arguments for the \code{lm}/\code{lme} functions} diff --git a/man/DA.llm2.Rd b/man/DA.llm2.Rd index 3fc7f3a..7f01dc6 100644 --- a/man/DA.llm2.Rd +++ b/man/DA.llm2.Rd @@ -4,9 +4,9 @@ \alias{DA.llm2} \title{Log linear regression} \usage{ -DA.llm2(data, predictor, paired = NULL, covars = NULL, out.all = NULL, - p.adj = "fdr", delta = 0.001, coeff = 2, coeff.ref = 1, - allResults = FALSE, ...) +DA.llm2(data, predictor, paired = NULL, covars = NULL, + out.all = NULL, p.adj = "fdr", delta = 0.001, allResults = FALSE, + ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} @@ -23,10 +23,6 @@ DA.llm2(data, predictor, paired = NULL, covars = NULL, out.all = NULL, \item{delta}{Numeric. Pseudocount for the log transformation. Default 0.001} -\item{coeff}{Integer. The p-value and log2FoldChange will be associated with this coefficient. Default 2, i.e. the 2. level of the \code{predictor}.} - -\item{coeff.ref}{Integer. Reference level of the \code{predictor}. Will only affect the log2FC and ordering columns on the output. Default the intercept, = 1} - \item{allResults}{If TRUE will return raw results from the \code{lm}/\code{lme} function} \item{...}{Additional arguments for the \code{lm}/\code{lme} functions} diff --git a/man/DA.lma.Rd b/man/DA.lma.Rd new file mode 100644 index 0000000..3049bf3 --- /dev/null +++ b/man/DA.lma.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DA.lma.R +\name{DA.lma} +\alias{DA.lma} +\title{Linear regression - Multiplicative zero-correction and additive log-ratio normalization.} +\usage{ +DA.lma(data, predictor, paired = NULL, covars = NULL, out.all = NULL, + p.adj = "fdr", delta = 1, allResults = FALSE, ...) +} +\arguments{ +\item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} + +\item{predictor}{The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{covars}{Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}} + +\item{out.all}{If TRUE will output results and p-values from \code{anova}. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise} + +\item{p.adj}{Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details} + +\item{delta}{Numeric. Pseudocount for zero-correction. Default 1} + +\item{allResults}{If TRUE will return raw results from the \code{lm}/\code{lme} function} + +\item{...}{Additional arguments for the \code{lm}/\code{lme} functions} +} +\description{ +Apply linear regression for multiple features with one \code{predictor}. +Mixed-effect model is used when a \code{paired} argument is included, with the \code{paired} variable as a random intercept. +Note: Last feature in the data is used as reference for the log-ratio transformation. +} diff --git a/man/DA.lmc.Rd b/man/DA.lmc.Rd new file mode 100644 index 0000000..2ef9281 --- /dev/null +++ b/man/DA.lmc.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DA.lmc.R +\name{DA.lmc} +\alias{DA.lmc} +\title{Linear regression - Multiplicative zero-correction and center log-ratio normalization.} +\usage{ +DA.lmc(data, predictor, paired = NULL, covars = NULL, out.all = NULL, + p.adj = "fdr", delta = 1, allResults = FALSE, ...) +} +\arguments{ +\item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} + +\item{predictor}{The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{covars}{Either a named list with covariables, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}} + +\item{out.all}{If TRUE will output results and p-values from \code{anova}. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise} + +\item{p.adj}{Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details} + +\item{delta}{Numeric. Pseudocount for zero-correction. Default 1} + +\item{allResults}{If TRUE will return raw results from the \code{lm}/\code{lme} function} + +\item{...}{Additional arguments for the \code{lm}/\code{lme} functions} +} +\description{ +Apply linear regression for multiple features with one \code{predictor}. +Mixed-effect model is used when a \code{paired} argument is included, with the \code{paired} variable as a random intercept. +} diff --git a/man/DA.lrm.Rd b/man/DA.lrm.Rd index 0b4764a..38774bf 100644 --- a/man/DA.lrm.Rd +++ b/man/DA.lrm.Rd @@ -4,8 +4,8 @@ \alias{DA.lrm} \title{Linear regression} \usage{ -DA.lrm(data, predictor, paired = NULL, covars = NULL, relative = TRUE, - out.all = NULL, p.adj = "fdr", coeff = 2, coeff.ref = 1, +DA.lrm(data, predictor, paired = NULL, covars = NULL, + relative = TRUE, out.all = NULL, p.adj = "fdr", allResults = FALSE, ...) } \arguments{ @@ -23,10 +23,6 @@ DA.lrm(data, predictor, paired = NULL, covars = NULL, relative = TRUE, \item{p.adj}{Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details} -\item{coeff}{Integer. The p-value and log2FoldChange will be associated with this coefficient. Default 2, i.e. the 2. level of the \code{predictor}.} - -\item{coeff.ref}{Integer. Reference level of the \code{predictor}. Will only affect the log2FC and ordering columns on the output. Default the intercept, = 1} - \item{allResults}{If TRUE will return raw results from the \code{lm}/\code{lme} function} \item{...}{Additional arguments for the \code{lm}/\code{lme} functions} diff --git a/man/DA.lsmeans.Rd b/man/DA.lsmeans.Rd index 3b51269..20840db 100644 --- a/man/DA.lsmeans.Rd +++ b/man/DA.lsmeans.Rd @@ -12,16 +12,16 @@ DA.lsmeans(results, variable = "predictor", predictor = NULL, \item{variable}{Which variable to test. Default predictor. Alternatively, the name of a covar} -\item{predictor}{If results come from a paired "lrm", "llm" or "llm2" supply the original predictor variable in the form of as a vector} +\item{predictor}{If results come from a paired "lrm", "llm", "lma", "lmc" or "llm2" supply the original predictor variable in the form of as a vector} -\item{covars}{If results come from a paired "lrm", "llm" or "llm2" supply the original covars in the form of a named list} +\item{covars}{If results come from a paired "lrm", "lma", "lmc", "llm" or "llm2" supply the original covars in the form of a named list} \item{p.adj}{P-value adjustment method. See \code{p.adjust} for details} \item{...}{Additional arguments for \code{lsmeans} function} } \description{ -Pairwise testing on predictor and covars. Works on "poi", "neb", "lrm", "llm", "llm2", "qpo", "znb", "zpo". +Pairwise testing on predictor and covars. Works on "poi", "neb", "lrm", "lma", "lmc", "llm", "llm2", "qpo", "znb", "zpo". } \details{ Require the \code{lsmeans} package diff --git a/man/DA.ltt.Rd b/man/DA.ltt.Rd index e9991dd..5b1ec99 100644 --- a/man/DA.ltt.Rd +++ b/man/DA.ltt.Rd @@ -4,10 +4,11 @@ \alias{DA.ltt} \title{Welch t-test} \usage{ -DA.ltt(data, predictor, paired = NULL, relative = TRUE, p.adj = "fdr", - delta = 1, testStat = function(case, control) { log2((mean(case) + - 0.001)/(mean(control) + 0.001)) }, testStat.pair = function(case, control) { - log2(mean((case + 0.001)/(control + 0.001))) }, allResults = FALSE, ...) +DA.ltt(data, predictor, paired = NULL, relative = TRUE, + p.adj = "fdr", delta = 1, testStat = function(case, control) { + log2((mean(case) + 0.001)/(mean(control) + 0.001)) }, + testStat.pair = function(case, control) { log2(mean((case + + 0.001)/(control + 0.001))) }, allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.mva.Rd b/man/DA.mva.Rd index ccd4544..5f92c0c 100644 --- a/man/DA.mva.Rd +++ b/man/DA.mva.Rd @@ -4,9 +4,9 @@ \alias{DA.mva} \title{Mvabund} \usage{ -DA.mva(data, predictor, paired = NULL, covars = NULL, relative = TRUE, - p.adj = "fdr", coeff = 2, coeff.ref = 1, resamp = "montecarlo", - allResults = FALSE, ...) +DA.mva(data, predictor, paired = NULL, covars = NULL, + relative = TRUE, p.adj = "fdr", coeff = 2, coeff.ref = 1, + resamp = "montecarlo", allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.neb.Rd b/man/DA.neb.Rd index 7325786..bec8b95 100644 --- a/man/DA.neb.Rd +++ b/man/DA.neb.Rd @@ -4,9 +4,9 @@ \alias{DA.neb} \title{Negative binomial glm} \usage{ -DA.neb(data, predictor, paired = NULL, covars = NULL, relative = TRUE, - out.all = NULL, p.adj = "fdr", coeff = 2, coeff.ref = 1, - allResults = FALSE, ...) +DA.neb(data, predictor, paired = NULL, covars = NULL, + relative = TRUE, out.all = NULL, p.adj = "fdr", coeff = 2, + coeff.ref = 1, allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.per.Rd b/man/DA.per.Rd index af7f5c8..68f83d4 100644 --- a/man/DA.per.Rd +++ b/man/DA.per.Rd @@ -4,11 +4,11 @@ \alias{DA.per} \title{Permutation test of user-defined test statistic} \usage{ -DA.per(data, predictor, paired = NULL, relative = TRUE, p.adj = "fdr", - testStat = function(case, control) { log2((mean(case) + - 1)/(mean(control) + 1)) }, testStat.pair = function(case, control) { - log2(mean((case + 1)/(control + 1))) }, noOfIterations = 10000, - margin = 50) +DA.per(data, predictor, paired = NULL, relative = TRUE, + p.adj = "fdr", testStat = function(case, control) { + log2((mean(case) + 1)/(mean(control) + 1)) }, + testStat.pair = function(case, control) { log2(mean((case + + 1)/(control + 1))) }, noOfIterations = 10000, margin = 50) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.poi.Rd b/man/DA.poi.Rd index c5227d6..e25d885 100644 --- a/man/DA.poi.Rd +++ b/man/DA.poi.Rd @@ -4,9 +4,9 @@ \alias{DA.poi} \title{Poisson glm} \usage{ -DA.poi(data, predictor, paired = NULL, covars = NULL, relative = TRUE, - out.all = NULL, p.adj = "fdr", coeff = 2, coeff.ref = 1, - allResults = FALSE, ...) +DA.poi(data, predictor, paired = NULL, covars = NULL, + relative = TRUE, out.all = NULL, p.adj = "fdr", coeff = 2, + coeff.ref = 1, allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.qpo.Rd b/man/DA.qpo.Rd index 6ac4d2a..3bb3d03 100644 --- a/man/DA.qpo.Rd +++ b/man/DA.qpo.Rd @@ -4,8 +4,9 @@ \alias{DA.qpo} \title{Quasi-poisson glm} \usage{ -DA.qpo(data, predictor, covars = NULL, relative = TRUE, out.all = NULL, - p.adj = "fdr", coeff = 2, coeff.ref = 1, allResults = FALSE, ...) +DA.qpo(data, predictor, covars = NULL, relative = TRUE, + out.all = NULL, p.adj = "fdr", coeff = 2, coeff.ref = 1, + allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.qua.Rd b/man/DA.qua.Rd index a129ad4..5dfb2ac 100644 --- a/man/DA.qua.Rd +++ b/man/DA.qua.Rd @@ -4,8 +4,8 @@ \alias{DA.qua} \title{Quade test} \usage{ -DA.qua(data, predictor, paired = NULL, relative = TRUE, p.adj = "fdr", - allResults = FALSE, ...) +DA.qua(data, predictor, paired = NULL, relative = TRUE, + p.adj = "fdr", allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.tta.Rd b/man/DA.tta.Rd new file mode 100644 index 0000000..c99bf70 --- /dev/null +++ b/man/DA.tta.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DA.tta.R +\name{DA.tta} +\alias{DA.tta} +\title{Welch t-test - Multiplicative zero-correction and additive log-ratio normalization.} +\usage{ +DA.tta(data, predictor, paired = NULL, p.adj = "fdr", delta = 1, + testStat = function(case, control) { mean(case) - mean(control) }, + testStat.pair = function(case, control) { mean(case - control) }, + allResults = FALSE, ...) +} +\arguments{ +\item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} + +\item{predictor}{The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if data is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{p.adj}{Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details} + +\item{delta}{Numeric. Pseudocount for zero-correction.} + +\item{testStat}{Function. Function for calculating fold change. Should take two vectors as arguments. Default is a simple difference: \code{mean(case abundances)-mean(control abundances)}} + +\item{testStat.pair}{Function. Function for calculating fold change. Should take two vectors as arguments. Default is a simple difference: \code{mean(case abundances-control abundances)}} + +\item{allResults}{If TRUE will return raw results from the \code{t.test} function} + +\item{...}{Additional arguments for the \code{t.test} function} +} +\description{ +Apply welch t-test to multiple features with one \code{predictor} +} +\details{ +Note: Last feature in the data is used as reference for the log-ratio transformation. +} diff --git a/man/DA.ttc.Rd b/man/DA.ttc.Rd new file mode 100644 index 0000000..a77fc64 --- /dev/null +++ b/man/DA.ttc.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DA.ttc.R +\name{DA.ttc} +\alias{DA.ttc} +\title{Welch t-test - Multiplicative zero-correction and center log-ratio normalization.} +\usage{ +DA.ttc(data, predictor, paired = NULL, p.adj = "fdr", delta = 1, + testStat = function(case, control) { mean(case) - mean(control) }, + testStat.pair = function(case, control) { mean(case - control) }, + allResults = FALSE, ...) +} +\arguments{ +\item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} + +\item{predictor}{The predictor of interest. Factor, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if data is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation} + +\item{p.adj}{Character. P-value adjustment. Default "fdr". See \code{p.adjust} for details} + +\item{delta}{Numeric. Pseudocount for zero-correction.} + +\item{testStat}{Function. Function for calculating fold change. Should take two vectors as arguments. Default is a simple difference: \code{mean(case abundances)-mean(control abundances)}} + +\item{testStat.pair}{Function. Function for calculating fold change. Should take two vectors as arguments. Default is a simple difference: \code{mean(case abundances-control abundances)}} + +\item{allResults}{If TRUE will return raw results from the \code{t.test} function} + +\item{...}{Additional arguments for the \code{t.test} function} +} +\description{ +Apply welch t-test to multiple features with one \code{predictor} +} diff --git a/man/DA.ttt.Rd b/man/DA.ttt.Rd index fb425a4..42fa9f7 100644 --- a/man/DA.ttt.Rd +++ b/man/DA.ttt.Rd @@ -4,10 +4,11 @@ \alias{DA.ttt} \title{Welch t-test} \usage{ -DA.ttt(data, predictor, paired = NULL, relative = TRUE, p.adj = "fdr", - testStat = function(case, control) { log2((mean(case) + - 0.001)/(mean(control) + 0.001)) }, testStat.pair = function(case, control) { - log2(mean((case + 0.001)/(control + 0.001))) }, allResults = FALSE, ...) +DA.ttt(data, predictor, paired = NULL, relative = TRUE, + p.adj = "fdr", testStat = function(case, control) { + log2((mean(case) + 0.001)/(mean(control) + 0.001)) }, + testStat.pair = function(case, control) { log2(mean((case + + 0.001)/(control + 0.001))) }, allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.wil.Rd b/man/DA.wil.Rd index 962bf79..86b8469 100644 --- a/man/DA.wil.Rd +++ b/man/DA.wil.Rd @@ -4,10 +4,11 @@ \alias{DA.wil} \title{Wilcoxon Rank Sum and Signed Rank Test} \usage{ -DA.wil(data, predictor, paired = NULL, relative = TRUE, p.adj = "fdr", - testStat = function(case, control) { log2((mean(case) + - 0.001)/(mean(control) + 0.001)) }, testStat.pair = function(case, control) { - log2(mean((case + 0.001)/(control + 0.001))) }, allResults = FALSE, ...) +DA.wil(data, predictor, paired = NULL, relative = TRUE, + p.adj = "fdr", testStat = function(case, control) { + log2((mean(case) + 0.001)/(mean(control) + 0.001)) }, + testStat.pair = function(case, control) { log2(mean((case + + 0.001)/(control + 0.001))) }, allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.znb.Rd b/man/DA.znb.Rd index 1f4b89a..b01c036 100644 --- a/man/DA.znb.Rd +++ b/man/DA.znb.Rd @@ -4,8 +4,9 @@ \alias{DA.znb} \title{Zero inflated Negative Binomial glm} \usage{ -DA.znb(data, predictor, covars = NULL, relative = TRUE, out.all = NULL, - p.adj = "fdr", coeff = 2, coeff.ref = 1, allResults = FALSE, ...) +DA.znb(data, predictor, covars = NULL, relative = TRUE, + out.all = NULL, p.adj = "fdr", coeff = 2, coeff.ref = 1, + allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/DA.zpo.Rd b/man/DA.zpo.Rd index 50ccb54..d0350c2 100644 --- a/man/DA.zpo.Rd +++ b/man/DA.zpo.Rd @@ -4,8 +4,9 @@ \alias{DA.zpo} \title{Zero inflated Poisson glm} \usage{ -DA.zpo(data, predictor, covars = NULL, relative = TRUE, out.all = NULL, - p.adj = "fdr", coeff = 2, coeff.ref = 1, allResults = FALSE, ...) +DA.zpo(data, predictor, covars = NULL, relative = TRUE, + out.all = NULL, p.adj = "fdr", coeff = 2, coeff.ref = 1, + allResults = FALSE, ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples} diff --git a/man/allDA.Rd b/man/allDA.Rd index cba595f..55e5dc8 100644 --- a/man/allDA.Rd +++ b/man/allDA.Rd @@ -4,11 +4,12 @@ \alias{allDA} \title{Run many differential abundance/expression methods} \usage{ -allDA(data, predictor, paired = NULL, covars = NULL, tests = c("neb", - "per", "bay", "adx", "sam", "qua", "fri", "znb", "zpo", "vli", "qpo", "poi", - "pea", "spe", "wil", "ttt", "ltt", "ltt2", "erq", "ere", "erq2", "ere2", - "msf", "zig", "ds2", "ds2x", "lim", "lli", "lli2", "aov", "lao", "lao2", - "kru", "lrm", "llm", "llm2", "rai"), relative = TRUE, +allDA(data, predictor, paired = NULL, covars = NULL, tests = c("mva", + "neb", "per", "bay", "adx", "sam", "qua", "fri", "znb", "zpo", "vli", + "qpo", "poi", "pea", "spe", "wil", "ttt", "ltt", "ltt2", "erq", "ere", + "erq2", "ere2", "msf", "zig", "ds2", "ds2x", "lim", "lli", "lli2", "aov", + "lao", "lao2", "kru", "lrm", "llm", "llm2", "rai", "tta", "ttc", "aoa", + "aoc", "lma", "lmc", "lia", "lic"), relative = TRUE, cores = (detectCores() - 1), rng.seed = 123, p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE) } @@ -17,13 +18,13 @@ allDA(data, predictor, paired = NULL, covars = NULL, tests = c("neb", \item{predictor}{The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses} -\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. Only for "poi", "per", "ttt", "ltt", "ltt2", "neb", "wil", "erq", "ds2", "ds2x", "lrm", "llm", "llm2", "lim", "lli", "lli2", "zig", "anc", "mva" and "fri"} +\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation.} \item{covars}{Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}} -\item{tests}{Character. Which tests to include. Default all (Except ANCOM and mvabund, see below for details)} +\item{tests}{Character. Which tests to include. Default all} -\item{relative}{Logical. If TRUE (default) abundances are made relative for "ttt", "ltt", "wil", "per", "aov", "lao", "kru", "lim", "lli", "lrm", "llm", "spe" and "pea", and there is an offset of \code{log(LibrarySize)} for "mva", "neb", "poi", "qpo", "zpo" and "znb"} +\item{relative}{Logical. TRUE (default) for compositional data. FALSE for absoloute abundances or pre-normalized data.} \item{cores}{Integer. Number of cores to use for parallel computing. Default one less than available} @@ -43,7 +44,7 @@ allDA(data, predictor, paired = NULL, covars = NULL, tests = c("neb", A list of results: \itemize{ \item raw - A data.frame with raw p-values from all methods - \item adj - A data.frame with adjusted p-values from all methods (detection/no-detection from anc and sam) + \item adj - A data.frame with adjusted p-values from all methods (detection/no-detection from sam) \item est - A data.frame with estimates/fold.changes from all relevant methods \item details - A dataframe with details from the run \item results - A complete list of output from all the methods. Example: Get wilcoxon results from 2. run as such: \code{$results[[2]]["wil"]} @@ -52,94 +53,3 @@ A list of results: \description{ Run many differential abundance and expression tests at a time, to easily compare their results } -\details{ -Currently implemented methods: -\itemize{ - \item per - Permutation test with user defined test statistic - \item bay - baySeq - \item adx - ALDEx t-test and wilcoxon - \item wil - Wilcoxon Rank Sum on relative abundances - \item ttt - Welch t.test on relative abundances - \item ltt - Welch t.test, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances - \item ltt2 - Welch t.test, but with relative abundances transformed with \code{log(relative abundance + delta2)} - \item neb - Negative binomial GLM with log of library size as offset - \item erq - EdgeR - Quasi likelihood - TMM normalization - \item ere - EdgeR - Exact test - TMM normalization - \item erq2 - EdgeR - Quasi likelihood - RLE normalization - \item ere2 - EdgeR - Exact test - RLE normalization - \item msf - MetagenomeSeq feature model - \item zig - MetagenomeSeq zero-inflated gaussian - \item ds2 - DESeq2 - \item ds2x - DESeq2 with manual geometric means - \item lim - LIMMA. Moderated linear models based on emperical bayes - \item lli - LIMMA, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances - \item lli2 - LIMMA, but with relative abundances transformed with \code{log(relative abundance + delta2)} - \item kru - Kruskal-Wallis on relative abundances - \item aov - ANOVA on relative abundances - \item lao - ANOVA, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances - \item lao2 - ANOVA, but with relative abundances transformed with \code{log(relative abundance + delta2)} - \item lrm - Linear regression on relative abundances - \item llm - Linear regression, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances - \item llm2 - Linear regression, but with relative abundances transformed with \code{log(relative abundance + delta2)} - \item rai - RAIDA - \item spe - Spearman correlation - \item pea - Pearson correlation - \item poi - Poisson GLM with log of library size as offset - \item qpo - Quasi-Poisson GLM with log of library size as offset - \item vli - Limma with voom - \item zpo - Zero-inflated Poisson GLM - \item znb - Zero-inflated Negative Binomial GLM - \item fri - Friedman Rank Sum test - \item qua - Quade test - \item anc - ANCOM (by default not included, as it is very slow) - \item sam - SAMSeq - \item mva - mvabund (by default not included, as it is very slow) - \item zzz - A user-defined method (See \code{?DA.zzz}) -} - -Additional arguments can be passed to the internal functions with the \code{args} argument. -It should be structured as a list with elements named by the tests: -E.g. passing to the \code{DA.per} function that it should only run 1000 iterations: \code{args = list(per=list(noOfIterations=1000))}. -Include that the log t.test should use a pseudocount of 0.1: \code{args = list(per=list(noOfIterations=1000), ltt=list(delta=0.1))}. -Additional arguments are simply seperated by commas. - -Below is an overview of which functions get the arguments that are passed to a specific test -\itemize{ - \item per - Passed to \code{DA.per} - \item bay - Passed to \code{getPriors}, \code{getLikelihoods} and \code{DA.bay} - \item adx - Passed to \code{aldex} and \code{DA.adx} - \item wil - Passed to \code{wilcox.test} and \code{DA.wil} - \item ttt - Passed to \code{t.test} and \code{DA.ttt} - \item ltt - Passed to \code{t.test} and \code{DA.ltt} - \item ltt2 - Passed to \code{t.test} and \code{DA.ltt2} - \item neb - Passed to \code{glm.nb}, \code{glmer.nb} and \code{DA.neb} - \item erq(2) - Passed to \code{calcNormFactors}, \code{estimateDisp}, \code{glmQLFit}, \code{glmQLFTest} and \code{DA.erq} - \item ere(2) - Passed to \code{calcNormFactors}, \code{estimateCommonDisp}, \code{estimateTagwiseDisp}, \code{exactTest} and \code{DA.ere} - \item msf - Passed to \code{fitFeatureModel} and \code{DA.msf} - \item zig - Passed to \code{fitZig} and \code{DA.zig} - \item ds2(x) - Passed to \code{DESeq} and \code{DA.ds2} - \item lim - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lim} - \item lli - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lli} - \item lli2 - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lli2} - \item kru - Passed to \code{kruskal.test} and \code{DA.kru} - \item aov - Passed to \code{aov} and \code{DA.aov} - \item lao - Passed to \code{aov} and \code{DA.lao} - \item lao2 - Passed to \code{aov} and \code{DA.lao2} - \item lrm - Passed to \code{lm}, \code{lme} and \code{DA.lrm} - \item llm - Passed to \code{lm}, \code{lme} and \code{DA.llm} - \item llm2 - Passed to \code{lm}, \code{lme} and \code{DA.llm2} - \item rai - Passed to \code{raida} and \code{DA.rai} - \item spe - Passed to \code{cor.test} and \code{DA.spe} - \item pea - Passed to \code{cor.test} and \code{DA.pea} - \item poi - Passed to \code{glm}, \code{glmer} and \code{DA.poi} - \item qpo - Passed to \code{glm} and \code{DA.qpo} - \item vli - Passed to \code{voom}, \code{eBayes}, \code{lmFit} and \code{DA.vli} - \item zpo - Passed to \code{zeroinfl} and \code{DA.zpo} - \item znb - Passed to \code{zeroinfl} and \code{DA.znb} - \item fri - Passed to \code{friedman.test} and \code{DA.fri} - \item qua - Passed to \code{quade.test} and \code{DA.qua} - \item anc - Passed to \code{ANCOM} and \code{DA.anc} - \item sam - Passed to \code{SAMseq} and \code{DA.sam} - \item mva - Passed to \code{manyglm} and \code{summary.manyglm} -} -} diff --git a/man/featurePlot.Rd b/man/featurePlot.Rd index ec3448e..e5de922 100644 --- a/man/featurePlot.Rd +++ b/man/featurePlot.Rd @@ -4,9 +4,9 @@ \alias{featurePlot} \title{Plot association between abundance of a feature and predictor} \usage{ -featurePlot(data, predictor, paired = NULL, covars = NULL, feature = NULL, - relative = TRUE, logScale = FALSE, delta = 0.001, covar.quant = c(0, - 1/3, 2/3, 1)) +featurePlot(data, predictor, paired = NULL, covars = NULL, + feature = NULL, relative = TRUE, logScale = FALSE, delta = 0.001, + covar.quant = c(0, 1/3, 2/3, 1)) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames} diff --git a/man/groupSig.Rd b/man/groupSig.Rd deleted file mode 100644 index a08d36a..0000000 --- a/man/groupSig.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/groupSig.R -\name{groupSig} -\alias{groupSig} -\title{Are some groups/taxa overrepresented among significant features} -\usage{ -groupSig(results, group.df, group.cols = 1, split = TRUE, alpha = 0.05, - p.adj = "fdr", alternative = "greater") -} -\arguments{ -\item{results}{Data.frame with results from a \code{DAtest} function} - -\item{group.df}{Data.frame with columns defining the groups. rownames should name the features matching the \code{Feature} column in \code{results}. E.g. \code{tax_table} from a \code{phyloseq} object} - -\item{group.cols}{Numeric vector defining which column(s) contain(s) the groups in \code{group.df}. Default first column.} - -\item{split}{If TRUE will split tests in positive and negative effect sizes if possible. Default TRUE} - -\item{alpha}{Threshold for significance calling. Default 0.05} - -\item{p.adj}{Method for p-value adjustment. Default "fdr"} - -\item{alternative}{What to test for. "greater" (default) is testing only overrepresentation (OR > 1), "less" only underrepresentation (OR < 1), and "two.sided" tests over- and under-representation (OR != 1)} -} -\value{ -A data.frame with odds ratios (OR), p-values, adjusted p-values, groups, name of groups, and direction of effect if split = TRUE -} -\description{ -Test if some groups of features are overpresented among significant features. -The groups can be anything; for OTU data e.g. genera/family/order/class/phylum, for transciptomics e.g. KEGG pathway. -} -\details{ -OR in output is odds ratio from fisher's exact test. If OR is above 1 it means that the group is overrepresented among significant features. -} diff --git a/man/plot.DA.Rd b/man/plot.DA.Rd index 35be0b4..0e6785f 100644 --- a/man/plot.DA.Rd +++ b/man/plot.DA.Rd @@ -9,7 +9,7 @@ \arguments{ \item{x}{The output from the \code{testDA} function} -\item{sort}{Sort methods by median \code{c("AUC","FDR","Spike.detect.rate","Score")}} +\item{sort}{Sort methods by median \code{c("AUC","FDR","Power","Score")}} \item{p}{Logical. Should the p-value distribution be plotted (only p-values from non-spiked features)} diff --git a/man/powerDA.Rd b/man/powerDA.Rd index 8dc420e..396328e 100644 --- a/man/powerDA.Rd +++ b/man/powerDA.Rd @@ -15,7 +15,7 @@ powerDA(data, predictor, paired = NULL, covars = NULL, test = NULL, \item{predictor}{The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses} -\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. Only for "anc", "poi", "per", "ttt", "ltt", "ltt2", "neb", "wil", "erq", "ds2", "ds2x", "lrm", "llm", "llm2", "lim", "lli", "lli2", "mva", and "zig"} +\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation.} \item{covars}{Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}} @@ -25,13 +25,13 @@ powerDA(data, predictor, paired = NULL, covars = NULL, test = NULL, \item{alpha.p}{p-value threshold for false positive rates. Default 0.05} -\item{alpha.q}{q-value threshold for determining significance for \code{empirical power}. Default 0.1. This will change \code{fdr.output} for "sam" and \code{sig} for "anc".} +\item{alpha.q}{q-value threshold for determining significance for \code{empirical power}. Default 0.1. This will change \code{fdr.output} for "sam".} \item{p.adj}{Character. Method for p-value adjustment. See \code{p.adjust} for details. Default "fdr"} \item{R}{Integer. Number of times to run the tests. Default 5} -\item{relative}{Logical. If TRUE (default) abundances are made relative for "ttt", "ltt", "wil", "per", "aov", "lao", "kru", "lim", "lli", "lrm", "llm", "spe" and "pea", and there is an offset of \code{log(LibrarySize)} for "mva", "neb", "poi", "qpo", "zpo" and "znb"} +\item{relative}{Logical. TRUE (default) for compositional data, FALSE for absolute abundances or pre-normalized data.} \item{k}{Vector of length 3. Number of Features to spike in each tertile (lower, mid, upper). E.g. \code{k=c(5,10,15)}: 5 features spiked in low abundance tertile, 10 features spiked in mid abundance tertile and 15 features spiked in high abundance tertile. Default NULL, which will spike 2 percent of the total amount of features in each tertile (a total of 6 percent), but minimum c(5,5,5)} @@ -41,7 +41,7 @@ powerDA(data, predictor, paired = NULL, covars = NULL, test = NULL, \item{args}{List. A list with lists of arguments passed to the different methods. See details for more.} -\item{out.all}{If TRUE linear models will output results and p-values from \code{anova}/\code{drop1}, ds2/ds2x will run LRT and not Wald test, erq and erq2 will produce one p-value for the predictor, and lim, lli, lli2, lim, vli will run F-tests. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class predictors and FALSE otherwise} +\item{out.all}{If TRUE linear models will output results and p-values from \code{anova}/\code{drop1}, ds2/ds2x will run LRT and not Wald test, erq and erq2 will produce one p-value for the predictor, and limma will run F-tests. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class predictors and FALSE otherwise} \item{core.check}{If TRUE will make an interactive check that the amount of cores specified are desired. Only if \code{cores>20}. This is to ensure that the function doesn't automatically overloads a server with workers.} diff --git a/man/prune.tests.DA.Rd b/man/prune.tests.DA.Rd index bb70016..03d2223 100644 --- a/man/prune.tests.DA.Rd +++ b/man/prune.tests.DA.Rd @@ -4,7 +4,7 @@ \alias{prune.tests.DA} \title{Prune tests argument for \code{testDA}} \usage{ -prune.tests.DA(tests, predictor, paired, covars, relative, decimal) +prune.tests.DA(tests, predictor, paired, covars, relative, decimal, zeroes) } \arguments{ \item{tests}{A character vector with names of tests} @@ -18,6 +18,8 @@ prune.tests.DA(tests, predictor, paired, covars, relative, decimal) \item{relative}{Include tests that work with relative abundances (TRUE) or only absolute abundances (FALSE)} \item{decimal}{Exclude tests that do not work with decimals (TRUE)} + +\item{zeroes}{If FALSE will exclude tests that include zero-inflation} } \description{ Prune tests argument for \code{testDA} diff --git a/man/runtimeDA.Rd b/man/runtimeDA.Rd index 26662cf..20ffd3d 100644 --- a/man/runtimeDA.Rd +++ b/man/runtimeDA.Rd @@ -5,19 +5,20 @@ \title{Estimate runtime of \code{testDA} on large datasets} \usage{ runtimeDA(data, predictor, paired = NULL, covars = NULL, - subsamples = c(500, 1000, 1500, 2000), subsamples.slow = c(100, 150, 200, - 250), tests = c("sam", "qua", "fri", "vli", "qpo", "pea", "wil", "ttt", - "ltt", "ltt2", "ere", "ere2", "msf", "zig", "lim", "lli", "lli2", "aov", - "lao", "lao2", "kru", "lrm", "llm", "llm2", "spe"), tests.slow = c("neb", - "bay", "per", "zpo", "znb", "rai", "adx", "ds2", "ds2x", "poi", "erq", - "erq2"), cores = (detectCores() - 1), ...) + subsamples = c(500, 1000, 1500, 2000), subsamples.slow = c(100, 150, + 200, 250), tests = c("sam", "qua", "fri", "vli", "qpo", "pea", "wil", + "ttt", "ltt", "ltt2", "ere", "ere2", "msf", "zig", "lim", "lli", "lli2", + "aov", "lao", "lao2", "kru", "lrm", "llm", "llm2", "spe", "aoa", "aoc", + "aoi", "tta", "ttc", "tti", "lma", "lmc", "lmi", "lia", "lic", "lii"), + tests.slow = c("mva", "neb", "bay", "per", "zpo", "znb", "rai", "adx", + "ds2", "ds2x", "poi", "erq", "erq2"), cores = (detectCores() - 1), ...) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames} \item{predictor}{The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses} -\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. Only for "anc", "poi", "per", "ttt", "ltt", "ltt2", "neb", "wil", "erq", "ds2", "lrm", "llm", "llm2", "lim", "lli", "lli2" and "zig"} +\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. Only for "poi", "per", "ttt", "ltt", "ltt2", "neb", "wil", "erq", "ds2", "lrm", "llm", "llm2", "lim", "lli", "lli2" and "zig"} \item{covars}{Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}} @@ -37,7 +38,7 @@ runtimeDA(data, predictor, paired = NULL, covars = NULL, A data.frame with estimated runtimes for 1 run } \description{ -Estimate the runtime of \code{testDA} from running on a subset of the features. Intended for datasets with at least 5000 features. +Estimate the runtime of \code{testDA} from running on a subset of the features. Intended for datasets with at least 2000 features. } \details{ Outputs the estimated times for running each method 1 time. With cores=1 the runtime will be the sum of them all. With more cores the actual runtime will decrease asymptotically towards the slowest test diff --git a/man/summary.DA.Rd b/man/summary.DA.Rd index 8cdeabf..67d1412 100644 --- a/man/summary.DA.Rd +++ b/man/summary.DA.Rd @@ -4,13 +4,13 @@ \alias{summary.DA} \title{Summary of results from \code{testDA}} \usage{ -\method{summary}{DA}(object, sort = "Score", boot = TRUE, prob = c(0.05, - 0.95), N = 1000, boot.seed = 1, ...) +\method{summary}{DA}(object, sort = "Score", boot = TRUE, + prob = c(0.05, 0.95), N = 1000, boot.seed = 1, decimals = 2, ...) } \arguments{ \item{object}{The output from the \code{testDA} function} -\item{sort}{Sort methods by \code{c("AUC","FPR","Spike.detect.rate","Score")}} +\item{sort}{Sort methods by \code{c("AUC","FPR","Power","Score")}} \item{boot}{If TRUE will use bootstrap for confidence limits of the Score, else will compute the limits from the original table. Recommended to be TRUE unless \code{R >= 100} in \code{testDA}} @@ -20,6 +20,8 @@ \item{boot.seed}{Random seed for reproducibility of bootstraps} +\item{decimals}{Precision in output. Default 2} + \item{...}{Additional arguments for \code{print}} } \description{ diff --git a/man/testDA.Rd b/man/testDA.Rd index efcf038..0bd7d58 100644 --- a/man/testDA.Rd +++ b/man/testDA.Rd @@ -2,31 +2,32 @@ % Please edit documentation in R/testDA.R \name{testDA} \alias{testDA} -\title{Comparing differential abundance/expression methods by FPR and AUC} +\title{Comparing differential abundance/expression methods by Empirical power and False Discovery Rate} \usage{ -testDA(data, predictor, paired = NULL, covars = NULL, R = 10, - tests = c("neb", "rai", "per", "bay", "adx", "sam", "qua", "fri", "zpo", - "znb", "vli", "qpo", "poi", "pea", "wil", "ttt", "ltt", "ltt2", "erq", "erq2", - "ere", "ere2", "msf", "zig", "ds2", "ds2x", "lim", "lli", "lli2", "aov", - "lao", "lao2", "kru", "lrm", "llm", "llm2", "spe"), relative = TRUE, - effectSize = 5, k = NULL, cores = (detectCores() - 1), rng.seed = 123, - p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, - core.check = TRUE, verbose = TRUE) +testDA(data, predictor, paired = NULL, covars = NULL, R = 20, + tests = c("mva", "neb", "rai", "per", "bay", "adx", "sam", "qua", + "fri", "zpo", "znb", "vli", "qpo", "poi", "pea", "wil", "ttt", "ltt", + "ltt2", "erq", "erq2", "ere", "ere2", "msf", "zig", "ds2", "ds2x", "lim", + "lli", "lli2", "aov", "lao", "lao2", "kru", "lrm", "llm", "llm2", "spe", + "tta", "ttc", "aoa", "aoc", "lma", "lmc", "lia", "lic"), + relative = TRUE, effectSize = 5, k = NULL, cores = (detectCores() + - 1), rng.seed = 123, p.adj = "fdr", args = list(), + out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE) } \arguments{ \item{data}{Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames} \item{predictor}{The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses} -\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. Only for "anc", "poi", "per", "ttt", "ltt", "ltt2", "neb", "wil", "erq", "ds2", "ds2x", "lrm", "llm", "llm2", "lim", "lli", "lli2", "mva", and "zig"} +\item{paired}{For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation.} \item{covars}{Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}} -\item{R}{Integer. Number of times to run the tests. Default 10} +\item{R}{Integer. Number of times to run the tests. Default 20} -\item{tests}{Character. Which tests to include. Default all (Except ANCOM and mvabund, see below for details)} +\item{tests}{Character. Which tests to include. Default all} -\item{relative}{Logical. If TRUE (default) abundances are made relative for "ttt", "ltt", "wil", "per", "aov", "lao", "kru", "lim", "lli", "lrm", "llm", "spe" and "pea", and there is an offset of \code{log(LibrarySize)} for "mva", "neb", "poi", "qpo", "zpo" and "znb"} +\item{relative}{Logical. TRUE (default) for compositional data. FALSE for absolute abundances or pre-normalized data.} \item{effectSize}{Numeric. The effect size for the spike-ins. Default 5} @@ -40,9 +41,9 @@ testDA(data, predictor, paired = NULL, covars = NULL, R = 10, \item{args}{List. A list with lists of arguments passed to the different methods. See details for more.} -\item{out.all}{If TRUE linear models will output results and p-values from \code{anova}/\code{drop1}, ds2/ds2x will run LRT and not Wald test, erq and erq2 will produce one p-value for the predictor, and lim, lli, lli2, lim, vli will run F-tests. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class predictors and FALSE otherwise} +\item{out.all}{If TRUE linear models will output results and p-values from \code{anova}/\code{drop1}, ds2/ds2x will run LRT and not Wald test, erq and erq2 will produce one p-value for the predictor, and limma will run F-tests. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class predictors and FALSE otherwise} -\item{alpha}{q-value threshold for determining significance for \code{Spike.detect.rate}. Default 0.1} +\item{alpha}{q-value threshold for determining significance for \code{Power}. Default 0.1} \item{core.check}{If TRUE will make an interactive check that the amount of cores specified are desired. Only if \code{cores>20}. This is to ensure that the function doesn't automatically overloads a server with workers.} @@ -58,99 +59,5 @@ An object of class \code{DA}, which contains a list of results: } } \description{ -Calculating false positive rates and AUC (Area Under the Receiver Operating Characteristic (ROC) Curve) for various differential abundance and expression methods -} -\details{ -Currently implemented methods: -\itemize{ - \item per - Permutation test with user defined test statistic - \item bay - baySeq - \item adx - ALDEx t-test and wilcoxon - \item wil - Wilcoxon Rank Sum on relative abundances - \item ttt - Welch t.test on relative abundances - \item ltt - Welch t.test, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances - \item ltt2 - Welch t.test, but with relative abundances transformed with \code{log(relative abundance + delta2)} - \item neb - Negative binomial GLM with log of library size as offset - \item erq - EdgeR - Quasi likelihood - TMM normalization - \item ere - EdgeR - Exact test - TMM normalization - \item erq2 - EdgeR - Quasi likelihood - RLE normalization - \item ere2 - EdgeR - Exact test - RLE normalization - \item msf - MetagenomeSeq feature model - \item zig - MetagenomeSeq zero-inflated gaussian - \item ds2 - DESeq2 - \item ds2x - DESeq2 with manual geometric means - \item lim - LIMMA. Moderated linear models based on emperical bayes - \item lli - LIMMA, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances - \item lli2 - LIMMA, but with relative abundances transformed with \code{log(relative abundance + delta2)} - \item kru - Kruskal-Wallis on relative abundances - \item aov - ANOVA on relative abundances - \item lao - ANOVA, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances - \item lao2 - ANOVA, but with relative abundances transformed with \code{log(relative abundance + delta2)} - \item lrm - Linear regression on relative abundances - \item llm - Linear regression, but reads are first transformed with \code{log(abundance + delta1)} then turned into relative abundances - \item llm2 - Linear regression, but with relative abundances transformed with \code{log(relative abundance + delta2)} - \item rai - RAIDA - \item spe - Spearman correlation - \item pea - Pearson correlation - \item poi - Poisson GLM with log of library size as offset - \item qpo - Quasi-Poisson GLM with log of library size as offset - \item vli - Limma with voom - \item zpo - Zero-inflated Poisson GLM - \item znb - Zero-inflated Negative Binomial GLM - \item fri - Friedman Rank Sum test - \item qua - Quade test - \item anc - ANCOM (by default not included, as it is very slow) - \item sam - SAMSeq - \item mva - mvabund (by default not included, as it is very slow) - \item zzz - A user-defined method (See \code{?DA.zzz}) -} -"neb" can be slow if there is a paired argument. - -"anc" and "mva" are slow compared to the other methods. - -Additional arguments can be passed to the internal functions with the \code{args} argument. -It should be structured as a list with elements named by the tests: -E.g. passing to the \code{DA.per} function that it should only run 1000 iterations: \code{args = list(per=list(noOfIterations=1000))}. -Include that the log t.test should use a pseudocount of 0.1: \code{args = list(per=list(noOfIterations=1000), ltt=list(delta=0.1))}. -Additional arguments are simply seperated by commas. - -Below is an overview of which functions get the arguments that are passed to a specific test -\itemize{ - \item per - Passed to \code{DA.per} - \item bay - Passed to \code{getPriors}, \code{getLikelihoods} and \code{DA.bay} - \item adx - Passed to \code{aldex} and \code{DA.adx} - \item wil - Passed to \code{wilcox.test} and \code{DA.wil} - \item ttt - Passed to \code{t.test} and \code{DA.ttt} - \item ltt - Passed to \code{t.test} and \code{DA.ltt} - \item ltt2 - Passed to \code{t.test} and \code{DA.ltt2} - \item neb - Passed to \code{glm.nb}, \code{glmer.nb} and \code{DA.neb} - \item erq(2) - Passed to \code{calcNormFactors}, \code{estimateDisp}, \code{glmQLFit}, \code{glmQLFTest} and \code{DA.erq} - \item ere(2) - Passed to \code{calcNormFactors}, \code{estimateCommonDisp}, \code{estimateTagwiseDisp}, \code{exactTest} and \code{DA.ere} - \item msf - Passed to \code{fitFeatureModel} and \code{DA.msf} - \item zig - Passed to \code{fitZig} and \code{DA.zig} - \item ds2(x) - Passed to \code{DESeq} and \code{DA.ds2} - \item lim - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lim} - \item lli - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lli} - \item lli2 - Passed to \code{eBayes}, \code{lmFit} and \code{DA.lli2} - \item kru - Passed to \code{kruskal.test} and \code{DA.kru} - \item aov - Passed to \code{aov} and \code{DA.aov} - \item lao - Passed to \code{aov} and \code{DA.lao} - \item lao2 - Passed to \code{aov} and \code{DA.lao2} - \item lrm - Passed to \code{lm}, \code{lme} and \code{DA.lrm} - \item llm - Passed to \code{lm}, \code{lme} and \code{DA.llm} - \item llm2 - Passed to \code{lm}, \code{lme} and \code{DA.llm2} - \item rai - Passed to \code{raida} and \code{DA.rai} - \item spe - Passed to \code{cor.test} and \code{DA.spe} - \item pea - Passed to \code{cor.test} and \code{DA.pea} - \item poi - Passed to \code{glm}, \code{glmer} and \code{DA.poi} - \item qpo - Passed to \code{glm} and \code{DA.qpo} - \item vli - Passed to \code{voom}, \code{eBayes}, \code{lmFit} and \code{DA.vli} - \item zpo - Passed to \code{zeroinfl} and \code{DA.zpo} - \item znb - Passed to \code{zeroinfl} and \code{DA.znb} - \item fri - Passed to \code{friedman.test} and \code{DA.fri} - \item qua - Passed to \code{quade.test} and \code{DA.qua} - \item anc - Passed to \code{ANCOM} and \code{DA.anc} - \item sam - Passed to \code{SAMseq} and \code{DA.sam} - \item mva - Passed to \code{manyglm} and \code{summary.manyglm} -} +Calculating Power, False Discovery Rates, False Positive Rates and AUC (Area Under the Receiver Operating Characteristic (ROC) Curve) for various differential abundance and expression methods } diff --git a/script/script_covars.html b/script/script_covars.html deleted file mode 100644 index e658dfa..0000000 --- a/script/script_covars.html +++ /dev/null @@ -1,292 +0,0 @@ - - - - -
- - - - - - - - - -This is an example of using the DAtest package for when you have covariables. There will be a focus on the covariables only, and I refer to the other examples for examning output for the predictor. The example uses the phyloseq package (https://joey711.github.io/phyloseq/), but a similar workflow can be used if your count-data is stored in a data.frame or matrix.
-library(phyloseq)
-library(DAtest)
-## DAtest version 2.7.1
-These files are downloaded from https://www.hmpdacc.org/hmp/HMQCP/
-otu <- import_qiime_otu_tax("v35_psn_otu.genus.fixed.txt")
-map <- import_qiime_sample_data("v35_map_uniquebyPSN.txt")
-otu.tab <- otu_table(otu[[1]], taxa_are_rows = TRUE)
-tax.tab <- tax_table(otu[[2]])
-all_samples <- merge_phyloseq(otu.tab, tax.tab, map)
-This is just an arbitrary subset just for testing the package
-# Only Tongue and Saliva samples
-TS <- subset_samples(all_samples, HMPbodysubsite %in% c("Tongue_dorsum","Saliva"))
-# Only samples with at least 5000 reads
-TS <- subset_samples(TS, sample_sums(TS) > 5000)
-# Only samples from patients first visit
-TS <- subset_samples(TS, visitno == 1)
-# Only OTUs with at least 100 reads
-TS <- preDA(TS, min.samples = 0, min.reads = 100, min.abundance = 0)
-## 43778 features grouped as 'Others' in the output
-The general workflow does not change even though you have covariates, you just include it in testDA
(see the other examples for the general workflow). For many tests you can see the effect of the covariate in the output, and for linear models (lrm, llm, llm2, neb, poi, qpo, znb, zpo) we can test the significance with either anova
or drop1
results.llm <- DA.llm(TS, predictor = "HMPbodysubsite", covars = "sex")
-head(results.llm)
-## Feature (Intercept) predictorTongue_dorsum sexmale
-## 1 Others 0.0069348475 -2.925969e-04 -4.205043e-04
-## 2 OTU_97.10006 0.0005380417 -4.224479e-04 2.837140e-05
-## 3 OTU_97.1005 0.0006329806 -3.095692e-04 1.253626e-04
-## 4 OTU_97.10081 0.0001435740 4.397133e-05 1.960696e-04
-## 5 OTU_97.101 0.0003578440 1.618811e-04 -1.974949e-05
-## 6 OTU_97.10107 0.0003118452 7.776225e-05 7.062621e-05
-## pval log2FC ordering pval.adj
-## 1 2.247188e-01 -0.06219201 Saliva>Tongue_dorsum 3.211588e-01
-## 2 2.861533e-06 -2.21865410 Saliva>Tongue_dorsum 1.868139e-05
-## 3 1.175242e-03 -0.96879058 Saliva>Tongue_dorsum 3.406929e-03
-## 4 6.185923e-01 0.38544476 Tongue_dorsum>Saliva 7.188562e-01
-## 5 7.611039e-02 0.53841791 Tongue_dorsum>Saliva 1.258839e-01
-## 6 3.294143e-01 0.32119116 Tongue_dorsum>Saliva 4.375843e-01
-## Method Rank1 Phylum Class
-## 1 Log Linear reg. (llm) <NA> <NA> <NA>
-## 2 Log Linear reg. (llm) Root Actinobacteria Actinobacteria
-## 3 Log Linear reg. (llm) Root Firmicutes Clostridia
-## 4 Log Linear reg. (llm) Root Firmicutes Clostridia
-## 5 Log Linear reg. (llm) Root Fusobacteria Fusobacteria
-## 6 Log Linear reg. (llm) Root Fusobacteria Fusobacteria
-## Order Family Genus
-## 1 <NA> <NA> <NA>
-## 2 Actinomycetales Actinomycetaceae Actinomyces
-## 3 Clostridiales Lachnospiraceae Oribacterium
-## 4 Clostridiales ClostridialesFamilyXI.IncertaeSedis <NA>
-## 5 Fusobacteriales Fusobacteriaceae Fusobacterium
-## 6 Fusobacteriales Fusobacteriaceae Fusobacterium
-We can see the effect size of the covar (sexmale), but we don’t have a p-value for it. pval.adj
is associated with the predictor
With anova
(or drop1
) we get a p-value for both predictor
and all covars
:
results.llm.all <- DA.llm(TS, predictor = "HMPbodysubsite", covars = "sex", allResults = TRUE)
-results.llm.anova <- DA.anova(results.llm.all)
-head(results.llm.anova)
-## pval_predictor pval_sex pval.adj_predictor pval.adj_sex
-## OTU_97.10006 2.985898e-06 0.73997816 1.941438e-05 0.9379676
-## OTU_97.1005 1.410167e-03 0.17490724 3.987196e-03 0.6081197
-## OTU_97.10081 5.521396e-01 0.02507402 6.578161e-01 0.3403684
-## OTU_97.101 7.739311e-02 0.82486357 1.272194e-01 0.9587063
-## OTU_97.10107 3.103445e-01 0.36740258 4.153443e-01 0.7357214
-## OTU_97.10169 2.068794e-10 0.42382277 6.389391e-09 0.7759570
-results.llm.d1 <- DA.drop1(results.llm.all)
-head(results.llm.d1)
-## AIC_<none> AIC_predictor AIC_sex pval_predictor pval_sex
-## OTU_97.10006 -2292.601 -2272.178 -2294.488 2.187946e-06 0.7370446
-## OTU_97.1005 -2269.814 -2261.038 -2269.931 1.027831e-03 0.1699106
-## OTU_97.10081 -2287.968 -2289.715 -2284.831 6.144880e-01 0.0234214
-## OTU_97.101 -2279.487 -2278.267 -2281.437 7.273584e-02 0.8228460
-## OTU_97.10107 -2319.437 -2320.463 -2320.605 3.237814e-01 0.3618181
-## OTU_97.10169 -2364.107 -2325.186 -2365.452 1.584656e-10 0.4184148
-## pval.adj_predictor pval.adj_sex
-## OTU_97.10006 1.428391e-05 0.9342492
-## OTU_97.1005 2.979598e-03 0.5908401
-## OTU_97.10081 7.140866e-01 0.3181960
-## OTU_97.101 1.203025e-01 0.9564010
-## OTU_97.10107 4.301017e-01 0.7245385
-## OTU_97.10169 4.801808e-09 0.7660978
-## R version 3.3.3 (2017-03-06)
-## Platform: x86_64-w64-mingw32/x64 (64-bit)
-## Running under: Windows 7 x64 (build 7601) Service Pack 1
-##
-## attached base packages:
-## [1] stats graphics grDevices utils datasets methods base
-##
-## other attached packages:
-## [1] DAtest_2.7.1 phyloseq_1.23.1
-##
-## loaded via a namespace (and not attached):
-## [1] Rcpp_0.12.13 nloptr_1.0.4 plyr_1.8.4
-## [4] XVector_0.14.1 iterators_1.0.8 tools_3.3.3
-## [7] zlibbioc_1.20.0 lme4_1.1-14 statmod_1.4.30
-## [10] digest_0.6.12 tibble_1.3.4 jsonlite_1.5
-## [13] evaluate_0.10.1 nlme_3.1-131 rhdf5_2.18.0
-## [16] gtable_0.2.0 lattice_0.20-35 mgcv_1.8-22
-## [19] doSNOW_1.0.15 pkgconfig_2.0.1 rlang_0.1.4
-## [22] igraph_1.1.2 Matrix_1.2-11 foreach_1.4.3
-## [25] yaml_2.1.14 parallel_3.3.3 stringr_1.2.0
-## [28] knitr_1.17 cluster_2.0.6 pROC_1.10.0
-## [31] Biostrings_2.42.1 S4Vectors_0.12.2 IRanges_2.8.2
-## [34] cowplot_0.9.1 multtest_2.30.0 stats4_3.3.3
-## [37] rprojroot_1.2 ade4_1.7-8 grid_3.3.3
-## [40] Biobase_2.39.0 data.table_1.10.4-3 snow_0.4-2
-## [43] survival_2.41-3 rmarkdown_1.6 minqa_1.2.4
-## [46] reshape2_1.4.2 ggplot2_2.2.1 magrittr_1.5
-## [49] MASS_7.3-47 splines_3.3.3 backports_1.1.1
-## [52] scales_0.5.0 codetools_0.2-15 htmltools_0.3.6
-## [55] BiocGenerics_0.25.0 biomformat_1.2.0 permute_0.9-4
-## [58] ape_5.0 colorspace_1.3-2 stringi_1.1.5
-## [61] pscl_1.5.2 lazyeval_0.2.1 munsell_0.4.3
-## [64] vegan_2.4-4
-This is an example of using the DAtest package for a multi-class variable. The example uses the phyloseq package (https://joey711.github.io/phyloseq/), but a similar workflow can be used if your count-data is stored in a data.frame or matrix.
-library(phyloseq)
-library(DAtest)
-## DAtest version 2.7.1
-These files are downloaded from https://www.hmpdacc.org/hmp/HMQCP/
-otu <- import_qiime_otu_tax("v35_psn_otu.genus.fixed.txt")
-map <- import_qiime_sample_data("v35_map_uniquebyPSN.txt")
-otu.tab <- otu_table(otu[[1]], taxa_are_rows = TRUE)
-tax.tab <- tax_table(otu[[2]])
-all_samples <- merge_phyloseq(otu.tab, tax.tab, map)
-This is just an arbitrary subset just for testing the package
-# Only Tongue, Saliva and Throat samples
-TST <- subset_samples(all_samples, HMPbodysubsite %in% c("Tongue_dorsum","Saliva","Throat"))
-# Only samples with at least 5000 reads
-TST <- subset_samples(TST, sample_sums(TST) > 5000)
-# Only samples from patients first visit
-TST <- subset_samples(TST, visitno == 1)
-# Only OTUs with at least 100 reads
-TST <- preDA(TST, min.samples = 0, min.reads = 100, min.abundance = 0)
-## 43251 features grouped as 'Others' in the output
-If you don’t have a phyloseq object your count data in a data.frame/matrix should be substituted for TST
and a factor denoting the groups should be substituted for "HMPbodysubsite"
.
test <- testDA(TST, predictor = "HMPbodysubsite")
-## Seed is set to 123
-## predictor is assumed to be a categorical variable with 3 levels: Saliva, Throat, Tongue_dorsum
-First we ensure that the function has loaded the data correctly. For example that the predictor type is as expected, and the number of samples and features are as expected.
-test$details
-##
-## Features 2133
-## Samples 231
-## Predictor Multi-class
-## Ordering Saliva, Throat, Tongue_dorsum
-## Paired No
-## Covars
-## RunTime 22.43 Minutes
-## Relative TRUE
-## EffectSize 4
-## Spiked Low:5, Mid:5, High:5
-## RandomSeed 123
-## OutAll TRUE
-We can plot the results:
-plot(test)
-
-Lets get a summary of the results, and check out the spike detect rate (proportion spiked features with adjusted p-value < 0.05)
-summary(test)
-## Method AUC FPR Spike.detect.rate
-## MgSeq ZIG (zig) 0.974 0.243 0.900
-## Log ANOVA 2 (lao2) 0.960 0.047 0.433
-## Log LIMMA 2 (lli2) 0.960 0.046 0.433
-## DESeq2 (ds2) 0.955 0.082 0.533
-## ZI-Poisson GLM (zpo) 0.952 0.663 1.000
-## ANOVA (aov) 0.951 0.042 0.367
-## LIMMA (lim) 0.951 0.041 0.367
-## Linear regression (lrm) 0.951 0.042 0.367
-## ZI-NegBin GLM (znb) 0.949 0.083 0.633
-## Negbinom GLM (neb) 0.947 0.143 0.733
-## Quasi-Poisson GLM (qpo) 0.945 0.109 0.600
-## Log Linear reg. 2 (llm2) 0.938 0.041 0.267
-## Poisson GLM (poi) 0.934 0.700 1.000
-## EdgeR qll - TMM (erq) 0.932 0.270 0.933
-## EdgeR qll - RLE (erq2) 0.924 0.283 0.933
-## Log LIMMA (lli) 0.900 0.050 0.167
-## Log ANOVA (lao) 0.898 0.051 0.133
-## Log Linear reg. (llm) 0.898 0.051 0.133
-## LIMMA voom (vli) 0.842 0.059 0.133
-## SAMseq (sam) 0.786 0.133 0.133
-## Kruskal-Wallis (kru) 0.689 0.052 0.000
-If your Spike.detect.rate is zero for the best method, you might want to run testDA
again with a higher effectSize, but we would rarely expect a Spike.detect.rate above 0.500. Therefore, as long as the Spike.detect.rate is above zero for the best method we can continue.
Log Limma 2 and Log Anova 2 have the highest AUCs among those that control the false positive rate.
-Lets check out the results from different methods
-Lets run Log Limma 2 with the original data to get the final results:
-results.lli2 <- DA.lli2(TST, predictor = "HMPbodysubsite")
-results.lli2 contains the final results. Alternatively, run DA.lli2
with allResults=TRUE
and use topTable
on the output.
Lets run MG ZIG with the original data to get the final results:
-results.zig <- DA.zig(TST, predictor = "HMPbodysubsite")
-## it= 0, nll=355.38, log10(eps+1)=Inf, stillActive=2133
-## it= 1, nll=379.22, log10(eps+1)=0.03, stillActive=256
-## it= 2, nll=379.23, log10(eps+1)=0.04, stillActive=162
-## it= 3, nll=379.31, log10(eps+1)=0.06, stillActive=85
-## it= 4, nll=379.45, log10(eps+1)=0.06, stillActive=48
-## it= 5, nll=379.43, log10(eps+1)=0.02, stillActive=32
-## it= 6, nll=379.39, log10(eps+1)=0.02, stillActive=20
-## it= 7, nll=379.37, log10(eps+1)=0.02, stillActive=14
-## it= 8, nll=379.35, log10(eps+1)=0.01, stillActive=12
-## it= 9, nll=379.31, log10(eps+1)=0.03, stillActive=11
-With DA.zig
you have to use the by
argument to specify which level of the predictor
you want to compare against the intercept (first level of predictor).
Lets run DESeq2 with the original data to get the final results:
-results.ds2 <- DA.ds2(TST, predictor = "HMPbodysubsite")
-With DA.ds2
the coeff
argument specifies which level of the predictor the log2FoldChange
is associated with. Alternatively, run DA.ds2
with allResults=TRUE
and use results
on the output.
Lets run EdgeR with the original data to get the final results:
-results.erq <- DA.erq(TST, predictor = "HMPbodysubsite")
-Lets run ANOVA with the original data and try to do Tukey post-hoc test:
-results.aov <- DA.aov(TST, predictor = "HMPbodysubsite", allResults = TRUE)
-results.tukey <- DA.TukeyHSD(results.aov)
-results.tukey contains the results of the post-hoc tests.
-Lets run linear regression with the original data and try to do a post-hoc test:
-results.lrm.all <- DA.lrm(TST, predictor = "HMPbodysubsite", allResults = TRUE)
-results.lsmeans <- DA.lsmeans(results.lrm.all)
-results.lsmeans contains the results of the post-hoc tests.
-Lets see the results of the most significant OTU:
-# Lowest p-value
-results.lrm <- DA.lrm(TST, predictor = "HMPbodysubsite")
-low.p.feat <- results.lrm[which.min(results.lrm$pval.adj),"Feature"]
-results.lsmeans[rownames(results.lsmeans) == low.p.feat,c(1:3,7:9)]
-## estimate_Saliva - Throat estimate_Saliva - Tongue_dorsum
-## Others 0.04010862 0.07164246
-## estimate_Throat - Tongue_dorsum pval.adj_Saliva - Throat
-## Others 0.03153383 0.0007477958
-## pval.adj_Saliva - Tongue_dorsum pval.adj_Throat - Tongue_dorsum
-## Others 2.036571e-11 0.008338416
-All pairwise comparisons are significant. Lets plot it to see how it looks
-# Plot it
-featurePlot(TST, predictor = "HMPbodysubsite", feature = low.p.feat)
-
-Intriguingly, it is all the low-abundant OTUs grouped as “Others” that are most significant. It appears that the Saliva samples contain more low abundant OTUs than the Throat and Tongue samples, and that the Throat samples contain more low abundant OTUs than the Tongue samples.
-If the features belong to certains groups, we can test if features from certain groups are more likely to be significant than features from the other groups. For microbiome data we can use a taxonomy table, and if we have a phyloseq object we simply input the tax_table and specify here to use the 2. column (Phylum) to group by. group.df
can be any data.frame as long as it has a grouping variable and the rownames match the “Feature” column in results
.
results.group <- groupSig(results = results.lli2, group.df = tax_table(TST), group.cols = 2)
-head(results.group[order(results.group$pval.adj),])
-## GroupName Group OR pval pval.adj
-## 1 Phylum Actinobacteria 2.321263 1.284079e-07 1.284079e-06
-## 3 Phylum Firmicutes 1.486651 5.062288e-06 2.531144e-05
-## 6 Phylum Proteobacteria 1.634038 1.491901e-05 4.973004e-05
-## 10 Phylum TM7 Inf 8.742939e-02 2.185735e-01
-## 8 Phylum SR1 Inf 2.959176e-01 5.918352e-01
-## 5 Phylum GN02 Inf 5.440901e-01 8.511906e-01
-Here we see that especially OTUs from the Actinobacteria phylum are overrepresented among the significant ones
-Lets try to run all tests and compare their results
-test.all <- allDA(TST, predictor = "HMPbodysubsite")
-## predictor is assumed to be a categorical variable with 3 levels: Saliva, Throat, Tongue_dorsum
-## Seed is set to 123
-We can pull out results from specific methods. E.g. limma (lim):
-head(test.all$results$lim)
-## predictorThroat predictorTongue_dorsum AveExpr F
-## Others -4.010862e-02 -0.0716424571 0.1243036454 35.32148
-## OTU_97.20 1.331853e-03 0.0081066171 0.0077078651 31.94128
-## OTU_97.42321 2.847786e-05 0.0003963542 0.0002177945 31.45706
-## OTU_97.10894 -1.968220e-05 0.0001676175 0.0001021912 29.89545
-## OTU_97.98 -8.801501e-04 -0.0009382869 0.0002830799 28.98109
-## OTU_97.24 -5.341751e-04 -0.0005487106 0.0001793728 28.42873
-## pval pval.adj Feature Method
-## Others 4.256315e-14 9.078719e-11 Others LIMMA (lim)
-## OTU_97.20 5.806904e-13 6.032999e-10 OTU_97.20 LIMMA (lim)
-## OTU_97.42321 8.485230e-13 6.032999e-10 OTU_97.42321 LIMMA (lim)
-## OTU_97.10894 2.908204e-12 1.550800e-09 OTU_97.10894 LIMMA (lim)
-## OTU_97.98 6.019381e-12 2.567868e-09 OTU_97.98 LIMMA (lim)
-## OTU_97.24 9.362194e-12 3.119262e-09 OTU_97.24 LIMMA (lim)
-vennDA(test.all, tests = c("lim","erq","znb"))
-
-## R version 3.3.3 (2017-03-06)
-## Platform: x86_64-w64-mingw32/x64 (64-bit)
-## Running under: Windows 7 x64 (build 7601) Service Pack 1
-##
-## attached base packages:
-## [1] stats4 parallel stats graphics grDevices utils datasets
-## [8] methods base
-##
-## other attached packages:
-## [1] eulerr_2.0.0 ggplot2_2.2.1
-## [3] lsmeans_2.27-61 edgeR_3.16.5
-## [5] DESeq2_1.14.1 SummarizedExperiment_1.4.0
-## [7] GenomicRanges_1.26.4 GenomeInfoDb_1.10.3
-## [9] IRanges_2.8.2 S4Vectors_0.12.2
-## [11] metagenomeSeq_1.16.0 RColorBrewer_1.1-2
-## [13] glmnet_2.0-13 foreach_1.4.3
-## [15] Matrix_1.2-11 Biobase_2.39.0
-## [17] BiocGenerics_0.25.0 limma_3.30.13
-## [19] DAtest_2.7.1 phyloseq_1.23.1
-##
-## loaded via a namespace (and not attached):
-## [1] nlme_3.1-131 bitops_1.0-6 matrixStats_0.52.2
-## [4] bit64_0.9-7 rprojroot_1.2 tools_3.3.3
-## [7] backports_1.1.1 vegan_2.4-4 rpart_4.1-11
-## [10] KernSmooth_2.23-15 DBI_0.7 Hmisc_4.0-3
-## [13] lazyeval_0.2.1 mgcv_1.8-22 colorspace_1.3-2
-## [16] permute_0.9-4 ade4_1.7-8 nnet_7.3-12
-## [19] gridExtra_2.3 bit_1.1-12 compiler_3.3.3
-## [22] htmlTable_1.9 sandwich_2.4-0 labeling_0.3
-## [25] caTools_1.17.1 scales_0.5.0 checkmate_1.8.5
-## [28] mvtnorm_1.0-6 genefilter_1.56.0 stringr_1.2.0
-## [31] digest_0.6.12 foreign_0.8-69 minqa_1.2.4
-## [34] rmarkdown_1.6 XVector_0.14.1 pscl_1.5.2
-## [37] base64enc_0.1-3 pkgconfig_2.0.1 htmltools_0.3.6
-## [40] lme4_1.1-14 htmlwidgets_0.9 rlang_0.1.4
-## [43] RSQLite_2.0 zoo_1.8-0 jsonlite_1.5
-## [46] BiocParallel_1.8.2 gtools_3.5.0 acepack_1.4.1
-## [49] RCurl_1.95-4.8 magrittr_1.5 Formula_1.2-2
-## [52] biomformat_1.2.0 Rcpp_0.12.13 munsell_0.4.3
-## [55] ape_5.0 multcomp_1.4-7 stringi_1.1.5
-## [58] pROC_1.10.0 yaml_2.1.14 MASS_7.3-47
-## [61] zlibbioc_1.20.0 rhdf5_2.18.0 gplots_3.0.1
-## [64] plyr_1.8.4 blob_1.1.0 grid_3.3.3
-## [67] gdata_2.18.0 doSNOW_1.0.15 lattice_0.20-35
-## [70] Biostrings_2.42.1 cowplot_0.9.1 splines_3.3.3
-## [73] multtest_2.30.0 annotate_1.52.1 locfit_1.5-9.1
-## [76] knitr_1.17 igraph_1.1.2 estimability_1.2
-## [79] geneplotter_1.52.0 reshape2_1.4.2 codetools_0.2-15
-## [82] XML_3.98-1.9 evaluate_0.10.1 latticeExtra_0.6-28
-## [85] data.table_1.10.4-3 nloptr_1.0.4 gtable_0.2.0
-## [88] assertthat_0.2.0 xtable_1.8-2 coda_0.19-1
-## [91] survival_2.41-3 tibble_1.3.4 snow_0.4-2
-## [94] iterators_1.0.8 memoise_1.1.0 AnnotationDbi_1.36.2
-## [97] cluster_2.0.6 statmod_1.4.30 TH.data_1.0-8
-This is an example of using the DAtest package for a two-class variable (same workflow as for a quantitative variable). The example uses the phyloseq package (https://joey711.github.io/phyloseq/), but a similar workflow can be used if your count-data is stored in a data.frame or matrix.
-library(phyloseq)
-library(DAtest)
-## DAtest version 2.7.1
-These files are downloaded from https://www.hmpdacc.org/hmp/HMQCP/
-otu <- import_qiime_otu_tax("v35_psn_otu.genus.fixed.txt")
-map <- import_qiime_sample_data("v35_map_uniquebyPSN.txt")
-otu.tab <- otu_table(otu[[1]], taxa_are_rows = TRUE)
-tax.tab <- tax_table(otu[[2]])
-all_samples <- merge_phyloseq(otu.tab, tax.tab, map)
-This is just an arbitrary subset just for testing the package
-# Only Tongue and Saliva samples
-TS <- subset_samples(all_samples, HMPbodysubsite %in% c("Tongue_dorsum","Saliva"))
-# Only samples with at least 5000 reads
-TS <- subset_samples(TS, sample_sums(TS) > 5000)
-# Only samples from patients first visit
-TS <- subset_samples(TS, visitno == 1)
-# Only samples from male patients
-TS <- subset_samples(TS, sex == "male")
-# Only OTUs with at least 100 reads
-TS <- preDA(TS, min.samples = 0, min.reads = 100, min.abundance = 0)
-## 44417 features grouped as 'Others' in the output
-If you don’t have a phyloseq object your count data in a data.frame/matrix should be substituted for TS
and a factor denoting the two groups should be substituted for "HMPbodysubsite"
.
test <- testDA(TS, predictor = "HMPbodysubsite")
-## Seed is set to 123
-## predictor is assumed to be a categorical variable with 2 levels: Saliva, Tongue_dorsum
-First we ensure that the function has loaded the data correctly. For example that the predictor type is as expected, and the number of samples and features are as expected.
-test$details
-##
-## Features 967
-## Samples 75
-## Predictor Two-class
-## Ordering Saliva < Tongue_dorsum
-## Paired No
-## Covars
-## RunTime 16.39 Minutes
-## Relative TRUE
-## EffectSize 4
-## Spiked Low:10, Mid:10, High:10
-## RandomSeed 123
-## OutAll FALSE
-We can plot the results:
-plot(test)
-
-A simple log t-test appears to have the highest AUC among the methods with FPR below 0.05
-Lets get a summary of the results, and check out the spike detect rate (proportion spiked features with adjusted p-value < 0.05)
-summary(test)
-## Method AUC FPR Spike.detect.rate
-## MgSeq ZIG (zig) 0.964 0.151 0.833
-## RAIDA (rai) 0.956 0.080 0.367
-## Log t-test2 (ltt2) 0.930 0.040 0.267
-## MgSeq Feature (msf) 0.927 0.032 0.350
-## t-test (ttt) 0.924 0.032 0.133
-## Log LIMMA 2 (lli2) 0.916 0.035 0.100
-## EdgeR exact - TMM (ere) 0.906 0.116 0.333
-## EdgeR qll - TMM (erq) 0.906 0.132 0.467
-## LIMMA (lim) 0.901 0.033 0.000
-## Linear regression (lrm) 0.900 0.034 0.000
-## SAMseq (sam) 0.894 0.017 0.267
-## DESeq2 (ds2) 0.888 0.077 0.333
-## Negbinom GLM (neb) 0.887 0.119 0.500
-## ZI-Poisson GLM (zpo) 0.883 0.593 0.933
-## Poisson GLM (poi) 0.881 0.602 0.950
-## Log t-test (ltt) 0.879 0.034 0.067
-## ZI-NegBin GLM (znb) 0.878 0.127 0.600
-## EdgeR exact - RLE (ere2) 0.865 0.141 0.450
-## EdgeR qll - RLE (erq2) 0.864 0.156 0.467
-## Log LIMMA (lli) 0.864 0.037 0.033
-## Log Linear reg. 2 (llm2) 0.864 0.027 0.000
-## Log Linear reg. (llm) 0.859 0.038 0.033
-## Quasi-Poisson GLM (qpo) 0.850 0.077 0.033
-## LIMMA voom (vli) 0.843 0.035 0.067
-## Permutation (per) 0.840 0.031 0.183
-## baySeq (bay) 0.839 0.007 0.000
-## ALDEx2 wilcox (adx) 0.814 0.007 0.033
-## ALDEx2 t-test (adx) 0.791 0.003 0.000
-## Wilcox (wil) 0.738 0.027 0.067
-If your Spike.detect.rate is zero for the best method, you might want to run testDA
again with a higher effectSize, but we would rarely expect a Spike.detect.rate above 0.500. Therefore, as long as the Spike.detect.rate is above zero for the best method we can continue.
Log test 2 appeared to be the best method, with the highest AUC and FPR below 0.05. Lets run it with the original data to get the final results:
-results.ltt2 <- DA.ltt2(TS, predictor = "HMPbodysubsite")
-results.ltt2 contains the final results
-# Largest effect
-large.eff.feat <- results.ltt2[which.max(results.ltt2$log2FC),"Feature"]
-
-# Plot it
-featurePlot(TS, predictor = "HMPbodysubsite", feature = large.eff.feat)
-
-If the features belong to certains groups, we can test if features from certain groups are more likely to be significant than features from the other groups. For microbiome data we can use a taxonomy table, and if we have a phyloseq object we simply input the tax_table and specify here to use the 2. column (Phylum) to group by. group.df
can be any data.frame as long as it has a grouping variable and the rownames match the “Feature” column in results
.
results.group <- groupSig(results = results.ltt2, group.df = tax_table(TS), group.cols = 2)
-head(results.group[order(results.group$pval.adj),])
-## GroupName Group Effect OR pval
-## 2 Phylum Actinobacteria Tongue_dorsum>Saliva 4.823208 4.384443e-12
-## 10 Phylum Proteobacteria Saliva>Tongue_dorsum 2.560221 1.549250e-05
-## 6 Phylum Firmicutes Tongue_dorsum>Saliva 1.522715 5.139702e-03
-## 15 Phylum TM7 Saliva>Tongue_dorsum Inf 4.373725e-02
-## 5 Phylum Firmicutes Saliva>Tongue_dorsum 1.127236 2.723350e-01
-## 9 Phylum GN02 Saliva>Tongue_dorsum Inf 3.530021e-01
-## pval.adj
-## 2 6.576664e-11
-## 10 1.161937e-04
-## 6 2.569851e-02
-## 15 1.640147e-01
-## 5 8.170051e-01
-## 9 8.825052e-01
-Here we see that there are more significant Actinobacteria OTUs with higher abundance in Tongue than in Saliva than expected by chance.
-Lets try to run all tests and compare their results
-test.all <- allDA(TS, predictor = "HMPbodysubsite")
-## predictor is assumed to be a categorical variable with 2 levels: Saliva, Tongue_dorsum
-## Seed is set to 123
-We can pull out results from specific methods. E.g. EdgeR exact test (ere):
-head(test.all$results$ere)
-## logFC logCPM pval pval.adj
-## OTU_97.10006 -2.9276075 8.353264 3.048018e-06 3.135567e-05
-## OTU_97.1005 -1.1375858 8.598259 1.830455e-02 3.881689e-02
-## OTU_97.10201 -0.7874759 9.542233 2.005388e-01 2.978818e-01
-## OTU_97.10220 -0.9474005 9.283986 8.682777e-02 1.473026e-01
-## OTU_97.1030 -0.3086652 8.851373 5.817993e-01 6.809887e-01
-## OTU_97.10463 0.7457615 10.215810 1.640010e-01 2.513295e-01
-## ordering Feature Method
-## OTU_97.10006 Saliva>Tongue_dorsum OTU_97.10006 EdgeR exact - TMM (ere)
-## OTU_97.1005 Saliva>Tongue_dorsum OTU_97.1005 EdgeR exact - TMM (ere)
-## OTU_97.10201 Saliva>Tongue_dorsum OTU_97.10201 EdgeR exact - TMM (ere)
-## OTU_97.10220 Saliva>Tongue_dorsum OTU_97.10220 EdgeR exact - TMM (ere)
-## OTU_97.1030 Saliva>Tongue_dorsum OTU_97.1030 EdgeR exact - TMM (ere)
-## OTU_97.10463 Tongue_dorsum>Saliva OTU_97.10463 EdgeR exact - TMM (ere)
-vennDA(test.all, tests = c("msf","adx.w","ltt2"))
-
-Split in negative and positive fold changes
-vennDA(test.all, tests = c("msf","ltt2"), split = TRUE)
-
-## R version 3.3.3 (2017-03-06)
-## Platform: x86_64-w64-mingw32/x64 (64-bit)
-## Running under: Windows 7 x64 (build 7601) Service Pack 1
-##
-## attached base packages:
-## [1] stats graphics grDevices utils datasets methods base
-##
-## other attached packages:
-## [1] eulerr_2.0.0 ggplot2_2.2.1 DAtest_2.7.1 phyloseq_1.23.1
-##
-## loaded via a namespace (and not attached):
-## [1] statmod_1.4.30 reshape2_1.4.2 splines_3.3.3
-## [4] lattice_0.20-35 rhdf5_2.18.0 colorspace_1.3-2
-## [7] doSNOW_1.0.15 htmltools_0.3.6 snow_0.4-2
-## [10] stats4_3.3.3 yaml_2.1.14 mgcv_1.8-22
-## [13] survival_2.41-3 rlang_0.1.4 nloptr_1.0.4
-## [16] BiocGenerics_0.25.0 foreach_1.4.3 plyr_1.8.4
-## [19] stringr_1.2.0 zlibbioc_1.20.0 Biostrings_2.42.1
-## [22] munsell_0.4.3 gtable_0.2.0 codetools_0.2-15
-## [25] evaluate_0.10.1 labeling_0.3 Biobase_2.39.0
-## [28] knitr_1.17 permute_0.9-4 IRanges_2.8.2
-## [31] pscl_1.5.2 biomformat_1.2.0 parallel_3.3.3
-## [34] Rcpp_0.12.13 scales_0.5.0 backports_1.1.1
-## [37] vegan_2.4-4 S4Vectors_0.12.2 jsonlite_1.5
-## [40] XVector_0.14.1 lme4_1.1-14 digest_0.6.12
-## [43] stringi_1.1.5 grid_3.3.3 ade4_1.7-8
-## [46] rprojroot_1.2 cowplot_0.9.1 tools_3.3.3
-## [49] magrittr_1.5 lazyeval_0.2.1 tibble_1.3.4
-## [52] cluster_2.0.6 ape_5.0 pkgconfig_2.0.1
-## [55] MASS_7.3-47 Matrix_1.2-11 data.table_1.10.4-3
-## [58] pROC_1.10.0 assertthat_0.2.0 minqa_1.2.4
-## [61] rmarkdown_1.6 iterators_1.0.8 multtest_2.30.0
-## [64] igraph_1.1.2 nlme_3.1-131 compiler_3.3.3
-