Skip to content

Commit

Permalink
small allDA/testDA speedup, small man changes, extra allDA arg
Browse files Browse the repository at this point in the history
  • Loading branch information
Jakob Russel committed Feb 14, 2019
1 parent 0c16a3d commit e1b7f64
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 30 deletions.
47 changes: 31 additions & 16 deletions R/allDA.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
#'
#' Run many differential abundance and expression tests at a time, to easily compare their results
#'
#' mva and bay are excluded by default, as they often are slow.
#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
#' mva is excluded by default, as it is slow.
#' @param data Either a data.frame with counts/abundances, OR a \code{phyloseq} object. If a data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
#' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses
#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation.
#' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}
Expand All @@ -16,6 +16,7 @@
#' @param out.all If TRUE models will output results and p-values from \code{anova}/\code{drop1}. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise
#' @param alpha q-value threshold for calling significance. Default 0.1
#' @param core.check If TRUE (default) will make an interactive check that the amount of cores specified are desired. Only if \code{cores>20}. This is to ensure that the function doesn't automatically overloads a server with workers.
#' @param verbose If TRUE will print informative messages
#' @return A list of results:
#' \itemize{
#' \item raw - A data.frame with raw p-values from all methods
Expand All @@ -27,7 +28,17 @@
#'
#' @export

allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb","per","adx","sam","qua","fri","znb","zpo","vli","qpo","poi","pea","spe","wil","ttt","ltt","ltt2","erq","ere","erq2","ere2","msf","zig","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","tta","ttc","aoa","aoc","lma","lmc","lia","lic"), relative = TRUE, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE){
allDA <- function(data, predictor, paired = NULL, covars = NULL,
tests = c("bay","ds2","ds2x","per","adx","znb","zpo","msf","zig",
"erq","erq2","neb","qpo","poi","sam",
"lrm","llm","llm2","lma","lmc",
"ere","ere2","pea","spe",
"wil","kru","qua","fri",
"ttt","ltt","ltt2","tta","ttc",
"aov","lao","lao2","aoa","aoc",
"vli","lim","lli","lli2","lia","lic"),
relative = TRUE, cores = (detectCores()-1), rng.seed = 123,
p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){

stopifnot(exists("data"),exists("predictor"))
# Check for servers
Expand All @@ -49,6 +60,10 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
count_table <- data
}

# Coerce data
if(!is.null(paired)) paired <- as.factor(paired)
count_table <- as.data.frame(count_table)

# Checks
if(relative) if(!isTRUE(all(unlist(count_table) == floor(unlist(count_table))))) stop("Count_table must only contain integer values when relative=TRUE")
if(min(count_table) < 0) stop("Count_table contains negative values!")
Expand All @@ -57,7 +72,7 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
if(length(unique(predictor)) < 2) stop("predictor should have at least two levels")

# Remove Features not present in any samples
if(sum(rowSums(count_table) == 0) != 0) message(paste(sum(rowSums(count_table) == 0),"empty features removed"))
if(sum(rowSums(count_table) == 0) != 0 && verbose) message(paste(sum(rowSums(count_table) == 0),"empty features removed"))
count_table <- count_table[rowSums(count_table) > 0,]

# Prune tests argument
Expand All @@ -69,24 +84,24 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
if(length(tests) == 0) stop("No tests to run!")

# Set seed
message(paste("Seed is set to",rng.seed))
if(verbose) message(paste("Seed is set to",rng.seed))
set.seed(rng.seed)
message(paste("Running on",cores,"cores"))
if(verbose) message(paste("Running on",cores,"cores"))

# predictor
if(any(is.na(predictor))) warning("Predictor contains NAs!")
if(is.numeric(predictor)){
message(paste("predictor is assumed to be a quantitative variable, ranging from",min(predictor, na.rm = TRUE),"to",max(predictor, na.rm = TRUE)))
if(verbose) message(paste("predictor is assumed to be a quantitative variable, ranging from",min(predictor, na.rm = TRUE),"to",max(predictor, na.rm = TRUE)))
if(length(levels(as.factor(predictor))) == 2){
ANSWER <- readline("The predictor is quantitative, but only contains 2 unique values. Are you sure this is correct? Enter y to proceed ")
if(ANSWER != "y") stop("Wrap the predictor with as.factor(predictor) to treat it is a categorical variable")
}
} else {
if(length(levels(as.factor(predictor))) > length(unique(predictor))) stop("predictor has more levels than unique values!")
message(paste("predictor is assumed to be a categorical variable with",length(unique(predictor)),"levels:",paste(levels(as.factor(predictor)),collapse = ", ")))
if(verbose) message(paste("predictor is assumed to be a categorical variable with",length(unique(predictor)),"levels:",paste(levels(as.factor(predictor)),collapse = ", ")))
}
if(!is.null(paired)){
message(paste("The paired variable has",length(unique(paired)),"levels"))
if(verbose) message(paste("The paired variable has",length(unique(paired)),"levels"))
if(length(unique(paired)) < 5) warning("The paired variable has less than 5 levels. Mixed-effect models are excluded")
}

Expand All @@ -102,9 +117,9 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
for(i in 1:length(covars)){
if(any(is.na(covars[[i]]))) warning(names(covars)[i],"contains NAs!")
if(is.numeric(covars[[i]][1])){
message(paste(names(covars)[i],"is assumed to be a quantitative variable, ranging from",min(covars[[i]], na.rm = TRUE),"to",max(covars[[i]], na.rm = TRUE)))
if(verbose) message(paste(names(covars)[i],"is assumed to be a quantitative variable, ranging from",min(covars[[i]], na.rm = TRUE),"to",max(covars[[i]], na.rm = TRUE)))
} else {
message(paste(names(covars)[i],"is assumed to be a categorical variable with",length(unique(covars[[i]])),"levels:",paste(levels(as.factor(covars[[i]])),collapse = ", ")))
if(verbose) message(paste(names(covars)[i],"is assumed to be a categorical variable with",length(unique(covars[[i]])),"levels:",paste(levels(as.factor(covars[[i]])),collapse = ", ")))
}
}
}
Expand All @@ -117,7 +132,7 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
}

# Run tests
cat(paste("Running",length(tests),"methods...\n"))
if(verbose) cat(paste("Running",length(tests),"methods...\n"))
# Progress bar
pb <- txtProgressBar(max = length(tests), style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
Expand Down Expand Up @@ -207,17 +222,17 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
results <- results[!sapply(results,is.null)]
if(length(names(results)) != length(tests)){
if(length(tests) - length(names(results)) == 1){
message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"was excluded due to failure"))
if(verbose) message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"was excluded due to failure"))
} else {
message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"were excluded due to failure"))
if(verbose) message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"were excluded due to failure"))
}

# Produce informative messages
if(all(tests[!tests %in% unique(gsub(".*_","",names(results)))] == "sam")){
message("sam usually fails if some samples has too many zeroes")
if(verbose) message("sam usually fails if some samples has too many zeroes")
}
if(all(c("sam","ere2","erq2","ds2x") %in% tests[!tests %in% unique(gsub(".*_","",names(results)))])){
message("sam, ere2, erq2 and ds2x usually fails if all features contain at least one zero")
if(verbose) message("sam, ere2, erq2 and ds2x usually fails if all features contain at least one zero")
}

tests <- names(results)
Expand Down
2 changes: 1 addition & 1 deletion R/powerDA.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' Estimating (empirical) statistical power
#'
#' Estimating (empirical) statistical power for a specific differential abundance and expression method on a specific dataset
#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
#' @param data Either a data.frame with counts/abundances, OR a \code{phyloseq} object. If a data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
#' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses
#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation.
#' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}
Expand Down
2 changes: 1 addition & 1 deletion R/runtimeDA.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#' @export
runtimeDA <- function(data, predictor, paired = NULL, covars = NULL, subsamples = c(500,1000,1500,2000), subsamples.slow = c(100,150,200,250),
tests = c("sam", "qua", "fri", "vli", "qpo", "pea", "wil", "ttt", "ltt", "ltt2","ere", "ere2", "msf", "zig", "lim", "lli", "lli2", "aov", "lao", "lao2", "kru", "lrm", "llm", "llm2", "spe", "aoa", "aoc", "tta", "ttc", "lma", "lmc", "lia", "lic"),
tests.slow = c("mva", "neb", "bay", "per", "zpo", "znb", "adx", "ds2", "ds2x", "poi", "erq", "erq2"), cores = (detectCores()-1), ...){
tests.slow = c("mva", "neb", "bay", "per", "ds2", "ds2x", "zpo", "znb", "adx", "poi", "erq", "erq2"), cores = (detectCores()-1), ...){

stopifnot(exists("data"),exists("predictor"))

Expand Down
34 changes: 22 additions & 12 deletions R/testDA.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
#'
#' Calculating Power, False Discovery Rates, False Positive Rates and AUC (Area Under the Receiver Operating Characteristic (ROC) Curve) for various differential abundance and expression methods
#'
#' mva and bay are excluded by default, as they often are slow.
#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
#' mva is excluded by default, as it is slow.
#' @param data Either a data.frame with counts/abundances, OR a \code{phyloseq} object. If a data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
#' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses
#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation.
#' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}
Expand Down Expand Up @@ -33,7 +33,17 @@
#' @importFrom pROC roc
#' @export

testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests = c("neb","per","adx","sam","qua","fri","zpo","znb","vli","qpo","poi","pea","wil","ttt","ltt","ltt2","erq","erq2","ere","ere2","msf","zig","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","spe","tta","ttc","aoa","aoc","lma","lmc","lia","lic"), relative = TRUE, effectSize = 5, k = NULL, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){
testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20,
tests = c("bay","ds2","ds2x","per","adx","znb","zpo","msf","zig",
"erq","erq2","neb","qpo","poi","sam",
"lrm","llm","llm2","lma","lmc",
"ere","ere2","pea","spe",
"wil","kru","qua","fri",
"ttt","ltt","ltt2","tta","ttc",
"aov","lao","lao2","aoa","aoc",
"vli","lim","lli","lli2","lia","lic"),
relative = TRUE, effectSize = 5, k = NULL, cores = (detectCores()-1) ,rng.seed = 123,
p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){

stopifnot(exists("data"),exists("predictor"))
# Check for servers
Expand Down Expand Up @@ -74,7 +84,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests
if(length(unique(predictor)) < 2) stop("predictor should have at least two levels")

# Remove Features not present in any samples
if(sum(rowSums(count_table) == 0) != 0) message(paste(sum(rowSums(count_table) == 0),"empty features removed"))
if(sum(rowSums(count_table) == 0) != 0 && verbose) message(paste(sum(rowSums(count_table) == 0),"empty features removed"))
count_table <- count_table[rowSums(count_table) > 0,]
if(nrow(count_table) <= 15) warning("Dataset contains very few features")

Expand Down Expand Up @@ -111,13 +121,13 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests
}
if(sum(k) == nrow(count_table)) stop("Set to spike all features. Can't calculate FPR or AUC. Change k argument")
if(sum(k) > nrow(count_table)) stop("Set to spike more features than are present in the data. Change k argument")
if(sum(k) < 15 & sum(k) >= 10 & R <= 10) message("Few features spiked. Increase 'k' or set 'R' to more than 10 to ensure proper estimation of AUC and FPR")
if(sum(k) < 10 & sum(k) >= 5 & R <= 20) message("Few features spiked. Increase 'k' or set 'R' to more than 20 to ensure proper estimation of AUC and FPR")
if(sum(k) < 5 & R <= 50) message("Very few features spiked. Increase 'k' or set 'R' to more than 50 to ensure proper estimation of AUC and FPR")
if(sum(k) > nrow(count_table)/2) message("Set to spike more than half of the dataset, which might give unreliable estimates, Change k argument")
if(sum(k) < 15 & sum(k) >= 10 & R <= 10) warning("Few features spiked. Increase 'k' or set 'R' to more than 10 to ensure proper estimation of AUC and FPR")
if(sum(k) < 10 & sum(k) >= 5 & R <= 20) warning("Few features spiked. Increase 'k' or set 'R' to more than 20 to ensure proper estimation of AUC and FPR")
if(sum(k) < 5 & R <= 50) warning("Very few features spiked. Increase 'k' or set 'R' to more than 50 to ensure proper estimation of AUC and FPR")
if(sum(k) > nrow(count_table)/2) warning("Set to spike more than half of the dataset, which might give unreliable estimates, Change k argument")

# predictor
if(verbose) if(any(is.na(predictor))) warning("Predictor contains NAs!")
if(any(is.na(predictor))) warning("Predictor contains NAs!")
if(is.numeric(predictor[1])){
num.pred <- TRUE
if(verbose) message(paste("predictor is assumed to be a quantitative variable, ranging from",min(predictor, na.rm = TRUE),"to",max(predictor, na.rm = TRUE)))
Expand Down Expand Up @@ -279,17 +289,17 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests
cat("\n")
if(length(unique(gsub(".*_","",names(results)))) != length(tests)){
if(length(tests) - length(unique(gsub(".*_","",names(results)))) == 1){
message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"was excluded due to failure"))
if(verbose) message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"was excluded due to failure"))
} else {
message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"were excluded due to failure"))
if(verbose) message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"were excluded due to failure"))
}

# Produce informative messages
if(all(tests[!tests %in% unique(gsub(".*_","",names(results)))] == "sam")){
if(verbose) message("sam usually fails if some samples has too many zeroes")
}
if(all(c("sam","ere2","erq2","ds2x") %in% tests[!tests %in% unique(gsub(".*_","",names(results)))])){
if(verbose) message("These tests usually fails if all features contain at least one zero")
if(verbose) message("sam, ere2, erq2 and ds2x usually fails if all features contain at least one zero")
}

tests <- unique(gsub(".*_","",names(results)))
Expand Down

0 comments on commit e1b7f64

Please sign in to comment.