From e1b7f64062d50fe2fcec435d6c2129ae152a6307 Mon Sep 17 00:00:00 2001 From: Jakob Russel Date: Thu, 14 Feb 2019 11:51:12 +0100 Subject: [PATCH] small allDA/testDA speedup, small man changes, extra allDA arg --- R/allDA.R | 47 +++++++++++++++++++++++++++++++---------------- R/powerDA.R | 2 +- R/runtimeDA.R | 2 +- R/testDA.R | 34 ++++++++++++++++++++++------------ 4 files changed, 55 insertions(+), 30 deletions(-) diff --git a/R/allDA.R b/R/allDA.R index c4e8e32..7377347 100644 --- a/R/allDA.R +++ b/R/allDA.R @@ -2,8 +2,8 @@ #' #' Run many differential abundance and expression tests at a time, to easily compare their results #' -#' mva and bay are excluded by default, as they often are slow. -#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames +#' mva is excluded by default, as it is slow. +#' @param data Either a data.frame with counts/abundances, OR a \code{phyloseq} object. If a data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames #' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses #' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. #' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} @@ -16,6 +16,7 @@ #' @param out.all If TRUE models will output results and p-values from \code{anova}/\code{drop1}. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise #' @param alpha q-value threshold for calling significance. Default 0.1 #' @param core.check If TRUE (default) will make an interactive check that the amount of cores specified are desired. Only if \code{cores>20}. This is to ensure that the function doesn't automatically overloads a server with workers. +#' @param verbose If TRUE will print informative messages #' @return A list of results: #' \itemize{ #' \item raw - A data.frame with raw p-values from all methods @@ -27,7 +28,17 @@ #' #' @export -allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb","per","adx","sam","qua","fri","znb","zpo","vli","qpo","poi","pea","spe","wil","ttt","ltt","ltt2","erq","ere","erq2","ere2","msf","zig","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","tta","ttc","aoa","aoc","lma","lmc","lia","lic"), relative = TRUE, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE){ +allDA <- function(data, predictor, paired = NULL, covars = NULL, + tests = c("bay","ds2","ds2x","per","adx","znb","zpo","msf","zig", + "erq","erq2","neb","qpo","poi","sam", + "lrm","llm","llm2","lma","lmc", + "ere","ere2","pea","spe", + "wil","kru","qua","fri", + "ttt","ltt","ltt2","tta","ttc", + "aov","lao","lao2","aoa","aoc", + "vli","lim","lli","lli2","lia","lic"), + relative = TRUE, cores = (detectCores()-1), rng.seed = 123, + p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){ stopifnot(exists("data"),exists("predictor")) # Check for servers @@ -49,6 +60,10 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" count_table <- data } + # Coerce data + if(!is.null(paired)) paired <- as.factor(paired) + count_table <- as.data.frame(count_table) + # Checks if(relative) if(!isTRUE(all(unlist(count_table) == floor(unlist(count_table))))) stop("Count_table must only contain integer values when relative=TRUE") if(min(count_table) < 0) stop("Count_table contains negative values!") @@ -57,7 +72,7 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" if(length(unique(predictor)) < 2) stop("predictor should have at least two levels") # Remove Features not present in any samples - if(sum(rowSums(count_table) == 0) != 0) message(paste(sum(rowSums(count_table) == 0),"empty features removed")) + if(sum(rowSums(count_table) == 0) != 0 && verbose) message(paste(sum(rowSums(count_table) == 0),"empty features removed")) count_table <- count_table[rowSums(count_table) > 0,] # Prune tests argument @@ -69,24 +84,24 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" if(length(tests) == 0) stop("No tests to run!") # Set seed - message(paste("Seed is set to",rng.seed)) + if(verbose) message(paste("Seed is set to",rng.seed)) set.seed(rng.seed) - message(paste("Running on",cores,"cores")) + if(verbose) message(paste("Running on",cores,"cores")) # predictor if(any(is.na(predictor))) warning("Predictor contains NAs!") if(is.numeric(predictor)){ - message(paste("predictor is assumed to be a quantitative variable, ranging from",min(predictor, na.rm = TRUE),"to",max(predictor, na.rm = TRUE))) + if(verbose) message(paste("predictor is assumed to be a quantitative variable, ranging from",min(predictor, na.rm = TRUE),"to",max(predictor, na.rm = TRUE))) if(length(levels(as.factor(predictor))) == 2){ ANSWER <- readline("The predictor is quantitative, but only contains 2 unique values. Are you sure this is correct? Enter y to proceed ") if(ANSWER != "y") stop("Wrap the predictor with as.factor(predictor) to treat it is a categorical variable") } } else { if(length(levels(as.factor(predictor))) > length(unique(predictor))) stop("predictor has more levels than unique values!") - message(paste("predictor is assumed to be a categorical variable with",length(unique(predictor)),"levels:",paste(levels(as.factor(predictor)),collapse = ", "))) + if(verbose) message(paste("predictor is assumed to be a categorical variable with",length(unique(predictor)),"levels:",paste(levels(as.factor(predictor)),collapse = ", "))) } if(!is.null(paired)){ - message(paste("The paired variable has",length(unique(paired)),"levels")) + if(verbose) message(paste("The paired variable has",length(unique(paired)),"levels")) if(length(unique(paired)) < 5) warning("The paired variable has less than 5 levels. Mixed-effect models are excluded") } @@ -102,9 +117,9 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" for(i in 1:length(covars)){ if(any(is.na(covars[[i]]))) warning(names(covars)[i],"contains NAs!") if(is.numeric(covars[[i]][1])){ - message(paste(names(covars)[i],"is assumed to be a quantitative variable, ranging from",min(covars[[i]], na.rm = TRUE),"to",max(covars[[i]], na.rm = TRUE))) + if(verbose) message(paste(names(covars)[i],"is assumed to be a quantitative variable, ranging from",min(covars[[i]], na.rm = TRUE),"to",max(covars[[i]], na.rm = TRUE))) } else { - message(paste(names(covars)[i],"is assumed to be a categorical variable with",length(unique(covars[[i]])),"levels:",paste(levels(as.factor(covars[[i]])),collapse = ", "))) + if(verbose) message(paste(names(covars)[i],"is assumed to be a categorical variable with",length(unique(covars[[i]])),"levels:",paste(levels(as.factor(covars[[i]])),collapse = ", "))) } } } @@ -117,7 +132,7 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" } # Run tests - cat(paste("Running",length(tests),"methods...\n")) + if(verbose) cat(paste("Running",length(tests),"methods...\n")) # Progress bar pb <- txtProgressBar(max = length(tests), style = 3) progress <- function(n) setTxtProgressBar(pb, n) @@ -207,17 +222,17 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb" results <- results[!sapply(results,is.null)] if(length(names(results)) != length(tests)){ if(length(tests) - length(names(results)) == 1){ - message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"was excluded due to failure")) + if(verbose) message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"was excluded due to failure")) } else { - message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"were excluded due to failure")) + if(verbose) message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"were excluded due to failure")) } # Produce informative messages if(all(tests[!tests %in% unique(gsub(".*_","",names(results)))] == "sam")){ - message("sam usually fails if some samples has too many zeroes") + if(verbose) message("sam usually fails if some samples has too many zeroes") } if(all(c("sam","ere2","erq2","ds2x") %in% tests[!tests %in% unique(gsub(".*_","",names(results)))])){ - message("sam, ere2, erq2 and ds2x usually fails if all features contain at least one zero") + if(verbose) message("sam, ere2, erq2 and ds2x usually fails if all features contain at least one zero") } tests <- names(results) diff --git a/R/powerDA.R b/R/powerDA.R index 55dd67e..f8ba198 100644 --- a/R/powerDA.R +++ b/R/powerDA.R @@ -1,7 +1,7 @@ #' Estimating (empirical) statistical power #' #' Estimating (empirical) statistical power for a specific differential abundance and expression method on a specific dataset -#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames +#' @param data Either a data.frame with counts/abundances, OR a \code{phyloseq} object. If a data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames #' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses #' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. #' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} diff --git a/R/runtimeDA.R b/R/runtimeDA.R index 04be204..b4b686c 100644 --- a/R/runtimeDA.R +++ b/R/runtimeDA.R @@ -20,7 +20,7 @@ #' @export runtimeDA <- function(data, predictor, paired = NULL, covars = NULL, subsamples = c(500,1000,1500,2000), subsamples.slow = c(100,150,200,250), tests = c("sam", "qua", "fri", "vli", "qpo", "pea", "wil", "ttt", "ltt", "ltt2","ere", "ere2", "msf", "zig", "lim", "lli", "lli2", "aov", "lao", "lao2", "kru", "lrm", "llm", "llm2", "spe", "aoa", "aoc", "tta", "ttc", "lma", "lmc", "lia", "lic"), - tests.slow = c("mva", "neb", "bay", "per", "zpo", "znb", "adx", "ds2", "ds2x", "poi", "erq", "erq2"), cores = (detectCores()-1), ...){ + tests.slow = c("mva", "neb", "bay", "per", "ds2", "ds2x", "zpo", "znb", "adx", "poi", "erq", "erq2"), cores = (detectCores()-1), ...){ stopifnot(exists("data"),exists("predictor")) diff --git a/R/testDA.R b/R/testDA.R index d4088e4..b8e7c53 100644 --- a/R/testDA.R +++ b/R/testDA.R @@ -2,8 +2,8 @@ #' #' Calculating Power, False Discovery Rates, False Positive Rates and AUC (Area Under the Receiver Operating Characteristic (ROC) Curve) for various differential abundance and expression methods #' -#' mva and bay are excluded by default, as they often are slow. -#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames +#' mva is excluded by default, as it is slow. +#' @param data Either a data.frame with counts/abundances, OR a \code{phyloseq} object. If a data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames #' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses #' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. #' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)} @@ -33,7 +33,17 @@ #' @importFrom pROC roc #' @export -testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests = c("neb","per","adx","sam","qua","fri","zpo","znb","vli","qpo","poi","pea","wil","ttt","ltt","ltt2","erq","erq2","ere","ere2","msf","zig","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","spe","tta","ttc","aoa","aoc","lma","lmc","lia","lic"), relative = TRUE, effectSize = 5, k = NULL, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){ +testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, + tests = c("bay","ds2","ds2x","per","adx","znb","zpo","msf","zig", + "erq","erq2","neb","qpo","poi","sam", + "lrm","llm","llm2","lma","lmc", + "ere","ere2","pea","spe", + "wil","kru","qua","fri", + "ttt","ltt","ltt2","tta","ttc", + "aov","lao","lao2","aoa","aoc", + "vli","lim","lli","lli2","lia","lic"), + relative = TRUE, effectSize = 5, k = NULL, cores = (detectCores()-1) ,rng.seed = 123, + p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){ stopifnot(exists("data"),exists("predictor")) # Check for servers @@ -74,7 +84,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests if(length(unique(predictor)) < 2) stop("predictor should have at least two levels") # Remove Features not present in any samples - if(sum(rowSums(count_table) == 0) != 0) message(paste(sum(rowSums(count_table) == 0),"empty features removed")) + if(sum(rowSums(count_table) == 0) != 0 && verbose) message(paste(sum(rowSums(count_table) == 0),"empty features removed")) count_table <- count_table[rowSums(count_table) > 0,] if(nrow(count_table) <= 15) warning("Dataset contains very few features") @@ -111,13 +121,13 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests } if(sum(k) == nrow(count_table)) stop("Set to spike all features. Can't calculate FPR or AUC. Change k argument") if(sum(k) > nrow(count_table)) stop("Set to spike more features than are present in the data. Change k argument") - if(sum(k) < 15 & sum(k) >= 10 & R <= 10) message("Few features spiked. Increase 'k' or set 'R' to more than 10 to ensure proper estimation of AUC and FPR") - if(sum(k) < 10 & sum(k) >= 5 & R <= 20) message("Few features spiked. Increase 'k' or set 'R' to more than 20 to ensure proper estimation of AUC and FPR") - if(sum(k) < 5 & R <= 50) message("Very few features spiked. Increase 'k' or set 'R' to more than 50 to ensure proper estimation of AUC and FPR") - if(sum(k) > nrow(count_table)/2) message("Set to spike more than half of the dataset, which might give unreliable estimates, Change k argument") + if(sum(k) < 15 & sum(k) >= 10 & R <= 10) warning("Few features spiked. Increase 'k' or set 'R' to more than 10 to ensure proper estimation of AUC and FPR") + if(sum(k) < 10 & sum(k) >= 5 & R <= 20) warning("Few features spiked. Increase 'k' or set 'R' to more than 20 to ensure proper estimation of AUC and FPR") + if(sum(k) < 5 & R <= 50) warning("Very few features spiked. Increase 'k' or set 'R' to more than 50 to ensure proper estimation of AUC and FPR") + if(sum(k) > nrow(count_table)/2) warning("Set to spike more than half of the dataset, which might give unreliable estimates, Change k argument") # predictor - if(verbose) if(any(is.na(predictor))) warning("Predictor contains NAs!") + if(any(is.na(predictor))) warning("Predictor contains NAs!") if(is.numeric(predictor[1])){ num.pred <- TRUE if(verbose) message(paste("predictor is assumed to be a quantitative variable, ranging from",min(predictor, na.rm = TRUE),"to",max(predictor, na.rm = TRUE))) @@ -279,9 +289,9 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests cat("\n") if(length(unique(gsub(".*_","",names(results)))) != length(tests)){ if(length(tests) - length(unique(gsub(".*_","",names(results)))) == 1){ - message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"was excluded due to failure")) + if(verbose) message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"was excluded due to failure")) } else { - message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"were excluded due to failure")) + if(verbose) message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"were excluded due to failure")) } # Produce informative messages @@ -289,7 +299,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests if(verbose) message("sam usually fails if some samples has too many zeroes") } if(all(c("sam","ere2","erq2","ds2x") %in% tests[!tests %in% unique(gsub(".*_","",names(results)))])){ - if(verbose) message("These tests usually fails if all features contain at least one zero") + if(verbose) message("sam, ere2, erq2 and ds2x usually fails if all features contain at least one zero") } tests <- unique(gsub(".*_","",names(results)))