From e1b7f64062d50fe2fcec435d6c2129ae152a6307 Mon Sep 17 00:00:00 2001
From: Jakob Russel <cmf483@science.domain>
Date: Thu, 14 Feb 2019 11:51:12 +0100
Subject: [PATCH] small allDA/testDA speedup, small man changes, extra allDA
 arg

---
 R/allDA.R     | 47 +++++++++++++++++++++++++++++++----------------
 R/powerDA.R   |  2 +-
 R/runtimeDA.R |  2 +-
 R/testDA.R    | 34 ++++++++++++++++++++++------------
 4 files changed, 55 insertions(+), 30 deletions(-)

diff --git a/R/allDA.R b/R/allDA.R
index c4e8e32..7377347 100644
--- a/R/allDA.R
+++ b/R/allDA.R
@@ -2,8 +2,8 @@
 #'
 #' Run many differential abundance and expression tests at a time, to easily compare their results
 #' 
-#' mva and bay are excluded by default, as they often are slow.
-#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
+#' mva is excluded by default, as it is slow.
+#' @param data Either a data.frame with counts/abundances, OR a \code{phyloseq} object. If a data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
 #' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses
 #' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation.
 #' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}
@@ -16,6 +16,7 @@
 #' @param out.all If TRUE models will output results and p-values from \code{anova}/\code{drop1}. If FALSE will output results for 2. level of the \code{predictor}. If NULL (default) set as TRUE for multi-class \code{predictor} and FALSE otherwise
 #' @param alpha q-value threshold for calling significance. Default 0.1
 #' @param core.check If TRUE (default) will make an interactive check that the amount of cores specified are desired. Only if \code{cores>20}. This is to ensure that the function doesn't automatically overloads a server with workers.  
+#' @param verbose If TRUE will print informative messages
 #' @return A list of results:
 #' \itemize{
 #'  \item raw - A data.frame with raw p-values from all methods
@@ -27,7 +28,17 @@
 #' 
 #' @export
 
-allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb","per","adx","sam","qua","fri","znb","zpo","vli","qpo","poi","pea","spe","wil","ttt","ltt","ltt2","erq","ere","erq2","ere2","msf","zig","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","tta","ttc","aoa","aoc","lma","lmc","lia","lic"), relative = TRUE, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE){
+allDA <- function(data, predictor, paired = NULL, covars = NULL,
+                  tests = c("bay","ds2","ds2x","per","adx","znb","zpo","msf","zig",
+                            "erq","erq2","neb","qpo","poi","sam",
+                            "lrm","llm","llm2","lma","lmc",
+                            "ere","ere2","pea","spe",
+                            "wil","kru","qua","fri",
+                            "ttt","ltt","ltt2","tta","ttc",
+                            "aov","lao","lao2","aoa","aoc",
+                            "vli","lim","lli","lli2","lia","lic"),
+                  relative = TRUE, cores = (detectCores()-1), rng.seed = 123,
+                  p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){
 
   stopifnot(exists("data"),exists("predictor"))
   # Check for servers
@@ -49,6 +60,10 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
     count_table <- data
   }
 
+  # Coerce data
+  if(!is.null(paired)) paired <- as.factor(paired)
+  count_table <- as.data.frame(count_table)
+
   # Checks
   if(relative) if(!isTRUE(all(unlist(count_table) == floor(unlist(count_table))))) stop("Count_table must only contain integer values when relative=TRUE")
   if(min(count_table) < 0) stop("Count_table contains negative values!")
@@ -57,7 +72,7 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
   if(length(unique(predictor)) < 2) stop("predictor should have at least two levels")
   
   # Remove Features not present in any samples
-  if(sum(rowSums(count_table) == 0) != 0) message(paste(sum(rowSums(count_table) == 0),"empty features removed"))
+  if(sum(rowSums(count_table) == 0) != 0 && verbose) message(paste(sum(rowSums(count_table) == 0),"empty features removed"))
   count_table <- count_table[rowSums(count_table) > 0,]
   
   # Prune tests argument
@@ -69,24 +84,24 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
   if(length(tests) == 0) stop("No tests to run!")
   
   # Set seed
-  message(paste("Seed is set to",rng.seed))
+  if(verbose) message(paste("Seed is set to",rng.seed))
   set.seed(rng.seed)
-  message(paste("Running on",cores,"cores"))
+  if(verbose) message(paste("Running on",cores,"cores"))
   
   # predictor
   if(any(is.na(predictor))) warning("Predictor contains NAs!")
   if(is.numeric(predictor)){
-    message(paste("predictor is assumed to be a quantitative variable, ranging from",min(predictor, na.rm = TRUE),"to",max(predictor, na.rm = TRUE)))
+    if(verbose)  message(paste("predictor is assumed to be a quantitative variable, ranging from",min(predictor, na.rm = TRUE),"to",max(predictor, na.rm = TRUE)))
     if(length(levels(as.factor(predictor))) == 2){
       ANSWER <- readline("The predictor is quantitative, but only contains 2 unique values. Are you sure this is correct? Enter y to proceed ")
       if(ANSWER != "y") stop("Wrap the predictor with as.factor(predictor) to treat it is a categorical variable")
     }
   } else {
     if(length(levels(as.factor(predictor))) > length(unique(predictor))) stop("predictor has more levels than unique values!")
-    message(paste("predictor is assumed to be a categorical variable with",length(unique(predictor)),"levels:",paste(levels(as.factor(predictor)),collapse = ", ")))
+    if(verbose) message(paste("predictor is assumed to be a categorical variable with",length(unique(predictor)),"levels:",paste(levels(as.factor(predictor)),collapse = ", ")))
   }
   if(!is.null(paired)){
-    message(paste("The paired variable has",length(unique(paired)),"levels"))
+    if(verbose) message(paste("The paired variable has",length(unique(paired)),"levels"))
     if(length(unique(paired)) < 5) warning("The paired variable has less than 5 levels. Mixed-effect models are excluded")
   }
 
@@ -102,9 +117,9 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
     for(i in 1:length(covars)){
       if(any(is.na(covars[[i]]))) warning(names(covars)[i],"contains NAs!")
       if(is.numeric(covars[[i]][1])){
-        message(paste(names(covars)[i],"is assumed to be a quantitative variable, ranging from",min(covars[[i]], na.rm = TRUE),"to",max(covars[[i]], na.rm = TRUE)))
+        if(verbose) message(paste(names(covars)[i],"is assumed to be a quantitative variable, ranging from",min(covars[[i]], na.rm = TRUE),"to",max(covars[[i]], na.rm = TRUE)))
       } else {
-        message(paste(names(covars)[i],"is assumed to be a categorical variable with",length(unique(covars[[i]])),"levels:",paste(levels(as.factor(covars[[i]])),collapse = ", ")))
+        if(verbose) message(paste(names(covars)[i],"is assumed to be a categorical variable with",length(unique(covars[[i]])),"levels:",paste(levels(as.factor(covars[[i]])),collapse = ", ")))
       }
     }
   }
@@ -117,7 +132,7 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
   }
   
   # Run tests
-  cat(paste("Running",length(tests),"methods...\n"))
+  if(verbose) cat(paste("Running",length(tests),"methods...\n"))
   # Progress bar
   pb <- txtProgressBar(max = length(tests), style = 3)
   progress <- function(n) setTxtProgressBar(pb, n)
@@ -207,17 +222,17 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("neb"
   results <- results[!sapply(results,is.null)]
   if(length(names(results)) != length(tests)){
     if(length(tests) - length(names(results)) == 1){
-      message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"was excluded due to failure"))
+      if(verbose) message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"was excluded due to failure"))
     } else {
-      message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"were excluded due to failure"))
+      if(verbose) message(paste(paste(tests[!tests %in% names(results)],collapse = ", "),"were excluded due to failure"))
     }
     
     # Produce informative messages
     if(all(tests[!tests %in% unique(gsub(".*_","",names(results)))] == "sam")){
-      message("sam usually fails if some samples has too many zeroes")
+      if(verbose) message("sam usually fails if some samples has too many zeroes")
     }
     if(all(c("sam","ere2","erq2","ds2x") %in% tests[!tests %in% unique(gsub(".*_","",names(results)))])){
-      message("sam, ere2, erq2 and ds2x usually fails if all features contain at least one zero")
+      if(verbose) message("sam, ere2, erq2 and ds2x usually fails if all features contain at least one zero")
     }
     
     tests <- names(results)
diff --git a/R/powerDA.R b/R/powerDA.R
index 55dd67e..f8ba198 100644
--- a/R/powerDA.R
+++ b/R/powerDA.R
@@ -1,7 +1,7 @@
 #' Estimating (empirical) statistical power
 #'
 #' Estimating (empirical) statistical power for a specific differential abundance and expression method on a specific dataset
-#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
+#' @param data Either a data.frame with counts/abundances, OR a \code{phyloseq} object. If a data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
 #' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses
 #' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation.
 #' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}
diff --git a/R/runtimeDA.R b/R/runtimeDA.R
index 04be204..b4b686c 100644
--- a/R/runtimeDA.R
+++ b/R/runtimeDA.R
@@ -20,7 +20,7 @@
 #' @export
 runtimeDA <- function(data, predictor, paired = NULL, covars = NULL, subsamples = c(500,1000,1500,2000), subsamples.slow = c(100,150,200,250), 
                       tests =  c("sam", "qua", "fri", "vli", "qpo", "pea", "wil", "ttt", "ltt", "ltt2","ere", "ere2", "msf", "zig", "lim", "lli", "lli2", "aov", "lao", "lao2", "kru", "lrm", "llm", "llm2", "spe", "aoa", "aoc", "tta", "ttc", "lma", "lmc", "lia", "lic"), 
-                      tests.slow = c("mva", "neb", "bay", "per", "zpo", "znb", "adx", "ds2", "ds2x", "poi", "erq", "erq2"), cores = (detectCores()-1), ...){
+                      tests.slow = c("mva", "neb", "bay", "per", "ds2", "ds2x", "zpo", "znb", "adx", "poi", "erq", "erq2"), cores = (detectCores()-1), ...){
   
   stopifnot(exists("data"),exists("predictor"))
 
diff --git a/R/testDA.R b/R/testDA.R
index d4088e4..b8e7c53 100644
--- a/R/testDA.R
+++ b/R/testDA.R
@@ -2,8 +2,8 @@
 #'
 #' Calculating Power, False Discovery Rates, False Positive Rates and AUC (Area Under the Receiver Operating Characteristic (ROC) Curve) for various differential abundance and expression methods
 #' 
-#' mva and bay are excluded by default, as they often are slow.
-#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
+#' mva is excluded by default, as it is slow.
+#' @param data Either a data.frame with counts/abundances, OR a \code{phyloseq} object. If a data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
 #' @param predictor The predictor of interest. Either a Factor or Numeric, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation. If the \code{predictor} is numeric it will be treated as such in the analyses
 #' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if \code{data} is a \code{phyloseq} object the name of the variable in \code{sample_data(data)} in quotation.
 #' @param covars Either a named list with covariates, OR if \code{data} is a \code{phyloseq} object a character vector with names of the variables in \code{sample_data(data)}
@@ -33,7 +33,17 @@
 #' @importFrom pROC roc
 #' @export
 
-testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests = c("neb","per","adx","sam","qua","fri","zpo","znb","vli","qpo","poi","pea","wil","ttt","ltt","ltt2","erq","erq2","ere","ere2","msf","zig","ds2","ds2x","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","spe","tta","ttc","aoa","aoc","lma","lmc","lia","lic"), relative = TRUE, effectSize = 5, k = NULL, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){
+testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20,
+                   tests = c("bay","ds2","ds2x","per","adx","znb","zpo","msf","zig",
+                             "erq","erq2","neb","qpo","poi","sam",
+                             "lrm","llm","llm2","lma","lmc",
+                             "ere","ere2","pea","spe",
+                             "wil","kru","qua","fri",
+                             "ttt","ltt","ltt2","tta","ttc",
+                             "aov","lao","lao2","aoa","aoc",
+                             "vli","lim","lli","lli2","lia","lic"),
+                   relative = TRUE, effectSize = 5, k = NULL, cores = (detectCores()-1) ,rng.seed = 123,
+                   p.adj = "fdr", args = list(), out.all = NULL, alpha = 0.1, core.check = TRUE, verbose = TRUE){
 
   stopifnot(exists("data"),exists("predictor"))
   # Check for servers
@@ -74,7 +84,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests
   if(length(unique(predictor)) < 2) stop("predictor should have at least two levels")
   
   # Remove Features not present in any samples
-  if(sum(rowSums(count_table) == 0) != 0) message(paste(sum(rowSums(count_table) == 0),"empty features removed"))
+  if(sum(rowSums(count_table) == 0) != 0 && verbose) message(paste(sum(rowSums(count_table) == 0),"empty features removed"))
   count_table <- count_table[rowSums(count_table) > 0,]
   if(nrow(count_table) <= 15) warning("Dataset contains very few features") 
   
@@ -111,13 +121,13 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests
   } 
   if(sum(k) == nrow(count_table)) stop("Set to spike all features. Can't calculate FPR or AUC. Change k argument")
   if(sum(k) > nrow(count_table)) stop("Set to spike more features than are present in the data. Change k argument")
-  if(sum(k) < 15 & sum(k) >= 10 & R <= 10) message("Few features spiked. Increase 'k' or set 'R' to more than 10 to ensure proper estimation of AUC and FPR")
-  if(sum(k) < 10 & sum(k) >= 5 & R <= 20) message("Few features spiked. Increase 'k' or set 'R' to more than 20 to ensure proper estimation of AUC and FPR")                                  
-  if(sum(k) < 5 & R <= 50) message("Very few features spiked. Increase 'k' or set 'R' to more than 50 to ensure proper estimation of AUC and FPR")
-  if(sum(k) > nrow(count_table)/2) message("Set to spike more than half of the dataset, which might give unreliable estimates, Change k argument")   
+  if(sum(k) < 15 & sum(k) >= 10 & R <= 10) warning("Few features spiked. Increase 'k' or set 'R' to more than 10 to ensure proper estimation of AUC and FPR")
+  if(sum(k) < 10 & sum(k) >= 5 & R <= 20) warning("Few features spiked. Increase 'k' or set 'R' to more than 20 to ensure proper estimation of AUC and FPR")
+  if(sum(k) < 5 & R <= 50) warning("Very few features spiked. Increase 'k' or set 'R' to more than 50 to ensure proper estimation of AUC and FPR")
+  if(sum(k) > nrow(count_table)/2) warning("Set to spike more than half of the dataset, which might give unreliable estimates, Change k argument")
                                  
   # predictor
-  if(verbose) if(any(is.na(predictor))) warning("Predictor contains NAs!")
+  if(any(is.na(predictor))) warning("Predictor contains NAs!")
   if(is.numeric(predictor[1])){
     num.pred <- TRUE
     if(verbose) message(paste("predictor is assumed to be a quantitative variable, ranging from",min(predictor, na.rm = TRUE),"to",max(predictor, na.rm = TRUE)))
@@ -279,9 +289,9 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests
   cat("\n")
   if(length(unique(gsub(".*_","",names(results)))) != length(tests)){
     if(length(tests) - length(unique(gsub(".*_","",names(results)))) == 1){
-      message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"was excluded due to failure"))
+      if(verbose) message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"was excluded due to failure"))
     } else {
-      message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"were excluded due to failure"))
+      if(verbose) message(paste(paste(tests[!tests %in% unique(gsub(".*_","",names(results)))],collapse = ", "),"were excluded due to failure"))
     }
 
     # Produce informative messages
@@ -289,7 +299,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 20, tests
       if(verbose) message("sam usually fails if some samples has too many zeroes")
     }
     if(all(c("sam","ere2","erq2","ds2x") %in% tests[!tests %in% unique(gsub(".*_","",names(results)))])){
-      if(verbose) message("These tests usually fails if all features contain at least one zero")
+      if(verbose) message("sam, ere2, erq2 and ds2x usually fails if all features contain at least one zero")
     }
       
     tests <- unique(gsub(".*_","",names(results)))