New version - update samr

Russel88 · Oct 17, 2017 · 3049174 · 3049174
1 parent 291ace7
commit 3049174
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 40 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: DAtest
 Title: Comparing Differential Abundance methods
-Version: 2.6.3
+Version: 2.6.4
 Authors@R: person("Jakob", "Russel", email = "[email protected]", role = c("aut", "cre"))
 Description: What the title says.
 Depends: R (>= 3.2.5)

diff --git a/R/testDA.R b/R/testDA.R
@@ -244,7 +244,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests
 
   # Run the tests in parallel
   results <- foreach(i = tests.par , .options.snow = opts) %dopar% {
-    
+
     t1.sub <- proc.time()
 
     # Extract run info
@@ -385,15 +385,18 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests
         if(min.sig == Inf) min.sig <- max(samdf$Score)+1
         samdf$pval <- 1/samdf$Score * 0.05/(1/min.sig)
       } else {
-        suppressWarnings(min.up <- min(samdf[samdf$Sig.up == "Yes","Score"]))
-        suppressWarnings(max.lo <- max(samdf[samdf$Sig.lo == "Yes","Score"]))
-        if(min.up == Inf) min.up <- max(samdf$Score)+1
-        if(max.lo == -Inf) max.lo <- min(samdf$Score)-1
-        sam.inv.up <- 1/samdf$Score * 0.05/(1/min.up)
-        sam.inv.lo <- 1/samdf$Score * 0.05/(1/max.lo)
-        samdf$pval <- apply(cbind(sam.inv.up,sam.inv.lo),1,max)
+        samdf$ScoreRank <- rank(samdf$Score)
+        suppressWarnings(min.up <- min(samdf[samdf$Sig.up == "Yes","ScoreRank"]))
+        suppressWarnings(max.lo <- max(samdf[samdf$Sig.lo == "Yes","ScoreRank"]))
+        if(effectSize >= 1){
+          if(min.up == Inf) min.up <- max(samdf$ScoreRank)+1
+          samdf$pval <- 1/samdf$ScoreRank * 0.05/(1/min.up)
+        } 
+        if(effectSize < 1){
+          if(max.lo == -Inf) max.lo <- 0.5
+          samdf$pval <- samdf$ScoreRank * 0.05/max.lo
+        } 
       }
-
       samdf$pval.adj <- samdf$pval
       res.sub[paste0(r,"_","sam")] <- NULL
       res.names <- names(res.sub)

diff --git a/R/zzz.R b/R/zzz.R
@@ -1,6 +1,6 @@
 
 .onLoad <- function(libname, pkgname){
-  message("DAtest version 2.6.3")
+  message("DAtest version 2.6.4")
 
   # Fix samr problem
   #if(.Platform$OS.type == "windows"){

diff --git a/README.md b/README.md
@@ -252,10 +252,10 @@ sum of the samples for "ttt", "ltt", "ltt2", "wil", "per", "lrm", "llm",
 "llm2", "lim", "lli", "lli2", "pea", "spe", "aov", "lao", "lao2" and
 "kru", and there will NOT be an offset of log(librarySize) for "neb",
 "poi", "qpo", "zpo" and "znb". Furthermore, tests that are specifically
-designed to handle the problem of compositionality are turned off, e.g.
-DESeq2, EdgeR, MetagenomeSeq, ANCOM, RAIDA, ALDEx2 and baySeq.
-Therefore, the `data` is analysed as provided, except for the tests that
-log-transform the data: "ltt", "llm", "lli" and "lao".
+designed to handle sequence data are turned off: DESeq2, EdgeR,
+MetagenomeSeq, ANCOM, RAIDA, ALDEx2 and baySeq. Therefore, the `data` is
+analysed as provided, except for the tests that log-transform the data:
+"ltt", "llm", "lli" and "lao".
 
 ### *If you have covariates:*
 
@@ -284,24 +284,6 @@ the same order as columns in `data`):
     # Average run times of each method:
     mytest$run.times
 
-It is advised to also choose a method based on the p-value histograms of
-the methods. Many methods will likely have almost similar AUCs, and
-looking at the p-value histograms can help to differentiate the methods.
-This will plot p-value histograms for all the non-spiked features:
-
-    plot(mytest, p = TRUE)
-
-This distribution should theoretically be uniform (flat) (except for the
-permutation test due its time-saving algorithm). Multiple-correction
-methods for controlling the false discovery rate, e.g. "fdr" / "BH",
-assume a uniform distribution (given all null hypotheses are true), and
-will not work properly if the p-value distribution deviates greatly from
-uniform. The calculations of AUC does not take this fact into account.
-
-Check out [this nice
-blog](http://varianceexplained.org/statistics/interpreting-pvalue-histogram/)
-if you want to know more on how to interpret these histograms.
-
 **Note:**
 
 As ANCOM and SAMseq do not output p-values, AUC and Spike.detect.rate
@@ -311,13 +293,14 @@ detection/significance calling: Pseudo p-value = the inverse
 statistic/score, normalized such that, of the detected ("significant")
 features, the feature with the lowest statistic/score has a pseudo
 p-value = 0.05. Higher statistic/score gives lower pseudo p-value and
-vice versa. For SAMseq it is done seperately on the positive and
-negative scores. FPR is also based on pseudo p-values for "anc" and
-"sam", but as these cannot be adjusted as nominal p-values, FPR for
-these methods is the final false discovery rate and we should expect an
-FPR = 0 for these two methods, unless you are willing to accept some
-false positives. This can be tuned with the `sig` ("anc") and
-`fdr.output` ("sam") arguments.
+vice versa. For SAMseq, it is done on the ranked scores, and if
+effectSize is below 1 the scores are not inversed such that high
+negative scores gives low pseudo p-values. FPR is also based on pseudo
+p-values for "anc" and "sam", but as these cannot be adjusted as nominal
+p-values, FPR for these methods is the final false discovery rate and we
+should expect an FPR = 0 for these two methods, unless you are willing
+to accept some false positives. This can be tuned with the `sig` ("anc")
+and `fdr.output` ("sam") arguments.
 
 P-values for baySeq are defined as 1 - posterior likelihoods.
 
@@ -611,6 +594,10 @@ argument, it will try to run a t-test and fail miserably.
 
     View(mytest$results[[1]]["ere"])
 
+#### Plot p-value histograms for all the non-spiked features:
+
+    plot(mytest, p = TRUE)
+
 ### Passing arguments to the different tests
 
 Additional arguments can be passed to the internal functions with the