From 3049174ea8e53c26e630a9d2c57cc218d842f271 Mon Sep 17 00:00:00 2001
From: Jakob Russel <cmf483@science.domain>
Date: Tue, 17 Oct 2017 16:49:19 +0200
Subject: [PATCH] New version - update samr

---
 DESCRIPTION |  2 +-
 R/testDA.R  | 21 ++++++++++++---------
 R/zzz.R     |  2 +-
 README.md   | 45 ++++++++++++++++-----------------------------
 4 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index cc57755..ee4314d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: DAtest
 Title: Comparing Differential Abundance methods
-Version: 2.6.3
+Version: 2.6.4
 Authors@R: person("Jakob", "Russel", email = "russel2620@gmail.com", role = c("aut", "cre"))
 Description: What the title says.
 Depends: R (>= 3.2.5)
diff --git a/R/testDA.R b/R/testDA.R
index 24fcac3..6608116 100644
--- a/R/testDA.R
+++ b/R/testDA.R
@@ -244,7 +244,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests
 
   # Run the tests in parallel
   results <- foreach(i = tests.par , .options.snow = opts) %dopar% {
-    
+
     t1.sub <- proc.time()
     
     # Extract run info
@@ -385,15 +385,18 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests
         if(min.sig == Inf) min.sig <- max(samdf$Score)+1
         samdf$pval <- 1/samdf$Score * 0.05/(1/min.sig)
       } else {
-        suppressWarnings(min.up <- min(samdf[samdf$Sig.up == "Yes","Score"]))
-        suppressWarnings(max.lo <- max(samdf[samdf$Sig.lo == "Yes","Score"]))
-        if(min.up == Inf) min.up <- max(samdf$Score)+1
-        if(max.lo == -Inf) max.lo <- min(samdf$Score)-1
-        sam.inv.up <- 1/samdf$Score * 0.05/(1/min.up)
-        sam.inv.lo <- 1/samdf$Score * 0.05/(1/max.lo)
-        samdf$pval <- apply(cbind(sam.inv.up,sam.inv.lo),1,max)
+        samdf$ScoreRank <- rank(samdf$Score)
+        suppressWarnings(min.up <- min(samdf[samdf$Sig.up == "Yes","ScoreRank"]))
+        suppressWarnings(max.lo <- max(samdf[samdf$Sig.lo == "Yes","ScoreRank"]))
+        if(effectSize >= 1){
+          if(min.up == Inf) min.up <- max(samdf$ScoreRank)+1
+          samdf$pval <- 1/samdf$ScoreRank * 0.05/(1/min.up)
+        } 
+        if(effectSize < 1){
+          if(max.lo == -Inf) max.lo <- 0.5
+          samdf$pval <- samdf$ScoreRank * 0.05/max.lo
+        } 
       }
-      
       samdf$pval.adj <- samdf$pval
       res.sub[paste0(r,"_","sam")] <- NULL
       res.names <- names(res.sub)
diff --git a/R/zzz.R b/R/zzz.R
index 73b40ed..b571746 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -1,6 +1,6 @@
 
 .onLoad <- function(libname, pkgname){
-  message("DAtest version 2.6.3")
+  message("DAtest version 2.6.4")
   
   # Fix samr problem
   #if(.Platform$OS.type == "windows"){
diff --git a/README.md b/README.md
index e6d5b34..6d8202e 100644
--- a/README.md
+++ b/README.md
@@ -252,10 +252,10 @@ sum of the samples for "ttt", "ltt", "ltt2", "wil", "per", "lrm", "llm",
 "llm2", "lim", "lli", "lli2", "pea", "spe", "aov", "lao", "lao2" and
 "kru", and there will NOT be an offset of log(librarySize) for "neb",
 "poi", "qpo", "zpo" and "znb". Furthermore, tests that are specifically
-designed to handle the problem of compositionality are turned off, e.g.
-DESeq2, EdgeR, MetagenomeSeq, ANCOM, RAIDA, ALDEx2 and baySeq.
-Therefore, the `data` is analysed as provided, except for the tests that
-log-transform the data: "ltt", "llm", "lli" and "lao".
+designed to handle sequence data are turned off: DESeq2, EdgeR,
+MetagenomeSeq, ANCOM, RAIDA, ALDEx2 and baySeq. Therefore, the `data` is
+analysed as provided, except for the tests that log-transform the data:
+"ltt", "llm", "lli" and "lao".
 
 ### *If you have covariates:*
 
@@ -284,24 +284,6 @@ the same order as columns in `data`):
     # Average run times of each method:
     mytest$run.times
 
-It is advised to also choose a method based on the p-value histograms of
-the methods. Many methods will likely have almost similar AUCs, and
-looking at the p-value histograms can help to differentiate the methods.
-This will plot p-value histograms for all the non-spiked features:
-
-    plot(mytest, p = TRUE)
-
-This distribution should theoretically be uniform (flat) (except for the
-permutation test due its time-saving algorithm). Multiple-correction
-methods for controlling the false discovery rate, e.g. "fdr" / "BH",
-assume a uniform distribution (given all null hypotheses are true), and
-will not work properly if the p-value distribution deviates greatly from
-uniform. The calculations of AUC does not take this fact into account.
-
-Check out [this nice
-blog](http://varianceexplained.org/statistics/interpreting-pvalue-histogram/)
-if you want to know more on how to interpret these histograms.
-
 **Note:**
 
 As ANCOM and SAMseq do not output p-values, AUC and Spike.detect.rate
@@ -311,13 +293,14 @@ detection/significance calling: Pseudo p-value = the inverse
 statistic/score, normalized such that, of the detected ("significant")
 features, the feature with the lowest statistic/score has a pseudo
 p-value = 0.05. Higher statistic/score gives lower pseudo p-value and
-vice versa. For SAMseq it is done seperately on the positive and
-negative scores. FPR is also based on pseudo p-values for "anc" and
-"sam", but as these cannot be adjusted as nominal p-values, FPR for
-these methods is the final false discovery rate and we should expect an
-FPR = 0 for these two methods, unless you are willing to accept some
-false positives. This can be tuned with the `sig` ("anc") and
-`fdr.output` ("sam") arguments.
+vice versa. For SAMseq, it is done on the ranked scores, and if
+effectSize is below 1 the scores are not inversed such that high
+negative scores gives low pseudo p-values. FPR is also based on pseudo
+p-values for "anc" and "sam", but as these cannot be adjusted as nominal
+p-values, FPR for these methods is the final false discovery rate and we
+should expect an FPR = 0 for these two methods, unless you are willing
+to accept some false positives. This can be tuned with the `sig` ("anc")
+and `fdr.output` ("sam") arguments.
 
 P-values for baySeq are defined as 1 - posterior likelihoods.
 
@@ -611,6 +594,10 @@ argument, it will try to run a t-test and fail miserably.
 
     View(mytest$results[[1]]["ere"])
 
+#### Plot p-value histograms for all the non-spiked features:
+
+    plot(mytest, p = TRUE)
+
 ### Passing arguments to the different tests
 
 Additional arguments can be passed to the internal functions with the