From e4a829d5c27e7dd340aee74bdc2b511597ba42e5 Mon Sep 17 00:00:00 2001
From: a113n <allenyuchishing@gmail.com>
Date: Mon, 11 Jun 2018 00:55:46 +0100
Subject: [PATCH 01/16] Update CLASS_exstra_score.R

Fix the bug of reading groups.samples

Former-commit-id: 7837a90f28523001fe14a9adf01f589e08ab81e3
---
 R/CLASS_exstra_score.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/CLASS_exstra_score.R b/R/CLASS_exstra_score.R
index bf9b975..2fc4f1e 100644
--- a/R/CLASS_exstra_score.R
+++ b/R/CLASS_exstra_score.R
@@ -39,7 +39,7 @@ strs_read_ <- function(file, database, groups.regex = NULL, groups.samples = NUL
     }
   }
   if(!is.null(groups.samples)) {
-    # using regex for groups
+    # Specify the group of samples directly via groups.samples
     assert("groups.samples must be a list if used, with vectors with names of at least one of 'case', 'control' or 'null'.", 
       is.list(groups.samples), 
       length(groups.samples) > 0,
@@ -55,7 +55,7 @@ strs_read_ <- function(file, database, groups.regex = NULL, groups.samples = NUL
         groups_all[sample == counts$sample] <- "case"
       }      
     } else {
-      groups_all <- factor(rep(NA, dim(counts)[1]), levels = names(groups.regex))
+      groups_all <- factor(rep(NA, dim(counts)[1]), levels = names(groups.samples))
       for(group.name in names(groups.samples)) {
         for(sample in groups.samples[[group.name]]) {
           groups_all[sample == counts$sample] <- group.name
@@ -341,4 +341,4 @@ as.exstra_score <- function(x, copy = FALSE) {
     x <- copy.exstra_score(x)
   }
   structure(list(data = x$data, db = x$db, input_type = x$input_type, samples = x$samples), class = c("exstra_score", "exstra_db"))
-}
\ No newline at end of file
+}

From 3c783dda4d726fdb8bbde64d8d3954f3cae21d2b Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 22 Jun 2018 16:01:05 +0800
Subject: [PATCH 02/16] private function quant_statistic_sampp() now allows
 specific case samples to be given

Former-commit-id: 13fcc9ea8a20e9b332c4b57825f868ca671325bb
---
 R/private_functions.R | 39 ++++++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/R/private_functions.R b/R/private_functions.R
index d275a42..8b5fc2a 100644
--- a/R/private_functions.R
+++ b/R/private_functions.R
@@ -558,11 +558,15 @@ quant_statistic <- function(qmmat, sample = 1, quant = 0.5, trim = 0.15,
   cumsum(rev(ts))[qs] / qs # we divide by the number of t statistics being summed
 }
 
-quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, ...) {
+quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, 
+  case_samples = NULL,
+  ...) {
   # get the quantile statistic for multiple samples
   # qmmat: a quantile matrix from the make_quantiles_matrix() function
   # sample: samples to get the statistic of. If NULL, give all samples in qmmat
   # qs: keeps the top number of quantiles, you probably do not want to use this
+  # case_samples: if not NULL, only calculate for these samples in case-control setting.
+  #               Other cases are excluded in each calculation.
   # ... further arguments to quant_statistic(), most interesting is:
   #     quant keeps quantiles above its value only when qs is not specified. The default 
   #         keeps all values above the median (quant = 0.5)
@@ -575,16 +579,33 @@ quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, ...) {
   assert("sample is not a character, numeric or null", is.null(sample) || is.character(sample) || is.numeric(sample))
   assert("qs is not numeric", is.null(qs) || is.numeric(qs))
   assert("qs is not single", is.null(qs) || length(qs) == 1)
-  if(is.null(sample)) {
-    sample <- seq_len(dim(qmmat)[1])
-  }
-  t_out <- rep(NA, length(sample))
+  
   ti <- 0
-  for(samp in sample) {
-    ti <- ti + 1
-    t_out[ti] <- quant_statistic(qmmat, sample = samp, qs = qs, ...)
+  if(!is.null(case_samples)) {
+    if(is.null(sample)) {
+      sample <- case_samples
+    }
+    assert("All case samples should be in qmmat", all(case_samples %in% rownames(qmmat)))
+    t_out <- rep(NA, length(sample) - length(case_samples))
+    for(samp in sample) {
+      ti <- ti + 1
+      #TODO: get only the correct qmmat columns
+      qmmat0 <- qmmat
+      t_out[ti] <- quant_statistic(qmmat0, sample = samp, qs = qs, 
+        subject_in_background = FALSE, ...)
+    }
+  } else {
+    # No samples marked explicitly as cases
+    if(is.null(sample)) {
+      sample <- seq_len(dim(qmmat)[1])
+    }
+    t_out <- rep(NA, length(sample))
+    for(samp in sample) {
+      ti <- ti + 1
+      t_out[ti] <- quant_statistic(qmmat, sample = samp, qs = qs, ...)
+    }
   }
-  names(t_out) <- rownames(qmmat)
+  names(t_out) <- sample
   t_out
 }
 

From 152afb12361454c330a9f610512013c2ce333855 Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 22 Jun 2018 16:27:30 +0800
Subject: [PATCH 03/16] had to fix a bug I made with the old usage

Former-commit-id: 1adaf093ee4eee10b71dac1873e76e09be9d4758
---
 R/private_functions.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/R/private_functions.R b/R/private_functions.R
index 8b5fc2a..220c8f4 100644
--- a/R/private_functions.R
+++ b/R/private_functions.R
@@ -592,8 +592,9 @@ quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL,
       #TODO: get only the correct qmmat columns
       qmmat0 <- qmmat
       t_out[ti] <- quant_statistic(qmmat0, sample = samp, qs = qs, 
-        subject_in_background = FALSE, ...)
+        subject_in_background = FALSE, ...) # TODO: maybe trim = 0?
     }
+    names(t_out) <- sample
   } else {
     # No samples marked explicitly as cases
     if(is.null(sample)) {
@@ -604,8 +605,8 @@ quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL,
       ti <- ti + 1
       t_out[ti] <- quant_statistic(qmmat, sample = samp, qs = qs, ...)
     }
+    names(t_out) <- rownames(qmmat) # may be a bug if only some samples are required
   }
-  names(t_out) <- sample
   t_out
 }
 

From 054b6d081a749d7a8275b527426f31db0abff2fe Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 22 Jun 2018 17:07:41 +0800
Subject: [PATCH 04/16] case_control tsum_test now gives no trim. Excludes
 other samples properly now.

Former-commit-id: fc79e96594a97b4c1526b2e23fb4044f5eeb18f6
---
 R/private_functions.R |  5 +++--
 R/tsum_test.R         | 14 +++++++++++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/R/private_functions.R b/R/private_functions.R
index 220c8f4..09908da 100644
--- a/R/private_functions.R
+++ b/R/private_functions.R
@@ -587,10 +587,11 @@ quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL,
     }
     assert("All case samples should be in qmmat", all(case_samples %in% rownames(qmmat)))
     t_out <- rep(NA, length(sample) - length(case_samples))
+    control_samples <- rownames(qmmat)[! rownames(qmmat) %in% case_samples]
     for(samp in sample) {
       ti <- ti + 1
-      #TODO: get only the correct qmmat columns
-      qmmat0 <- qmmat
+      # Get only the correct qmmat columns
+      qmmat0 <- qmmat[c(samp, control_samples),]
       t_out[ti] <- quant_statistic(qmmat0, sample = samp, qs = qs, 
         subject_in_background = FALSE, ...) # TODO: maybe trim = 0?
     }
diff --git a/R/tsum_test.R b/R/tsum_test.R
index 6930800..ea77d4f 100644
--- a/R/tsum_test.R
+++ b/R/tsum_test.R
@@ -8,6 +8,7 @@
 #' @param trim Trim this proportion of data points at each quantile 
 #'             level (rounded up). Must be at least 0 and less than 0.5, but values close
 #'             to 0.5 may remove all samples and hence result in an error.  
+#' @param trim.cc Trim value used in case-control analysis. Default of 0.
 #' @param min.quant Only quantiles above this value are used in constructing the statistic.  
 #' @param give.pvalue Whether to calculate the p-value. As this can be slow it can be
 #'                    useful to turn off if only the t sum statistics are required. 
@@ -16,6 +17,8 @@
 #'          decimal fractions. 
 #' @param correction Correction method of p_value() function.
 #' @param alpha Signficance level of p_value() function.
+#' @param case_control If TRUE, only calculate for samples designated cases. Otherwise
+#'                 all samples are used to calculate the background distribution.
 #' @param parallel Use the parallel package when simulating the distribution, creating the
 #'                 required cluster. 
 #'                 If cluster is specified then this option makes no difference. 
@@ -56,11 +59,13 @@
 #' @export
 tsum_test <- function(strscore, 
   trim = 0.15,
+  trim.cc = 0,
   min.quant = 0.5,
   give.pvalue = TRUE, 
   B = 9999, 
   correction = c("bf", "locus", "uncorrected"),
   alpha = 0.05,
+  case_control = FALSE, 
   parallel = FALSE, # TRUE for cluster
   cluster_n = NULL, # Cluster size if cluster == NULL. When NULL, #threads - 1 (but always at least 1)
   cluster = NULL # As created by the parallel package. If cluster == NULL and parallel == TRUE, then a
@@ -105,7 +110,14 @@ tsum_test <- function(strscore,
     # message("Generating T sum statistics for ", loc)
     qm <- make_quantiles_matrix(strscore, loc = loc, sample = NULL, read_count_quant = 1, 
       method = "quantile7", min.n = 3)
-    T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim) # quant at default of 0.5
+    if(case_control) {
+      # Only calculating for case samples
+      T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim.cc,
+        case_samples = strscore$samples[group == "case", sample]) 
+    } else {
+      # Using all samples as the background:
+      T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim) # quant at default of 0.5
+    }
     T_stats_list[[loc]] <- data.table(sample = names(T_stats_loc), tsum = T_stats_loc)
   }
   T_stats <- rbindlist(T_stats_list, idcol = "locus")

From 15dbf6792cdce14e7c37aa900c9a2cf3c3fbfab5 Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 22 Jun 2018 17:08:58 +0800
Subject: [PATCH 05/16] update docs for tsum_test()

Former-commit-id: ac7f77c56e052412e1acc545b24f43057f602d88
---
 man/tsum_test.Rd | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/man/tsum_test.Rd b/man/tsum_test.Rd
index 325ffd3..cbb2c7a 100644
--- a/man/tsum_test.Rd
+++ b/man/tsum_test.Rd
@@ -4,9 +4,10 @@
 \alias{tsum_test}
 \title{Generate T sum statistics and p-values from simulation.}
 \usage{
-tsum_test(strscore, trim = 0.15, min.quant = 0.5, give.pvalue = TRUE,
-  B = 9999, correction = c("bf", "locus", "uncorrected"), alpha = 0.05,
-  parallel = FALSE, cluster_n = NULL, cluster = NULL)
+tsum_test(strscore, trim = 0.15, trim.cc = 0, min.quant = 0.5,
+  give.pvalue = TRUE, B = 9999, correction = c("bf", "locus",
+  "uncorrected"), alpha = 0.05, case_control = FALSE, parallel = FALSE,
+  cluster_n = NULL, cluster = NULL)
 }
 \arguments{
 \item{strscore}{An exstra_score object.}
@@ -15,6 +16,8 @@ tsum_test(strscore, trim = 0.15, min.quant = 0.5, give.pvalue = TRUE,
 level (rounded up). Must be at least 0 and less than 0.5, but values close
 to 0.5 may remove all samples and hence result in an error.}
 
+\item{trim.cc}{Trim value used in case-control analysis. Default of 0.}
+
 \item{min.quant}{Only quantiles above this value are used in constructing the statistic.}
 
 \item{give.pvalue}{Whether to calculate the p-value. As this can be slow it can be
@@ -28,6 +31,9 @@ decimal fractions.}
 
 \item{alpha}{Signficance level of p_value() function.}
 
+\item{case_control}{If TRUE, only calculate for samples designated cases. Otherwise
+all samples are used to calculate the background distribution.}
+
 \item{parallel}{Use the parallel package when simulating the distribution, creating the
 required cluster. 
 If cluster is specified then this option makes no difference.}

From 6201628c25b2d9162bc711c392da3fbb816614a9 Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 29 Jun 2018 15:58:26 +0800
Subject: [PATCH 06/16] First go at speeding up simulation results by making
 more use of each simulation

Former-commit-id: 1b65e973e3cb9ef3c287e60ddb7e8f4e063c03da
---
 R/private_functions.R | 26 +++++++++++++++++++-------
 R/tsum_test.R         |  3 ++-
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/R/private_functions.R b/R/private_functions.R
index 09908da..ada2f80 100644
--- a/R/private_functions.R
+++ b/R/private_functions.R
@@ -666,6 +666,7 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
   parallel = FALSE, # perform in parallel
   cluster = NULL, # cluster for parallel. If NULL, then make one
   cluster_n = 4, # cluster size if cl is NULL
+  T_stat = NULL, # T_stat named vector precomputed
   ...) {
   # Derive a simulated ECDF, and return an "ecdf" class object
   # qmmat: a quantile matrix from the make_quantiles_matrix() function
@@ -750,7 +751,9 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
   }
   simulate_quant_statistic <- function() {
     simu <- simulate_quantile_matrix()
-    do.call(quant_statistic, c(list(simu), triple_dots))
+    # Devel: this next line change completely changes the output here, for efficiency
+    # do.call(quant_statistic, c(list(simu), triple_dots))
+    do.call(quant_statistic_sampp, c(list(simu), triple_dots))
   }
   
   triple_dots <- c(list(...), list(subject_in_background = subject_in_background, 
@@ -799,9 +802,11 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
     }
   } else {
     # not performing in parallel
-    sim_T <- rep(0, B) # Don't let p-value get to zero
+    # TODO here:
+    sim_T <- rep(0, B*N) # Don't let p-value get to zero
     for(i in seq_len(B)) {
-      sim_T[i] <- simulate_quant_statistic()
+      sim_T[((i-1)*N + 1) : (i*N) ] <- simulate_quant_statistic()
+      # TODO: insert stopping-code here
     }
   }
   
@@ -810,12 +815,14 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
   comp_p_function <- ecdf(sim_T)
   
   # p-values
-  T_stat <- quant_statistic_sampp(qmmat, trim = trim, subject_in_background = subject_in_background, use_truncated = use_truncated_sd_in_quant_statistic) # TODO add ...
+  if(is.null(T_stat)) {
+    T_stat <- quant_statistic_sampp(qmmat, trim = trim, subject_in_background = subject_in_background, use_truncated = use_truncated_sd_in_quant_statistic) # TODO add ...
+  }
   # p <- 1 - comp_p_function(T_stat) + 1 / (B + 1)
   p <- rep(1.1, N)
   # calculate p-values
   for(i in seq_len(N)) {
-    p[i] = c(sum(sim_T > T_stat[i]) + 1) / (B + 1)
+    p[i] = c(sum(sim_T > T_stat[i]) + 1) / (length(sim_T) + 1)
   }
   names(p) <- names(T_stat)
   
@@ -1012,7 +1019,8 @@ sd_of_trimmed <- function(...) {
 # This runs the simulation with the given paramets
 # Use ... to pass options to simulate_ecdf_quant_statistic()
 # This also passes remaining options onto quant_statistic() and quant_statistic_sampp()
-simulation_run <- function(data, B = 99, trim = 0.15, ...) {
+simulation_run <- function(data, B = 99, trim = 0.15,
+    T_stats = NULL, ...) {
   N <- data$samples[, .N]
   L <- data$db[, .N]
   p.matrix <- matrix(nrow = N, ncol = L)
@@ -1024,7 +1032,11 @@ simulation_run <- function(data, B = 99, trim = 0.15, ...) {
     message("Simulating distribution for ", loc)
     qm_loop <- make_quantiles_matrix(data, loc, read_count_quant = 1, 
       method = "quantile7", min.n = 3)
-    xec <- simulate_ecdf_quant_statistic(qm_loop, B, ...)
+    T_stat_loc <- T_stats[locus == loc, ]
+    T_stat <- T_stat_loc[, tsum]
+    names(T_stat) <- T_stat_loc[, sample]
+    xec <- simulate_ecdf_quant_statistic(qm_loop, B, 
+      T_stat = T_stat, ...)
     p.matrix[names(xec$p), loc] <- xec$p
     qmmats[[loc]] <- qm_loop
     xecs[[loc]] <- xec
diff --git a/R/tsum_test.R b/R/tsum_test.R
index ea77d4f..7b98660 100644
--- a/R/tsum_test.R
+++ b/R/tsum_test.R
@@ -144,7 +144,8 @@ tsum_test <- function(strscore,
         parallel = parallel, # TRUE for cluster
         cluster = cluster,
         cluster_n = cluster_n,
-        trim = trim
+        trim = trim,
+        T_stats = T_stats
       ),
       error = function(x) { 
           # list(cohort = this.cohort, p.matrix = c(0, 1), error = x)

From 1af4328c4fd182d2f714c2ae151592387d4d4cde Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 29 Jun 2018 16:40:29 +0800
Subject: [PATCH 07/16] A version where the parallel and serial time-saving
 give similar results. Also removed some computations that were not given in
 the final output of tsum_test()

Former-commit-id: 88fa131333032a26144b848dcc7d8ca15d6df89e
---
 R/private_functions.R | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/R/private_functions.R b/R/private_functions.R
index ada2f80..b888ca6 100644
--- a/R/private_functions.R
+++ b/R/private_functions.R
@@ -751,8 +751,10 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
   }
   simulate_quant_statistic <- function() {
     simu <- simulate_quantile_matrix()
-    # Devel: this next line change completely changes the output here, for efficiency
-    # do.call(quant_statistic, c(list(simu), triple_dots))
+    do.call(quant_statistic, c(list(simu), triple_dots))
+  }
+  simulate_quant_statistic_sampp <- function() {
+    simu <- simulate_quantile_matrix()
     do.call(quant_statistic_sampp, c(list(simu), triple_dots))
   }
   
@@ -805,14 +807,14 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
     # TODO here:
     sim_T <- rep(0, B*N) # Don't let p-value get to zero
     for(i in seq_len(B)) {
-      sim_T[((i-1)*N + 1) : (i*N) ] <- simulate_quant_statistic()
+      sim_T[((i-1)*N + 1) : (i*N) ] <- simulate_quant_statistic_sampp()
       # TODO: insert stopping-code here
     }
   }
   
   
-  
-  comp_p_function <- ecdf(sim_T)
+  # Following is not relevant to output
+  # comp_p_function <- ecdf(sim_T)
   
   # p-values
   if(is.null(T_stat)) {
@@ -822,12 +824,13 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
   p <- rep(1.1, N)
   # calculate p-values
   for(i in seq_len(N)) {
-    p[i] = c(sum(sim_T > T_stat[i]) + 1) / (length(sim_T) + 1)
+    p[i] = (sum(sim_T > T_stat[i]) + 1) / (length(sim_T) + 1)
   }
   names(p) <- names(T_stat)
   
   # generate ECDF and output
-  list(ecdf = comp_p_function, p = p, sim_T = sim_T, T_stat = T_stat)
+  # list(ecdf = comp_p_function, p = p, sim_T = sim_T, T_stat = T_stat)
+  list(p = p, sim_T = sim_T, T_stat = T_stat)
 }
 
 

From 95f52ae3d07bea38e554d3c469cc9fe57395f2ac Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 29 Jun 2018 17:06:20 +0800
Subject: [PATCH 08/16] multi sample version made. Little speed benefits have
 been made, so may discard the previous few changes.

Former-commit-id: 01d9c556f5a0c36b6c70a303ee15d8bd2ffddf64
---
 R/private_functions.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/R/private_functions.R b/R/private_functions.R
index b888ca6..dc00a0d 100644
--- a/R/private_functions.R
+++ b/R/private_functions.R
@@ -789,6 +789,7 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
         "sort_sim_qm", 
         "N", "M", "mu_vec", "se_vec", 
         "simulate_quant_statistic",
+        "simulate_quant_statistic_sampp",
         "simulate_quantile_matrix", 
         "quant_statistic",
         "trim_vector",
@@ -798,7 +799,9 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
       envir = environment()
     )
     # Run replicate
-    sim_T <- parReplicate(cluster, B, simulate_quant_statistic())
+    sim_T <- as.vector(
+            parReplicate(cluster, B, simulate_quant_statistic_sampp())
+    )
     if(cluster_stop) {
       stopCluster(cluster)
     }

From 6bc183b3236e50a21f206f4efa5b064374573766 Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Sat, 7 Jul 2018 18:36:33 +0800
Subject: [PATCH 09/16] tsum_test() now includes a standard error estimate of
 the p-value generated

Former-commit-id: d30bb577c72bc07285d95abce7f182e1d799fbd8
---
 R/tsum_test.R | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/R/tsum_test.R b/R/tsum_test.R
index 7b98660..8277025 100644
--- a/R/tsum_test.R
+++ b/R/tsum_test.R
@@ -164,11 +164,15 @@ tsum_test <- function(strscore,
     pvals <- NULL
   }
   # Prepare output
-  exstra_tsum_new_(strscore, tsum = T_stats, p.values = pvals, 
+  outtsum <- exstra_tsum_new_(strscore, tsum = T_stats, p.values = pvals, 
     qmats = sim.results$qmmats, xecs = sim.results$xecs,
     correction = correction,
     alpha = alpha, 
     args = list(trim = trim, min.quant = min.quant, B = B))
+  if(give.pvalue) {
+    outtsum$stats[, p.value.sd := sqrt(p.value * (1 - p.value) / (B * strscore$samples[, .N]))]
+  }
+  outtsum
 }
 
 # test code:

From 0a5a4e79c5a2b7bada8978a3d29b2d62d2e357b4 Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Sun, 8 Jul 2018 18:37:52 +0800
Subject: [PATCH 10/16] Correction for SE(p.value) so that it does not equal 0
 when p.value = 1

Former-commit-id: e37cb104d22c77ede5317e3bbad4eeda6347a6b9
---
 R/tsum_test.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/R/tsum_test.R b/R/tsum_test.R
index 8277025..d4520bd 100644
--- a/R/tsum_test.R
+++ b/R/tsum_test.R
@@ -170,7 +170,10 @@ tsum_test <- function(strscore,
     alpha = alpha, 
     args = list(trim = trim, min.quant = min.quant, B = B))
   if(give.pvalue) {
-    outtsum$stats[, p.value.sd := sqrt(p.value * (1 - p.value) / (B * strscore$samples[, .N]))]
+    Nsim <- B * strscore$samples[, .N]
+    outtsum$stats[, p.value.sd := 
+        sqrt(p.value * ((Nsim + 2)/(Nsim + 1) - p.value) / Nsim)
+      ]
   }
   outtsum
 }

From 1186e123f4c030a94e9eefe8bc6752527faa0dfe Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Mon, 9 Jul 2018 17:18:31 +0800
Subject: [PATCH 11/16] Documentation improvement, and p_values() now describes
 correction options better and added a number of "loci" BF correction method.

Former-commit-id: a298684c52751b90776dfe84cb8c00d4a005558b
---
 R/p_values.R                     |  9 +++++++--
 R/tsum_test.R                    |  4 ++--
 examples/exSTRa_score_analysis.R | 11 +++++++----
 man/p_values.Rd                  |  3 ++-
 man/tsum_test.Rd                 |  2 +-
 vignettes/exSTRa.Rmd             | 13 +++++++++----
 6 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/R/p_values.R b/R/p_values.R
index a63f425..d8ce31a 100644
--- a/R/p_values.R
+++ b/R/p_values.R
@@ -5,7 +5,8 @@
 #' @param tsum An exstra_tsum object. 
 #' @param correction Correction method to use. Use "bf" or TRUE for Bonferroni correction, and 
 #'                   "uncorrected" or FALSE for no correction. ("bonferroni" is also acceptable).
-#'                   "locus" is Bonferroni correction by locus.
+#'                   "samples" is Bonferroni correction by the number of tests (samples) at each locus.
+#'                   "loci" is Bonferroni correction by the number of loci.
 #'                   
 #' @param alpha Significance level alpha.
 #' @param only.sig If TRUE, only return significant results.
@@ -60,10 +61,14 @@ p_values <- function(
     out.table[, signif := p.value <= alpha / n_tests ]
   } else if ((is.logical(correction[1]) && ! correction[1]) || correction[1] == "uncorrected") {
     out.table[, signif := p.value <= alpha ]
-  } else if (correction[1] == "locus") {
+  } else if (correction[1] == "samples") {
     out.table[!is.na(p.value), N := .N, by = locus]
     out.table[, signif := p.value <= alpha / N ]
     out.table[, N := NULL]
+  } else if (correction[1] == "loci") {
+    out.table[!is.na(p.value), N := .N, by = sample]
+    out.table[, signif := p.value <= alpha / N ]
+    out.table[, N := NULL]
   } else {
     stop("Unknown correction method ", correction[1])
   }
diff --git a/R/tsum_test.R b/R/tsum_test.R
index d4520bd..8efd467 100644
--- a/R/tsum_test.R
+++ b/R/tsum_test.R
@@ -62,8 +62,8 @@ tsum_test <- function(strscore,
   trim.cc = 0,
   min.quant = 0.5,
   give.pvalue = TRUE, 
-  B = 9999, 
-  correction = c("bf", "locus", "uncorrected"),
+  B = 999, 
+  correction = c("bf", "loci", "samples", "uncorrected"),
   alpha = 0.05,
   case_control = FALSE, 
   parallel = FALSE, # TRUE for cluster
diff --git a/examples/exSTRa_score_analysis.R b/examples/exSTRa_score_analysis.R
index ca442f9..00feacc 100644
--- a/examples/exSTRa_score_analysis.R
+++ b/examples/exSTRa_score_analysis.R
@@ -60,16 +60,19 @@ plot_cols
 
 par(mfrow = c(2, 2))
 # Bonferroni correction is too severe here, so we use Bonferroni correction only on each 
-# locus.
-plot(tsum, sample_col = plot_cols, correction = "locus")
+# locus for the number of samples.
+plot(tsum, sample_col = plot_cols, correction = "samples")
+
+# Or Bonferroni correction for the number of loci tested:
+plot(tsum, sample_col = plot_cols, correction = "loci")
 
 # Give a table of each sample and locus with the p-value, and if it is significant:
-(ps <- p_values(tsum, correction = "locus"))
+(ps <- p_values(tsum, correction = "samples"))
 
 # this may be acted on directly: 
 ps[identity(signif)]
 # or with the only.signif option:
-p_values(tsum, only.signif = TRUE, correction = "locus")
+p_values(tsum, only.signif = TRUE, correction = "samples")
 
 # Give the best hit(s) for each sample:
 # TODO: what is best for display may not be the best for internal representation. 
diff --git a/man/p_values.Rd b/man/p_values.Rd
index 8ecbc7c..f2db601 100644
--- a/man/p_values.Rd
+++ b/man/p_values.Rd
@@ -12,7 +12,8 @@ p_values(tsum, correction = c("bf", "locus", "uncorrected"), alpha = 0.05,
 
 \item{correction}{Correction method to use. Use "bf" or TRUE for Bonferroni correction, and 
 "uncorrected" or FALSE for no correction. ("bonferroni" is also acceptable).
-"locus" is Bonferroni correction by locus.}
+"samples" is Bonferroni correction by the number of tests (samples) at each locus.
+"loci" is Bonferroni correction by the number of loci.}
 
 \item{alpha}{Significance level alpha.}
 
diff --git a/man/tsum_test.Rd b/man/tsum_test.Rd
index cbb2c7a..eb35619 100644
--- a/man/tsum_test.Rd
+++ b/man/tsum_test.Rd
@@ -5,7 +5,7 @@
 \title{Generate T sum statistics and p-values from simulation.}
 \usage{
 tsum_test(strscore, trim = 0.15, trim.cc = 0, min.quant = 0.5,
-  give.pvalue = TRUE, B = 9999, correction = c("bf", "locus",
+  give.pvalue = TRUE, B = 999, correction = c("bf", "loci", "samples",
   "uncorrected"), alpha = 0.05, case_control = FALSE, parallel = FALSE,
   cluster_n = NULL, cluster = NULL)
 }
diff --git a/vignettes/exSTRa.Rmd b/vignettes/exSTRa.Rmd
index a7113c4..c9a0645 100644
--- a/vignettes/exSTRa.Rmd
+++ b/vignettes/exSTRa.Rmd
@@ -211,18 +211,23 @@ with the default number of simulations (9999).
 This can be adjusted with the `B` parameter of `tsum_test()`, or a less stringent
 threshold can be used.
 Bonferroni correction is too severe here, so we can specify Bonferroni correction only 
-on each locus.
+on each locus for the number of samples tested.
 ```{r, out.width = '82%', fig.width=12, fig.height=12}
 par(mfrow = c(2, 2))
-plot(tsum, sample_col = plot_cols, correction = "locus")
+plot(tsum, sample_col = plot_cols, correction = "samples")
 ```
 
+Or Bonferroni correction may be applied for the number of loci tested.
+```{r, out.width = '82%', fig.width=12, fig.height=12}
+par(mfrow = c(2, 2))
+plot(tsum, sample_col = plot_cols, correction = "loci")
+```
 
 You may obtain a data.table of each sample and locus with the p-value, and if it is significant
 with the correction method applied.
 Here, the correction method is Bonferroni per locus.
 ```{r}
-(ps <- p_values(tsum, correction = "locus"))
+(ps <- p_values(tsum, correction = "samples"))
 ```
 
 To obtain only the significant samples, you can either use data.table subsetting:
@@ -232,7 +237,7 @@ ps[identity(signif)]
 
 or when retrieving the data.table from p_values():
 ```{r}
-p_values(tsum, only.signif = TRUE, correction = "locus")
+p_values(tsum, only.signif = TRUE, correction = "samples")
 ```
 
 

From ca6b9390986e37efb5b239af14691dca5d306595 Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 13 Jul 2018 14:53:32 +0800
Subject: [PATCH 12/16] tsum_test() removes simulation results by default now.
 Updated documentation for p_values(), tsum_test().

Former-commit-id: 5ce2d8dfac1871859f3b1f328ed0864dba047f32
---
 DESCRIPTION      |  4 ++--
 R/p_values.R     |  6 +++++-
 R/tsum_test.R    | 11 ++++++++++-
 man/p_values.Rd  |  6 +++++-
 man/tsum_test.Rd |  5 ++++-
 5 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 7fbb0c7..a14f226 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: exSTRa
 Type: Package
 Title: Expanded STR algorithm: detecting expansions in Illumina sequencing data
-Version: 0.85
-Date: 2018-06-06
+Version: 0.85.1
+Date: 2018-07-1x
 Author: Rick Tankard
 Maintainer: Rick Tankard <tankard@wehi.edu.au>
 Description: Detecting expansions with paired-end Illumina sequencing data.
diff --git a/R/p_values.R b/R/p_values.R
index d8ce31a..1807264 100644
--- a/R/p_values.R
+++ b/R/p_values.R
@@ -12,7 +12,11 @@
 #' @param only.sig If TRUE, only return significant results.
 #' @param modify If TRUE, will modify the tsum$stats table. Effectively ignored if only.sig == TRUE.
 #' @param p.matrix Matrix of p-values for internal use. Should only be used without tsum. 
-#' @return A data.table
+#' @return A \code{data.table} keyed by "locus" then "sample". 
+#'         Other columns are \code{tsum} as calculated by \code{\link{tsum_test}}, \code{p.value} (uncorrected),
+#'         \code{signif} (TRUE if significant after given correction), and 
+#'         \code{p.value.sd}, giving the standard deviation of the p-value estimate from the 
+#'         simulation.
 #' 
 #' @import magrittr
 #' @import testit
diff --git a/R/tsum_test.R b/R/tsum_test.R
index 8efd467..059b436 100644
--- a/R/tsum_test.R
+++ b/R/tsum_test.R
@@ -30,6 +30,9 @@
 #'                 If cluster is specified then this option makes no difference.
 #' @param cluster  A cluster object from the parallel package. Use if you wish to set up 
 #'                 the cluster yourself or reuse an existing cluster. 
+#' @param keep.sim.tsum For inspection of simulations. 
+#'                      If TRUE, keep all simulation Tsum statistics in output$xecs (default FALSE).
+#'                      
 #' @return An exstra_tsum object with T statistics and p-values (if calculated).
 #' 
 #' @seealso \code{\link{tsum_p_value_summary}}
@@ -68,8 +71,9 @@ tsum_test <- function(strscore,
   case_control = FALSE, 
   parallel = FALSE, # TRUE for cluster
   cluster_n = NULL, # Cluster size if cluster == NULL. When NULL, #threads - 1 (but always at least 1)
-  cluster = NULL # As created by the parallel package. If cluster == NULL and parallel == TRUE, then a
+  cluster = NULL, # As created by the parallel package. If cluster == NULL and parallel == TRUE, then a
                   # PSOCK cluster is automatically created with the parallel package.
+  keep.sim.tsum = FALSE
   ) 
 {
   # Check inputs
@@ -175,6 +179,11 @@ tsum_test <- function(strscore,
         sqrt(p.value * ((Nsim + 2)/(Nsim + 1) - p.value) / Nsim)
       ]
   }
+  if(! keep.sim.tsum) {
+    for(i in seq_along(tsum$xecs)) {
+      tsum$xecs[[i]]$sim_T <- NULL
+    }
+  }
   outtsum
 }
 
diff --git a/man/p_values.Rd b/man/p_values.Rd
index f2db601..e55347b 100644
--- a/man/p_values.Rd
+++ b/man/p_values.Rd
@@ -24,7 +24,11 @@ p_values(tsum, correction = c("bf", "locus", "uncorrected"), alpha = 0.05,
 \item{only.sig}{If TRUE, only return significant results.}
 }
 \value{
-A data.table
+A \code{data.table} keyed by "locus" then "sample". 
+        Other columns are \code{tsum} as calculated by \code{\link{tsum_test}}, \code{p.value} (uncorrected),
+        \code{signif} (TRUE if significant after given correction), and 
+        \code{p.value.sd}, giving the standard deviation of the p-value estimate from the 
+        simulation.
 }
 \description{
 Return a data.table with p-values of a tsum_exstra object.
diff --git a/man/tsum_test.Rd b/man/tsum_test.Rd
index eb35619..07eb5c9 100644
--- a/man/tsum_test.Rd
+++ b/man/tsum_test.Rd
@@ -7,7 +7,7 @@
 tsum_test(strscore, trim = 0.15, trim.cc = 0, min.quant = 0.5,
   give.pvalue = TRUE, B = 999, correction = c("bf", "loci", "samples",
   "uncorrected"), alpha = 0.05, case_control = FALSE, parallel = FALSE,
-  cluster_n = NULL, cluster = NULL)
+  cluster_n = NULL, cluster = NULL, keep.sim.tsum = FALSE)
 }
 \arguments{
 \item{strscore}{An exstra_score object.}
@@ -47,6 +47,9 @@ If cluster is specified then this option makes no difference.}
 
 \item{cluster}{A cluster object from the parallel package. Use if you wish to set up 
 the cluster yourself or reuse an existing cluster.}
+
+\item{keep.sim.tsum}{For inspection of simulations. 
+If TRUE, keep all simulation Tsum statistics in output$xecs (default FALSE).}
 }
 \value{
 An exstra_tsum object with T statistics and p-values (if calculated).

From cd38895a7dcdbce416ff3f6490eaf5a93cdffe26 Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 13 Jul 2018 15:09:32 +0800
Subject: [PATCH 13/16] bug fix, changing incorrect object

Former-commit-id: a6d66af023672105337adef94db51063d8a62c76
---
 R/tsum_test.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/tsum_test.R b/R/tsum_test.R
index 059b436..7abfd9d 100644
--- a/R/tsum_test.R
+++ b/R/tsum_test.R
@@ -181,7 +181,7 @@ tsum_test <- function(strscore,
   }
   if(! keep.sim.tsum) {
     for(i in seq_along(tsum$xecs)) {
-      tsum$xecs[[i]]$sim_T <- NULL
+      outtsum$xecs[[i]]$sim_T <- NULL
     }
   }
   outtsum

From ca8b8bf4a463175cb9eaf4b77b72c651e2273fb7 Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 13 Jul 2018 15:26:28 +0800
Subject: [PATCH 14/16] bug fix in tsum_test(), incorrect object name

Former-commit-id: 7d39a924369dd855fedeafb26d1b213f0b7f4964
---
 R/tsum_test.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/tsum_test.R b/R/tsum_test.R
index 7abfd9d..2c59d07 100644
--- a/R/tsum_test.R
+++ b/R/tsum_test.R
@@ -180,7 +180,7 @@ tsum_test <- function(strscore,
       ]
   }
   if(! keep.sim.tsum) {
-    for(i in seq_along(tsum$xecs)) {
+    for(i in seq_along(outtsum$xecs)) {
       outtsum$xecs[[i]]$sim_T <- NULL
     }
   }

From fb5cca318fbd8f7df014e22d44609df555a5fba0 Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 13 Jul 2018 15:53:48 +0800
Subject: [PATCH 15/16] Updated version number for release

Former-commit-id: b59c67bf6cdec81149b2b37a5cab0ad0c6791e25
---
 DESCRIPTION | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a14f226..6fa6611 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: exSTRa
 Type: Package
 Title: Expanded STR algorithm: detecting expansions in Illumina sequencing data
-Version: 0.85.1
-Date: 2018-07-1x
+Version: 0.86
+Date: 2018-07-13
 Author: Rick Tankard
 Maintainer: Rick Tankard <tankard@wehi.edu.au>
 Description: Detecting expansions with paired-end Illumina sequencing data.

From 7448c22642374d1cf18914854a484e4946c88c34 Mon Sep 17 00:00:00 2001
From: Rick Tankard <tankard@wehi.edu.au>
Date: Fri, 13 Jul 2018 16:13:36 +0800
Subject: [PATCH 16/16] Had to add stop message to p-value calculation (not yet
 implemented)

Former-commit-id: ff2e90fe8dc7bc697757545634e8d50ea552f3c7
---
 R/tsum_test.R | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/R/tsum_test.R b/R/tsum_test.R
index 2c59d07..ac2c669 100644
--- a/R/tsum_test.R
+++ b/R/tsum_test.R
@@ -88,6 +88,11 @@ tsum_test <- function(strscore,
     is.null(cluster) || inherits(cluster, "cluster"))
   assert("cluster_n should be at least 1 and a whole-number.", is.null(cluster_n) || cluster_n >= 1)
   
+  # temp code until this is developed
+  if(case_control && give.pvalue) {
+    stop("tsum_test() cannot yet use cases and controls to generate p-values from simulation.") 
+  }
+  
   if(parallel) {
     n_cores <- detectCores(all.tests = FALSE, logical = TRUE)
     if(is.null(cluster_n)) {