Merge pull request #35 from bahlolab/devel_tankard

Improvements in tsum_test() and p_values(). Former-commit-id: 3ffd432
bahlolab · Jul 13, 2018 · 47b45cb · 47b45cb
2 parents 7181615 + 7448c22
commit 47b45cb
Show file tree

Hide file tree

Showing 9 changed files with 152 additions and 46 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: exSTRa
 Type: Package
 Title: Expanded STR algorithm: detecting expansions in Illumina sequencing data
-Version: 0.85
-Date: 2018-06-06
+Version: 0.86
+Date: 2018-07-13
 Author: Rick Tankard
 Maintainer: Rick Tankard <[email protected]>
 Description: Detecting expansions with paired-end Illumina sequencing data.

diff --git a/R/CLASS_exstra_score.R b/R/CLASS_exstra_score.R
@@ -39,7 +39,7 @@ strs_read_ <- function(file, database, groups.regex = NULL, groups.samples = NUL
     }
   }
   if(!is.null(groups.samples)) {
-    # using regex for groups
+    # Specify the group of samples directly via groups.samples
     assert("groups.samples must be a list if used, with vectors with names of at least one of 'case', 'control' or 'null'.", 
       is.list(groups.samples), 
       length(groups.samples) > 0,
@@ -55,7 +55,7 @@ strs_read_ <- function(file, database, groups.regex = NULL, groups.samples = NUL
         groups_all[sample == counts$sample] <- "case"
       }      
     } else {
-      groups_all <- factor(rep(NA, dim(counts)[1]), levels = names(groups.regex))
+      groups_all <- factor(rep(NA, dim(counts)[1]), levels = names(groups.samples))
       for(group.name in names(groups.samples)) {
         for(sample in groups.samples[[group.name]]) {
           groups_all[sample == counts$sample] <- group.name
@@ -341,4 +341,4 @@ as.exstra_score <- function(x, copy = FALSE) {
     x <- copy.exstra_score(x)
   }
   structure(list(data = x$data, db = x$db, input_type = x$input_type, samples = x$samples), class = c("exstra_score", "exstra_db"))
-}
+}
diff --git a/R/p_values.R b/R/p_values.R
@@ -5,13 +5,18 @@
 #' @param tsum An exstra_tsum object. 
 #' @param correction Correction method to use. Use "bf" or TRUE for Bonferroni correction, and 
 #'                   "uncorrected" or FALSE for no correction. ("bonferroni" is also acceptable).
-#'                   "locus" is Bonferroni correction by locus.
+#'                   "samples" is Bonferroni correction by the number of tests (samples) at each locus.
+#'                   "loci" is Bonferroni correction by the number of loci.
 #'                   
 #' @param alpha Significance level alpha.
 #' @param only.sig If TRUE, only return significant results.
 #' @param modify If TRUE, will modify the tsum$stats table. Effectively ignored if only.sig == TRUE.
 #' @param p.matrix Matrix of p-values for internal use. Should only be used without tsum. 
-#' @return A data.table
+#' @return A \code{data.table} keyed by "locus" then "sample". 
+#'         Other columns are \code{tsum} as calculated by \code{\link{tsum_test}}, \code{p.value} (uncorrected),
+#'         \code{signif} (TRUE if significant after given correction), and 
+#'         \code{p.value.sd}, giving the standard deviation of the p-value estimate from the 
+#'         simulation.
 #' 
 #' @import magrittr
 #' @import testit
@@ -60,10 +65,14 @@ p_values <- function(
     out.table[, signif := p.value <= alpha / n_tests ]
   } else if ((is.logical(correction[1]) && ! correction[1]) || correction[1] == "uncorrected") {
     out.table[, signif := p.value <= alpha ]
-  } else if (correction[1] == "locus") {
+  } else if (correction[1] == "samples") {
     out.table[!is.na(p.value), N := .N, by = locus]
     out.table[, signif := p.value <= alpha / N ]
     out.table[, N := NULL]
+  } else if (correction[1] == "loci") {
+    out.table[!is.na(p.value), N := .N, by = sample]
+    out.table[, signif := p.value <= alpha / N ]
+    out.table[, N := NULL]
   } else {
     stop("Unknown correction method ", correction[1])
   }

diff --git a/R/private_functions.R b/R/private_functions.R
@@ -558,11 +558,15 @@ quant_statistic <- function(qmmat, sample = 1, quant = 0.5, trim = 0.15,
   cumsum(rev(ts))[qs] / qs # we divide by the number of t statistics being summed
 }
 
-quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, ...) {
+quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, 
+  case_samples = NULL,
+  ...) {
   # get the quantile statistic for multiple samples
   # qmmat: a quantile matrix from the make_quantiles_matrix() function
   # sample: samples to get the statistic of. If NULL, give all samples in qmmat
   # qs: keeps the top number of quantiles, you probably do not want to use this
+  # case_samples: if not NULL, only calculate for these samples in case-control setting.
+  #               Other cases are excluded in each calculation.
   # ... further arguments to quant_statistic(), most interesting is:
   #     quant keeps quantiles above its value only when qs is not specified. The default 
   #         keeps all values above the median (quant = 0.5)
@@ -575,16 +579,35 @@ quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, ...) {
   assert("sample is not a character, numeric or null", is.null(sample) || is.character(sample) || is.numeric(sample))
   assert("qs is not numeric", is.null(qs) || is.numeric(qs))
   assert("qs is not single", is.null(qs) || length(qs) == 1)
-  if(is.null(sample)) {
-    sample <- seq_len(dim(qmmat)[1])
-  }
-  t_out <- rep(NA, length(sample))
+
   ti <- 0
-  for(samp in sample) {
-    ti <- ti + 1
-    t_out[ti] <- quant_statistic(qmmat, sample = samp, qs = qs, ...)
+  if(!is.null(case_samples)) {
+    if(is.null(sample)) {
+      sample <- case_samples
+    }
+    assert("All case samples should be in qmmat", all(case_samples %in% rownames(qmmat)))
+    t_out <- rep(NA, length(sample) - length(case_samples))
+    control_samples <- rownames(qmmat)[! rownames(qmmat) %in% case_samples]
+    for(samp in sample) {
+      ti <- ti + 1
+      # Get only the correct qmmat columns
+      qmmat0 <- qmmat[c(samp, control_samples),]
+      t_out[ti] <- quant_statistic(qmmat0, sample = samp, qs = qs, 
+        subject_in_background = FALSE, ...) # TODO: maybe trim = 0?
+    }
+    names(t_out) <- sample
+  } else {
+    # No samples marked explicitly as cases
+    if(is.null(sample)) {
+      sample <- seq_len(dim(qmmat)[1])
+    }
+    t_out <- rep(NA, length(sample))
+    for(samp in sample) {
+      ti <- ti + 1
+      t_out[ti] <- quant_statistic(qmmat, sample = samp, qs = qs, ...)
+    }
+    names(t_out) <- rownames(qmmat) # may be a bug if only some samples are required
   }
-  names(t_out) <- rownames(qmmat)
   t_out
 }
 
@@ -643,6 +666,7 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
   parallel = FALSE, # perform in parallel
   cluster = NULL, # cluster for parallel. If NULL, then make one
   cluster_n = 4, # cluster size if cl is NULL
+  T_stat = NULL, # T_stat named vector precomputed
   ...) {
   # Derive a simulated ECDF, and return an "ecdf" class object
   # qmmat: a quantile matrix from the make_quantiles_matrix() function
@@ -729,6 +753,10 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
     simu <- simulate_quantile_matrix()
     do.call(quant_statistic, c(list(simu), triple_dots))
   }
+  simulate_quant_statistic_sampp <- function() {
+    simu <- simulate_quantile_matrix()
+    do.call(quant_statistic_sampp, c(list(simu), triple_dots))
+  }
 
   triple_dots <- c(list(...), list(subject_in_background = subject_in_background, 
     use_truncated_sd = use_truncated_sd_in_quant_statistic, 
@@ -761,6 +789,7 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
         "sort_sim_qm", 
         "N", "M", "mu_vec", "se_vec", 
         "simulate_quant_statistic",
+        "simulate_quant_statistic_sampp",
         "simulate_quantile_matrix", 
         "quant_statistic",
         "trim_vector",
@@ -770,34 +799,41 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
       envir = environment()
     )
     # Run replicate
-    sim_T <- parReplicate(cluster, B, simulate_quant_statistic())
+    sim_T <- as.vector(
+            parReplicate(cluster, B, simulate_quant_statistic_sampp())
+    )
     if(cluster_stop) {
       stopCluster(cluster)
     }
   } else {
     # not performing in parallel
-    sim_T <- rep(0, B) # Don't let p-value get to zero
+    # TODO here:
+    sim_T <- rep(0, B*N) # Don't let p-value get to zero
     for(i in seq_len(B)) {
-      sim_T[i] <- simulate_quant_statistic()
+      sim_T[((i-1)*N + 1) : (i*N) ] <- simulate_quant_statistic_sampp()
+      # TODO: insert stopping-code here
     }
   }
 
 
-
-  comp_p_function <- ecdf(sim_T)
+  # Following is not relevant to output
+  # comp_p_function <- ecdf(sim_T)
 
   # p-values
-  T_stat <- quant_statistic_sampp(qmmat, trim = trim, subject_in_background = subject_in_background, use_truncated = use_truncated_sd_in_quant_statistic) # TODO add ...
+  if(is.null(T_stat)) {
+    T_stat <- quant_statistic_sampp(qmmat, trim = trim, subject_in_background = subject_in_background, use_truncated = use_truncated_sd_in_quant_statistic) # TODO add ...
+  }
   # p <- 1 - comp_p_function(T_stat) + 1 / (B + 1)
   p <- rep(1.1, N)
   # calculate p-values
   for(i in seq_len(N)) {
-    p[i] = c(sum(sim_T > T_stat[i]) + 1) / (B + 1)
+    p[i] = (sum(sim_T > T_stat[i]) + 1) / (length(sim_T) + 1)
   }
   names(p) <- names(T_stat)
 
   # generate ECDF and output
-  list(ecdf = comp_p_function, p = p, sim_T = sim_T, T_stat = T_stat)
+  # list(ecdf = comp_p_function, p = p, sim_T = sim_T, T_stat = T_stat)
+  list(p = p, sim_T = sim_T, T_stat = T_stat)
 }
 
 
@@ -989,7 +1025,8 @@ sd_of_trimmed <- function(...) {
 # This runs the simulation with the given paramets
 # Use ... to pass options to simulate_ecdf_quant_statistic()
 # This also passes remaining options onto quant_statistic() and quant_statistic_sampp()
-simulation_run <- function(data, B = 99, trim = 0.15, ...) {
+simulation_run <- function(data, B = 99, trim = 0.15,
+    T_stats = NULL, ...) {
   N <- data$samples[, .N]
   L <- data$db[, .N]
   p.matrix <- matrix(nrow = N, ncol = L)
@@ -1001,7 +1038,11 @@ simulation_run <- function(data, B = 99, trim = 0.15, ...) {
     message("Simulating distribution for ", loc)
     qm_loop <- make_quantiles_matrix(data, loc, read_count_quant = 1, 
       method = "quantile7", min.n = 3)
-    xec <- simulate_ecdf_quant_statistic(qm_loop, B, ...)
+    T_stat_loc <- T_stats[locus == loc, ]
+    T_stat <- T_stat_loc[, tsum]
+    names(T_stat) <- T_stat_loc[, sample]
+    xec <- simulate_ecdf_quant_statistic(qm_loop, B, 
+      T_stat = T_stat, ...)
     p.matrix[names(xec$p), loc] <- xec$p
     qmmats[[loc]] <- qm_loop
     xecs[[loc]] <- xec

diff --git a/R/tsum_test.R b/R/tsum_test.R
@@ -8,6 +8,7 @@
 #' @param trim Trim this proportion of data points at each quantile 
 #'             level (rounded up). Must be at least 0 and less than 0.5, but values close
 #'             to 0.5 may remove all samples and hence result in an error.  
+#' @param trim.cc Trim value used in case-control analysis. Default of 0.
 #' @param min.quant Only quantiles above this value are used in constructing the statistic.  
 #' @param give.pvalue Whether to calculate the p-value. As this can be slow it can be
 #'                    useful to turn off if only the t sum statistics are required. 
@@ -16,6 +17,8 @@
 #'          decimal fractions. 
 #' @param correction Correction method of p_value() function.
 #' @param alpha Signficance level of p_value() function.
+#' @param case_control If TRUE, only calculate for samples designated cases. Otherwise
+#'                 all samples are used to calculate the background distribution.
 #' @param parallel Use the parallel package when simulating the distribution, creating the
 #'                 required cluster. 
 #'                 If cluster is specified then this option makes no difference. 
@@ -27,6 +30,9 @@
 #'                 If cluster is specified then this option makes no difference.
 #' @param cluster  A cluster object from the parallel package. Use if you wish to set up 
 #'                 the cluster yourself or reuse an existing cluster. 
+#' @param keep.sim.tsum For inspection of simulations. 
+#'                      If TRUE, keep all simulation Tsum statistics in output$xecs (default FALSE).
+#'                      
 #' @return An exstra_tsum object with T statistics and p-values (if calculated).
 #' 
 #' @seealso \code{\link{tsum_p_value_summary}}
@@ -56,15 +62,18 @@
 #' @export
 tsum_test <- function(strscore, 
   trim = 0.15,
+  trim.cc = 0,
   min.quant = 0.5,
   give.pvalue = TRUE, 
-  B = 9999, 
-  correction = c("bf", "locus", "uncorrected"),
+  B = 999, 
+  correction = c("bf", "loci", "samples", "uncorrected"),
   alpha = 0.05,
+  case_control = FALSE, 
   parallel = FALSE, # TRUE for cluster
   cluster_n = NULL, # Cluster size if cluster == NULL. When NULL, #threads - 1 (but always at least 1)
-  cluster = NULL # As created by the parallel package. If cluster == NULL and parallel == TRUE, then a
+  cluster = NULL, # As created by the parallel package. If cluster == NULL and parallel == TRUE, then a
                   # PSOCK cluster is automatically created with the parallel package.
+  keep.sim.tsum = FALSE
   ) 
 {
   # Check inputs
@@ -79,6 +88,11 @@ tsum_test <- function(strscore,
     is.null(cluster) || inherits(cluster, "cluster"))
   assert("cluster_n should be at least 1 and a whole-number.", is.null(cluster_n) || cluster_n >= 1)
 
+  # temp code until this is developed
+  if(case_control && give.pvalue) {
+    stop("tsum_test() cannot yet use cases and controls to generate p-values from simulation.") 
+  }
+
   if(parallel) {
     n_cores <- detectCores(all.tests = FALSE, logical = TRUE)
     if(is.null(cluster_n)) {
@@ -105,7 +119,14 @@ tsum_test <- function(strscore,
     # message("Generating T sum statistics for ", loc)
     qm <- make_quantiles_matrix(strscore, loc = loc, sample = NULL, read_count_quant = 1, 
       method = "quantile7", min.n = 3)
-    T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim) # quant at default of 0.5
+    if(case_control) {
+      # Only calculating for case samples
+      T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim.cc,
+        case_samples = strscore$samples[group == "case", sample]) 
+    } else {
+      # Using all samples as the background:
+      T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim) # quant at default of 0.5
+    }
     T_stats_list[[loc]] <- data.table(sample = names(T_stats_loc), tsum = T_stats_loc)
   }
   T_stats <- rbindlist(T_stats_list, idcol = "locus")
@@ -132,7 +153,8 @@ tsum_test <- function(strscore,
         parallel = parallel, # TRUE for cluster
         cluster = cluster,
         cluster_n = cluster_n,
-        trim = trim
+        trim = trim,
+        T_stats = T_stats
       ),
       error = function(x) { 
           # list(cohort = this.cohort, p.matrix = c(0, 1), error = x)
@@ -151,11 +173,23 @@ tsum_test <- function(strscore,
     pvals <- NULL
   }
   # Prepare output
-  exstra_tsum_new_(strscore, tsum = T_stats, p.values = pvals, 
+  outtsum <- exstra_tsum_new_(strscore, tsum = T_stats, p.values = pvals, 
     qmats = sim.results$qmmats, xecs = sim.results$xecs,
     correction = correction,
     alpha = alpha, 
     args = list(trim = trim, min.quant = min.quant, B = B))
+  if(give.pvalue) {
+    Nsim <- B * strscore$samples[, .N]
+    outtsum$stats[, p.value.sd := 
+        sqrt(p.value * ((Nsim + 2)/(Nsim + 1) - p.value) / Nsim)
+      ]
+  }
+  if(! keep.sim.tsum) {
+    for(i in seq_along(outtsum$xecs)) {
+      outtsum$xecs[[i]]$sim_T <- NULL
+    }
+  }
+  outtsum
 }
 
 # test code:

diff --git a/examples/exSTRa_score_analysis.R b/examples/exSTRa_score_analysis.R
@@ -60,16 +60,19 @@ plot_cols
 
 par(mfrow = c(2, 2))
 # Bonferroni correction is too severe here, so we use Bonferroni correction only on each 
-# locus.
-plot(tsum, sample_col = plot_cols, correction = "locus")
+# locus for the number of samples.
+plot(tsum, sample_col = plot_cols, correction = "samples")
+
+# Or Bonferroni correction for the number of loci tested:
+plot(tsum, sample_col = plot_cols, correction = "loci")
 
 # Give a table of each sample and locus with the p-value, and if it is significant:
-(ps <- p_values(tsum, correction = "locus"))
+(ps <- p_values(tsum, correction = "samples"))
 
 # this may be acted on directly: 
 ps[identity(signif)]
 # or with the only.signif option:
-p_values(tsum, only.signif = TRUE, correction = "locus")
+p_values(tsum, only.signif = TRUE, correction = "samples")
 
 # Give the best hit(s) for each sample:
 # TODO: what is best for display may not be the best for internal representation. 

diff --git a/man/p_values.Rd b/man/p_values.Rd