From e4a829d5c27e7dd340aee74bdc2b511597ba42e5 Mon Sep 17 00:00:00 2001 From: a113n Date: Mon, 11 Jun 2018 00:55:46 +0100 Subject: [PATCH 01/16] Update CLASS_exstra_score.R Fix the bug of reading groups.samples Former-commit-id: 7837a90f28523001fe14a9adf01f589e08ab81e3 --- R/CLASS_exstra_score.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/CLASS_exstra_score.R b/R/CLASS_exstra_score.R index bf9b975..2fc4f1e 100644 --- a/R/CLASS_exstra_score.R +++ b/R/CLASS_exstra_score.R @@ -39,7 +39,7 @@ strs_read_ <- function(file, database, groups.regex = NULL, groups.samples = NUL } } if(!is.null(groups.samples)) { - # using regex for groups + # Specify the group of samples directly via groups.samples assert("groups.samples must be a list if used, with vectors with names of at least one of 'case', 'control' or 'null'.", is.list(groups.samples), length(groups.samples) > 0, @@ -55,7 +55,7 @@ strs_read_ <- function(file, database, groups.regex = NULL, groups.samples = NUL groups_all[sample == counts$sample] <- "case" } } else { - groups_all <- factor(rep(NA, dim(counts)[1]), levels = names(groups.regex)) + groups_all <- factor(rep(NA, dim(counts)[1]), levels = names(groups.samples)) for(group.name in names(groups.samples)) { for(sample in groups.samples[[group.name]]) { groups_all[sample == counts$sample] <- group.name @@ -341,4 +341,4 @@ as.exstra_score <- function(x, copy = FALSE) { x <- copy.exstra_score(x) } structure(list(data = x$data, db = x$db, input_type = x$input_type, samples = x$samples), class = c("exstra_score", "exstra_db")) -} \ No newline at end of file +} From 3c783dda4d726fdb8bbde64d8d3954f3cae21d2b Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 22 Jun 2018 16:01:05 +0800 Subject: [PATCH 02/16] private function quant_statistic_sampp() now allows specific case samples to be given Former-commit-id: 13fcc9ea8a20e9b332c4b57825f868ca671325bb --- R/private_functions.R | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/R/private_functions.R b/R/private_functions.R index d275a42..8b5fc2a 100644 --- a/R/private_functions.R +++ b/R/private_functions.R @@ -558,11 +558,15 @@ quant_statistic <- function(qmmat, sample = 1, quant = 0.5, trim = 0.15, cumsum(rev(ts))[qs] / qs # we divide by the number of t statistics being summed } -quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, ...) { +quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, + case_samples = NULL, + ...) { # get the quantile statistic for multiple samples # qmmat: a quantile matrix from the make_quantiles_matrix() function # sample: samples to get the statistic of. If NULL, give all samples in qmmat # qs: keeps the top number of quantiles, you probably do not want to use this + # case_samples: if not NULL, only calculate for these samples in case-control setting. + # Other cases are excluded in each calculation. # ... further arguments to quant_statistic(), most interesting is: # quant keeps quantiles above its value only when qs is not specified. The default # keeps all values above the median (quant = 0.5) @@ -575,16 +579,33 @@ quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, ...) { assert("sample is not a character, numeric or null", is.null(sample) || is.character(sample) || is.numeric(sample)) assert("qs is not numeric", is.null(qs) || is.numeric(qs)) assert("qs is not single", is.null(qs) || length(qs) == 1) - if(is.null(sample)) { - sample <- seq_len(dim(qmmat)[1]) - } - t_out <- rep(NA, length(sample)) + ti <- 0 - for(samp in sample) { - ti <- ti + 1 - t_out[ti] <- quant_statistic(qmmat, sample = samp, qs = qs, ...) + if(!is.null(case_samples)) { + if(is.null(sample)) { + sample <- case_samples + } + assert("All case samples should be in qmmat", all(case_samples %in% rownames(qmmat))) + t_out <- rep(NA, length(sample) - length(case_samples)) + for(samp in sample) { + ti <- ti + 1 + #TODO: get only the correct qmmat columns + qmmat0 <- qmmat + t_out[ti] <- quant_statistic(qmmat0, sample = samp, qs = qs, + subject_in_background = FALSE, ...) + } + } else { + # No samples marked explicitly as cases + if(is.null(sample)) { + sample <- seq_len(dim(qmmat)[1]) + } + t_out <- rep(NA, length(sample)) + for(samp in sample) { + ti <- ti + 1 + t_out[ti] <- quant_statistic(qmmat, sample = samp, qs = qs, ...) + } } - names(t_out) <- rownames(qmmat) + names(t_out) <- sample t_out } From 152afb12361454c330a9f610512013c2ce333855 Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 22 Jun 2018 16:27:30 +0800 Subject: [PATCH 03/16] had to fix a bug I made with the old usage Former-commit-id: 1adaf093ee4eee10b71dac1873e76e09be9d4758 --- R/private_functions.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/private_functions.R b/R/private_functions.R index 8b5fc2a..220c8f4 100644 --- a/R/private_functions.R +++ b/R/private_functions.R @@ -592,8 +592,9 @@ quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, #TODO: get only the correct qmmat columns qmmat0 <- qmmat t_out[ti] <- quant_statistic(qmmat0, sample = samp, qs = qs, - subject_in_background = FALSE, ...) + subject_in_background = FALSE, ...) # TODO: maybe trim = 0? } + names(t_out) <- sample } else { # No samples marked explicitly as cases if(is.null(sample)) { @@ -604,8 +605,8 @@ quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, ti <- ti + 1 t_out[ti] <- quant_statistic(qmmat, sample = samp, qs = qs, ...) } + names(t_out) <- rownames(qmmat) # may be a bug if only some samples are required } - names(t_out) <- sample t_out } From 054b6d081a749d7a8275b527426f31db0abff2fe Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 22 Jun 2018 17:07:41 +0800 Subject: [PATCH 04/16] case_control tsum_test now gives no trim. Excludes other samples properly now. Former-commit-id: fc79e96594a97b4c1526b2e23fb4044f5eeb18f6 --- R/private_functions.R | 5 +++-- R/tsum_test.R | 14 +++++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/R/private_functions.R b/R/private_functions.R index 220c8f4..09908da 100644 --- a/R/private_functions.R +++ b/R/private_functions.R @@ -587,10 +587,11 @@ quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, } assert("All case samples should be in qmmat", all(case_samples %in% rownames(qmmat))) t_out <- rep(NA, length(sample) - length(case_samples)) + control_samples <- rownames(qmmat)[! rownames(qmmat) %in% case_samples] for(samp in sample) { ti <- ti + 1 - #TODO: get only the correct qmmat columns - qmmat0 <- qmmat + # Get only the correct qmmat columns + qmmat0 <- qmmat[c(samp, control_samples),] t_out[ti] <- quant_statistic(qmmat0, sample = samp, qs = qs, subject_in_background = FALSE, ...) # TODO: maybe trim = 0? } diff --git a/R/tsum_test.R b/R/tsum_test.R index 6930800..ea77d4f 100644 --- a/R/tsum_test.R +++ b/R/tsum_test.R @@ -8,6 +8,7 @@ #' @param trim Trim this proportion of data points at each quantile #' level (rounded up). Must be at least 0 and less than 0.5, but values close #' to 0.5 may remove all samples and hence result in an error. +#' @param trim.cc Trim value used in case-control analysis. Default of 0. #' @param min.quant Only quantiles above this value are used in constructing the statistic. #' @param give.pvalue Whether to calculate the p-value. As this can be slow it can be #' useful to turn off if only the t sum statistics are required. @@ -16,6 +17,8 @@ #' decimal fractions. #' @param correction Correction method of p_value() function. #' @param alpha Signficance level of p_value() function. +#' @param case_control If TRUE, only calculate for samples designated cases. Otherwise +#' all samples are used to calculate the background distribution. #' @param parallel Use the parallel package when simulating the distribution, creating the #' required cluster. #' If cluster is specified then this option makes no difference. @@ -56,11 +59,13 @@ #' @export tsum_test <- function(strscore, trim = 0.15, + trim.cc = 0, min.quant = 0.5, give.pvalue = TRUE, B = 9999, correction = c("bf", "locus", "uncorrected"), alpha = 0.05, + case_control = FALSE, parallel = FALSE, # TRUE for cluster cluster_n = NULL, # Cluster size if cluster == NULL. When NULL, #threads - 1 (but always at least 1) cluster = NULL # As created by the parallel package. If cluster == NULL and parallel == TRUE, then a @@ -105,7 +110,14 @@ tsum_test <- function(strscore, # message("Generating T sum statistics for ", loc) qm <- make_quantiles_matrix(strscore, loc = loc, sample = NULL, read_count_quant = 1, method = "quantile7", min.n = 3) - T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim) # quant at default of 0.5 + if(case_control) { + # Only calculating for case samples + T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim.cc, + case_samples = strscore$samples[group == "case", sample]) + } else { + # Using all samples as the background: + T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim) # quant at default of 0.5 + } T_stats_list[[loc]] <- data.table(sample = names(T_stats_loc), tsum = T_stats_loc) } T_stats <- rbindlist(T_stats_list, idcol = "locus") From 15dbf6792cdce14e7c37aa900c9a2cf3c3fbfab5 Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 22 Jun 2018 17:08:58 +0800 Subject: [PATCH 05/16] update docs for tsum_test() Former-commit-id: ac7f77c56e052412e1acc545b24f43057f602d88 --- man/tsum_test.Rd | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/man/tsum_test.Rd b/man/tsum_test.Rd index 325ffd3..cbb2c7a 100644 --- a/man/tsum_test.Rd +++ b/man/tsum_test.Rd @@ -4,9 +4,10 @@ \alias{tsum_test} \title{Generate T sum statistics and p-values from simulation.} \usage{ -tsum_test(strscore, trim = 0.15, min.quant = 0.5, give.pvalue = TRUE, - B = 9999, correction = c("bf", "locus", "uncorrected"), alpha = 0.05, - parallel = FALSE, cluster_n = NULL, cluster = NULL) +tsum_test(strscore, trim = 0.15, trim.cc = 0, min.quant = 0.5, + give.pvalue = TRUE, B = 9999, correction = c("bf", "locus", + "uncorrected"), alpha = 0.05, case_control = FALSE, parallel = FALSE, + cluster_n = NULL, cluster = NULL) } \arguments{ \item{strscore}{An exstra_score object.} @@ -15,6 +16,8 @@ tsum_test(strscore, trim = 0.15, min.quant = 0.5, give.pvalue = TRUE, level (rounded up). Must be at least 0 and less than 0.5, but values close to 0.5 may remove all samples and hence result in an error.} +\item{trim.cc}{Trim value used in case-control analysis. Default of 0.} + \item{min.quant}{Only quantiles above this value are used in constructing the statistic.} \item{give.pvalue}{Whether to calculate the p-value. As this can be slow it can be @@ -28,6 +31,9 @@ decimal fractions.} \item{alpha}{Signficance level of p_value() function.} +\item{case_control}{If TRUE, only calculate for samples designated cases. Otherwise +all samples are used to calculate the background distribution.} + \item{parallel}{Use the parallel package when simulating the distribution, creating the required cluster. If cluster is specified then this option makes no difference.} From 6201628c25b2d9162bc711c392da3fbb816614a9 Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 29 Jun 2018 15:58:26 +0800 Subject: [PATCH 06/16] First go at speeding up simulation results by making more use of each simulation Former-commit-id: 1b65e973e3cb9ef3c287e60ddb7e8f4e063c03da --- R/private_functions.R | 26 +++++++++++++++++++------- R/tsum_test.R | 3 ++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/R/private_functions.R b/R/private_functions.R index 09908da..ada2f80 100644 --- a/R/private_functions.R +++ b/R/private_functions.R @@ -666,6 +666,7 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15, parallel = FALSE, # perform in parallel cluster = NULL, # cluster for parallel. If NULL, then make one cluster_n = 4, # cluster size if cl is NULL + T_stat = NULL, # T_stat named vector precomputed ...) { # Derive a simulated ECDF, and return an "ecdf" class object # qmmat: a quantile matrix from the make_quantiles_matrix() function @@ -750,7 +751,9 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15, } simulate_quant_statistic <- function() { simu <- simulate_quantile_matrix() - do.call(quant_statistic, c(list(simu), triple_dots)) + # Devel: this next line change completely changes the output here, for efficiency + # do.call(quant_statistic, c(list(simu), triple_dots)) + do.call(quant_statistic_sampp, c(list(simu), triple_dots)) } triple_dots <- c(list(...), list(subject_in_background = subject_in_background, @@ -799,9 +802,11 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15, } } else { # not performing in parallel - sim_T <- rep(0, B) # Don't let p-value get to zero + # TODO here: + sim_T <- rep(0, B*N) # Don't let p-value get to zero for(i in seq_len(B)) { - sim_T[i] <- simulate_quant_statistic() + sim_T[((i-1)*N + 1) : (i*N) ] <- simulate_quant_statistic() + # TODO: insert stopping-code here } } @@ -810,12 +815,14 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15, comp_p_function <- ecdf(sim_T) # p-values - T_stat <- quant_statistic_sampp(qmmat, trim = trim, subject_in_background = subject_in_background, use_truncated = use_truncated_sd_in_quant_statistic) # TODO add ... + if(is.null(T_stat)) { + T_stat <- quant_statistic_sampp(qmmat, trim = trim, subject_in_background = subject_in_background, use_truncated = use_truncated_sd_in_quant_statistic) # TODO add ... + } # p <- 1 - comp_p_function(T_stat) + 1 / (B + 1) p <- rep(1.1, N) # calculate p-values for(i in seq_len(N)) { - p[i] = c(sum(sim_T > T_stat[i]) + 1) / (B + 1) + p[i] = c(sum(sim_T > T_stat[i]) + 1) / (length(sim_T) + 1) } names(p) <- names(T_stat) @@ -1012,7 +1019,8 @@ sd_of_trimmed <- function(...) { # This runs the simulation with the given paramets # Use ... to pass options to simulate_ecdf_quant_statistic() # This also passes remaining options onto quant_statistic() and quant_statistic_sampp() -simulation_run <- function(data, B = 99, trim = 0.15, ...) { +simulation_run <- function(data, B = 99, trim = 0.15, + T_stats = NULL, ...) { N <- data$samples[, .N] L <- data$db[, .N] p.matrix <- matrix(nrow = N, ncol = L) @@ -1024,7 +1032,11 @@ simulation_run <- function(data, B = 99, trim = 0.15, ...) { message("Simulating distribution for ", loc) qm_loop <- make_quantiles_matrix(data, loc, read_count_quant = 1, method = "quantile7", min.n = 3) - xec <- simulate_ecdf_quant_statistic(qm_loop, B, ...) + T_stat_loc <- T_stats[locus == loc, ] + T_stat <- T_stat_loc[, tsum] + names(T_stat) <- T_stat_loc[, sample] + xec <- simulate_ecdf_quant_statistic(qm_loop, B, + T_stat = T_stat, ...) p.matrix[names(xec$p), loc] <- xec$p qmmats[[loc]] <- qm_loop xecs[[loc]] <- xec diff --git a/R/tsum_test.R b/R/tsum_test.R index ea77d4f..7b98660 100644 --- a/R/tsum_test.R +++ b/R/tsum_test.R @@ -144,7 +144,8 @@ tsum_test <- function(strscore, parallel = parallel, # TRUE for cluster cluster = cluster, cluster_n = cluster_n, - trim = trim + trim = trim, + T_stats = T_stats ), error = function(x) { # list(cohort = this.cohort, p.matrix = c(0, 1), error = x) From 1af4328c4fd182d2f714c2ae151592387d4d4cde Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 29 Jun 2018 16:40:29 +0800 Subject: [PATCH 07/16] A version where the parallel and serial time-saving give similar results. Also removed some computations that were not given in the final output of tsum_test() Former-commit-id: 88fa131333032a26144b848dcc7d8ca15d6df89e --- R/private_functions.R | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/R/private_functions.R b/R/private_functions.R index ada2f80..b888ca6 100644 --- a/R/private_functions.R +++ b/R/private_functions.R @@ -751,8 +751,10 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15, } simulate_quant_statistic <- function() { simu <- simulate_quantile_matrix() - # Devel: this next line change completely changes the output here, for efficiency - # do.call(quant_statistic, c(list(simu), triple_dots)) + do.call(quant_statistic, c(list(simu), triple_dots)) + } + simulate_quant_statistic_sampp <- function() { + simu <- simulate_quantile_matrix() do.call(quant_statistic_sampp, c(list(simu), triple_dots)) } @@ -805,14 +807,14 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15, # TODO here: sim_T <- rep(0, B*N) # Don't let p-value get to zero for(i in seq_len(B)) { - sim_T[((i-1)*N + 1) : (i*N) ] <- simulate_quant_statistic() + sim_T[((i-1)*N + 1) : (i*N) ] <- simulate_quant_statistic_sampp() # TODO: insert stopping-code here } } - - comp_p_function <- ecdf(sim_T) + # Following is not relevant to output + # comp_p_function <- ecdf(sim_T) # p-values if(is.null(T_stat)) { @@ -822,12 +824,13 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15, p <- rep(1.1, N) # calculate p-values for(i in seq_len(N)) { - p[i] = c(sum(sim_T > T_stat[i]) + 1) / (length(sim_T) + 1) + p[i] = (sum(sim_T > T_stat[i]) + 1) / (length(sim_T) + 1) } names(p) <- names(T_stat) # generate ECDF and output - list(ecdf = comp_p_function, p = p, sim_T = sim_T, T_stat = T_stat) + # list(ecdf = comp_p_function, p = p, sim_T = sim_T, T_stat = T_stat) + list(p = p, sim_T = sim_T, T_stat = T_stat) } From 95f52ae3d07bea38e554d3c469cc9fe57395f2ac Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 29 Jun 2018 17:06:20 +0800 Subject: [PATCH 08/16] multi sample version made. Little speed benefits have been made, so may discard the previous few changes. Former-commit-id: 01d9c556f5a0c36b6c70a303ee15d8bd2ffddf64 --- R/private_functions.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/private_functions.R b/R/private_functions.R index b888ca6..dc00a0d 100644 --- a/R/private_functions.R +++ b/R/private_functions.R @@ -789,6 +789,7 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15, "sort_sim_qm", "N", "M", "mu_vec", "se_vec", "simulate_quant_statistic", + "simulate_quant_statistic_sampp", "simulate_quantile_matrix", "quant_statistic", "trim_vector", @@ -798,7 +799,9 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15, envir = environment() ) # Run replicate - sim_T <- parReplicate(cluster, B, simulate_quant_statistic()) + sim_T <- as.vector( + parReplicate(cluster, B, simulate_quant_statistic_sampp()) + ) if(cluster_stop) { stopCluster(cluster) } From 6bc183b3236e50a21f206f4efa5b064374573766 Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Sat, 7 Jul 2018 18:36:33 +0800 Subject: [PATCH 09/16] tsum_test() now includes a standard error estimate of the p-value generated Former-commit-id: d30bb577c72bc07285d95abce7f182e1d799fbd8 --- R/tsum_test.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/tsum_test.R b/R/tsum_test.R index 7b98660..8277025 100644 --- a/R/tsum_test.R +++ b/R/tsum_test.R @@ -164,11 +164,15 @@ tsum_test <- function(strscore, pvals <- NULL } # Prepare output - exstra_tsum_new_(strscore, tsum = T_stats, p.values = pvals, + outtsum <- exstra_tsum_new_(strscore, tsum = T_stats, p.values = pvals, qmats = sim.results$qmmats, xecs = sim.results$xecs, correction = correction, alpha = alpha, args = list(trim = trim, min.quant = min.quant, B = B)) + if(give.pvalue) { + outtsum$stats[, p.value.sd := sqrt(p.value * (1 - p.value) / (B * strscore$samples[, .N]))] + } + outtsum } # test code: From 0a5a4e79c5a2b7bada8978a3d29b2d62d2e357b4 Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Sun, 8 Jul 2018 18:37:52 +0800 Subject: [PATCH 10/16] Correction for SE(p.value) so that it does not equal 0 when p.value = 1 Former-commit-id: e37cb104d22c77ede5317e3bbad4eeda6347a6b9 --- R/tsum_test.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/tsum_test.R b/R/tsum_test.R index 8277025..d4520bd 100644 --- a/R/tsum_test.R +++ b/R/tsum_test.R @@ -170,7 +170,10 @@ tsum_test <- function(strscore, alpha = alpha, args = list(trim = trim, min.quant = min.quant, B = B)) if(give.pvalue) { - outtsum$stats[, p.value.sd := sqrt(p.value * (1 - p.value) / (B * strscore$samples[, .N]))] + Nsim <- B * strscore$samples[, .N] + outtsum$stats[, p.value.sd := + sqrt(p.value * ((Nsim + 2)/(Nsim + 1) - p.value) / Nsim) + ] } outtsum } From 1186e123f4c030a94e9eefe8bc6752527faa0dfe Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Mon, 9 Jul 2018 17:18:31 +0800 Subject: [PATCH 11/16] Documentation improvement, and p_values() now describes correction options better and added a number of "loci" BF correction method. Former-commit-id: a298684c52751b90776dfe84cb8c00d4a005558b --- R/p_values.R | 9 +++++++-- R/tsum_test.R | 4 ++-- examples/exSTRa_score_analysis.R | 11 +++++++---- man/p_values.Rd | 3 ++- man/tsum_test.Rd | 2 +- vignettes/exSTRa.Rmd | 13 +++++++++---- 6 files changed, 28 insertions(+), 14 deletions(-) diff --git a/R/p_values.R b/R/p_values.R index a63f425..d8ce31a 100644 --- a/R/p_values.R +++ b/R/p_values.R @@ -5,7 +5,8 @@ #' @param tsum An exstra_tsum object. #' @param correction Correction method to use. Use "bf" or TRUE for Bonferroni correction, and #' "uncorrected" or FALSE for no correction. ("bonferroni" is also acceptable). -#' "locus" is Bonferroni correction by locus. +#' "samples" is Bonferroni correction by the number of tests (samples) at each locus. +#' "loci" is Bonferroni correction by the number of loci. #' #' @param alpha Significance level alpha. #' @param only.sig If TRUE, only return significant results. @@ -60,10 +61,14 @@ p_values <- function( out.table[, signif := p.value <= alpha / n_tests ] } else if ((is.logical(correction[1]) && ! correction[1]) || correction[1] == "uncorrected") { out.table[, signif := p.value <= alpha ] - } else if (correction[1] == "locus") { + } else if (correction[1] == "samples") { out.table[!is.na(p.value), N := .N, by = locus] out.table[, signif := p.value <= alpha / N ] out.table[, N := NULL] + } else if (correction[1] == "loci") { + out.table[!is.na(p.value), N := .N, by = sample] + out.table[, signif := p.value <= alpha / N ] + out.table[, N := NULL] } else { stop("Unknown correction method ", correction[1]) } diff --git a/R/tsum_test.R b/R/tsum_test.R index d4520bd..8efd467 100644 --- a/R/tsum_test.R +++ b/R/tsum_test.R @@ -62,8 +62,8 @@ tsum_test <- function(strscore, trim.cc = 0, min.quant = 0.5, give.pvalue = TRUE, - B = 9999, - correction = c("bf", "locus", "uncorrected"), + B = 999, + correction = c("bf", "loci", "samples", "uncorrected"), alpha = 0.05, case_control = FALSE, parallel = FALSE, # TRUE for cluster diff --git a/examples/exSTRa_score_analysis.R b/examples/exSTRa_score_analysis.R index ca442f9..00feacc 100644 --- a/examples/exSTRa_score_analysis.R +++ b/examples/exSTRa_score_analysis.R @@ -60,16 +60,19 @@ plot_cols par(mfrow = c(2, 2)) # Bonferroni correction is too severe here, so we use Bonferroni correction only on each -# locus. -plot(tsum, sample_col = plot_cols, correction = "locus") +# locus for the number of samples. +plot(tsum, sample_col = plot_cols, correction = "samples") + +# Or Bonferroni correction for the number of loci tested: +plot(tsum, sample_col = plot_cols, correction = "loci") # Give a table of each sample and locus with the p-value, and if it is significant: -(ps <- p_values(tsum, correction = "locus")) +(ps <- p_values(tsum, correction = "samples")) # this may be acted on directly: ps[identity(signif)] # or with the only.signif option: -p_values(tsum, only.signif = TRUE, correction = "locus") +p_values(tsum, only.signif = TRUE, correction = "samples") # Give the best hit(s) for each sample: # TODO: what is best for display may not be the best for internal representation. diff --git a/man/p_values.Rd b/man/p_values.Rd index 8ecbc7c..f2db601 100644 --- a/man/p_values.Rd +++ b/man/p_values.Rd @@ -12,7 +12,8 @@ p_values(tsum, correction = c("bf", "locus", "uncorrected"), alpha = 0.05, \item{correction}{Correction method to use. Use "bf" or TRUE for Bonferroni correction, and "uncorrected" or FALSE for no correction. ("bonferroni" is also acceptable). -"locus" is Bonferroni correction by locus.} +"samples" is Bonferroni correction by the number of tests (samples) at each locus. +"loci" is Bonferroni correction by the number of loci.} \item{alpha}{Significance level alpha.} diff --git a/man/tsum_test.Rd b/man/tsum_test.Rd index cbb2c7a..eb35619 100644 --- a/man/tsum_test.Rd +++ b/man/tsum_test.Rd @@ -5,7 +5,7 @@ \title{Generate T sum statistics and p-values from simulation.} \usage{ tsum_test(strscore, trim = 0.15, trim.cc = 0, min.quant = 0.5, - give.pvalue = TRUE, B = 9999, correction = c("bf", "locus", + give.pvalue = TRUE, B = 999, correction = c("bf", "loci", "samples", "uncorrected"), alpha = 0.05, case_control = FALSE, parallel = FALSE, cluster_n = NULL, cluster = NULL) } diff --git a/vignettes/exSTRa.Rmd b/vignettes/exSTRa.Rmd index a7113c4..c9a0645 100644 --- a/vignettes/exSTRa.Rmd +++ b/vignettes/exSTRa.Rmd @@ -211,18 +211,23 @@ with the default number of simulations (9999). This can be adjusted with the `B` parameter of `tsum_test()`, or a less stringent threshold can be used. Bonferroni correction is too severe here, so we can specify Bonferroni correction only -on each locus. +on each locus for the number of samples tested. ```{r, out.width = '82%', fig.width=12, fig.height=12} par(mfrow = c(2, 2)) -plot(tsum, sample_col = plot_cols, correction = "locus") +plot(tsum, sample_col = plot_cols, correction = "samples") ``` +Or Bonferroni correction may be applied for the number of loci tested. +```{r, out.width = '82%', fig.width=12, fig.height=12} +par(mfrow = c(2, 2)) +plot(tsum, sample_col = plot_cols, correction = "loci") +``` You may obtain a data.table of each sample and locus with the p-value, and if it is significant with the correction method applied. Here, the correction method is Bonferroni per locus. ```{r} -(ps <- p_values(tsum, correction = "locus")) +(ps <- p_values(tsum, correction = "samples")) ``` To obtain only the significant samples, you can either use data.table subsetting: @@ -232,7 +237,7 @@ ps[identity(signif)] or when retrieving the data.table from p_values(): ```{r} -p_values(tsum, only.signif = TRUE, correction = "locus") +p_values(tsum, only.signif = TRUE, correction = "samples") ``` From ca6b9390986e37efb5b239af14691dca5d306595 Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 13 Jul 2018 14:53:32 +0800 Subject: [PATCH 12/16] tsum_test() removes simulation results by default now. Updated documentation for p_values(), tsum_test(). Former-commit-id: 5ce2d8dfac1871859f3b1f328ed0864dba047f32 --- DESCRIPTION | 4 ++-- R/p_values.R | 6 +++++- R/tsum_test.R | 11 ++++++++++- man/p_values.Rd | 6 +++++- man/tsum_test.Rd | 5 ++++- 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7fbb0c7..a14f226 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: exSTRa Type: Package Title: Expanded STR algorithm: detecting expansions in Illumina sequencing data -Version: 0.85 -Date: 2018-06-06 +Version: 0.85.1 +Date: 2018-07-1x Author: Rick Tankard Maintainer: Rick Tankard Description: Detecting expansions with paired-end Illumina sequencing data. diff --git a/R/p_values.R b/R/p_values.R index d8ce31a..1807264 100644 --- a/R/p_values.R +++ b/R/p_values.R @@ -12,7 +12,11 @@ #' @param only.sig If TRUE, only return significant results. #' @param modify If TRUE, will modify the tsum$stats table. Effectively ignored if only.sig == TRUE. #' @param p.matrix Matrix of p-values for internal use. Should only be used without tsum. -#' @return A data.table +#' @return A \code{data.table} keyed by "locus" then "sample". +#' Other columns are \code{tsum} as calculated by \code{\link{tsum_test}}, \code{p.value} (uncorrected), +#' \code{signif} (TRUE if significant after given correction), and +#' \code{p.value.sd}, giving the standard deviation of the p-value estimate from the +#' simulation. #' #' @import magrittr #' @import testit diff --git a/R/tsum_test.R b/R/tsum_test.R index 8efd467..059b436 100644 --- a/R/tsum_test.R +++ b/R/tsum_test.R @@ -30,6 +30,9 @@ #' If cluster is specified then this option makes no difference. #' @param cluster A cluster object from the parallel package. Use if you wish to set up #' the cluster yourself or reuse an existing cluster. +#' @param keep.sim.tsum For inspection of simulations. +#' If TRUE, keep all simulation Tsum statistics in output$xecs (default FALSE). +#' #' @return An exstra_tsum object with T statistics and p-values (if calculated). #' #' @seealso \code{\link{tsum_p_value_summary}} @@ -68,8 +71,9 @@ tsum_test <- function(strscore, case_control = FALSE, parallel = FALSE, # TRUE for cluster cluster_n = NULL, # Cluster size if cluster == NULL. When NULL, #threads - 1 (but always at least 1) - cluster = NULL # As created by the parallel package. If cluster == NULL and parallel == TRUE, then a + cluster = NULL, # As created by the parallel package. If cluster == NULL and parallel == TRUE, then a # PSOCK cluster is automatically created with the parallel package. + keep.sim.tsum = FALSE ) { # Check inputs @@ -175,6 +179,11 @@ tsum_test <- function(strscore, sqrt(p.value * ((Nsim + 2)/(Nsim + 1) - p.value) / Nsim) ] } + if(! keep.sim.tsum) { + for(i in seq_along(tsum$xecs)) { + tsum$xecs[[i]]$sim_T <- NULL + } + } outtsum } diff --git a/man/p_values.Rd b/man/p_values.Rd index f2db601..e55347b 100644 --- a/man/p_values.Rd +++ b/man/p_values.Rd @@ -24,7 +24,11 @@ p_values(tsum, correction = c("bf", "locus", "uncorrected"), alpha = 0.05, \item{only.sig}{If TRUE, only return significant results.} } \value{ -A data.table +A \code{data.table} keyed by "locus" then "sample". + Other columns are \code{tsum} as calculated by \code{\link{tsum_test}}, \code{p.value} (uncorrected), + \code{signif} (TRUE if significant after given correction), and + \code{p.value.sd}, giving the standard deviation of the p-value estimate from the + simulation. } \description{ Return a data.table with p-values of a tsum_exstra object. diff --git a/man/tsum_test.Rd b/man/tsum_test.Rd index eb35619..07eb5c9 100644 --- a/man/tsum_test.Rd +++ b/man/tsum_test.Rd @@ -7,7 +7,7 @@ tsum_test(strscore, trim = 0.15, trim.cc = 0, min.quant = 0.5, give.pvalue = TRUE, B = 999, correction = c("bf", "loci", "samples", "uncorrected"), alpha = 0.05, case_control = FALSE, parallel = FALSE, - cluster_n = NULL, cluster = NULL) + cluster_n = NULL, cluster = NULL, keep.sim.tsum = FALSE) } \arguments{ \item{strscore}{An exstra_score object.} @@ -47,6 +47,9 @@ If cluster is specified then this option makes no difference.} \item{cluster}{A cluster object from the parallel package. Use if you wish to set up the cluster yourself or reuse an existing cluster.} + +\item{keep.sim.tsum}{For inspection of simulations. +If TRUE, keep all simulation Tsum statistics in output$xecs (default FALSE).} } \value{ An exstra_tsum object with T statistics and p-values (if calculated). From cd38895a7dcdbce416ff3f6490eaf5a93cdffe26 Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 13 Jul 2018 15:09:32 +0800 Subject: [PATCH 13/16] bug fix, changing incorrect object Former-commit-id: a6d66af023672105337adef94db51063d8a62c76 --- R/tsum_test.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/tsum_test.R b/R/tsum_test.R index 059b436..7abfd9d 100644 --- a/R/tsum_test.R +++ b/R/tsum_test.R @@ -181,7 +181,7 @@ tsum_test <- function(strscore, } if(! keep.sim.tsum) { for(i in seq_along(tsum$xecs)) { - tsum$xecs[[i]]$sim_T <- NULL + outtsum$xecs[[i]]$sim_T <- NULL } } outtsum From ca8b8bf4a463175cb9eaf4b77b72c651e2273fb7 Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 13 Jul 2018 15:26:28 +0800 Subject: [PATCH 14/16] bug fix in tsum_test(), incorrect object name Former-commit-id: 7d39a924369dd855fedeafb26d1b213f0b7f4964 --- R/tsum_test.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/tsum_test.R b/R/tsum_test.R index 7abfd9d..2c59d07 100644 --- a/R/tsum_test.R +++ b/R/tsum_test.R @@ -180,7 +180,7 @@ tsum_test <- function(strscore, ] } if(! keep.sim.tsum) { - for(i in seq_along(tsum$xecs)) { + for(i in seq_along(outtsum$xecs)) { outtsum$xecs[[i]]$sim_T <- NULL } } From fb5cca318fbd8f7df014e22d44609df555a5fba0 Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 13 Jul 2018 15:53:48 +0800 Subject: [PATCH 15/16] Updated version number for release Former-commit-id: b59c67bf6cdec81149b2b37a5cab0ad0c6791e25 --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a14f226..6fa6611 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: exSTRa Type: Package Title: Expanded STR algorithm: detecting expansions in Illumina sequencing data -Version: 0.85.1 -Date: 2018-07-1x +Version: 0.86 +Date: 2018-07-13 Author: Rick Tankard Maintainer: Rick Tankard Description: Detecting expansions with paired-end Illumina sequencing data. From 7448c22642374d1cf18914854a484e4946c88c34 Mon Sep 17 00:00:00 2001 From: Rick Tankard Date: Fri, 13 Jul 2018 16:13:36 +0800 Subject: [PATCH 16/16] Had to add stop message to p-value calculation (not yet implemented) Former-commit-id: ff2e90fe8dc7bc697757545634e8d50ea552f3c7 --- R/tsum_test.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/R/tsum_test.R b/R/tsum_test.R index 2c59d07..ac2c669 100644 --- a/R/tsum_test.R +++ b/R/tsum_test.R @@ -88,6 +88,11 @@ tsum_test <- function(strscore, is.null(cluster) || inherits(cluster, "cluster")) assert("cluster_n should be at least 1 and a whole-number.", is.null(cluster_n) || cluster_n >= 1) + # temp code until this is developed + if(case_control && give.pvalue) { + stop("tsum_test() cannot yet use cases and controls to generate p-values from simulation.") + } + if(parallel) { n_cores <- detectCores(all.tests = FALSE, logical = TRUE) if(is.null(cluster_n)) {