Skip to content

Commit

Permalink
Merge pull request #35 from bahlolab/devel_tankard
Browse files Browse the repository at this point in the history
Improvements in tsum_test() and p_values(). 

Former-commit-id: 3ffd432
  • Loading branch information
trickytank authored Jul 13, 2018
2 parents 7181615 + 7448c22 commit 47b45cb
Show file tree
Hide file tree
Showing 9 changed files with 152 additions and 46 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: exSTRa
Type: Package
Title: Expanded STR algorithm: detecting expansions in Illumina sequencing data
Version: 0.85
Date: 2018-06-06
Version: 0.86
Date: 2018-07-13
Author: Rick Tankard
Maintainer: Rick Tankard <[email protected]>
Description: Detecting expansions with paired-end Illumina sequencing data.
Expand Down
6 changes: 3 additions & 3 deletions R/CLASS_exstra_score.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ strs_read_ <- function(file, database, groups.regex = NULL, groups.samples = NUL
}
}
if(!is.null(groups.samples)) {
# using regex for groups
# Specify the group of samples directly via groups.samples
assert("groups.samples must be a list if used, with vectors with names of at least one of 'case', 'control' or 'null'.",
is.list(groups.samples),
length(groups.samples) > 0,
Expand All @@ -55,7 +55,7 @@ strs_read_ <- function(file, database, groups.regex = NULL, groups.samples = NUL
groups_all[sample == counts$sample] <- "case"
}
} else {
groups_all <- factor(rep(NA, dim(counts)[1]), levels = names(groups.regex))
groups_all <- factor(rep(NA, dim(counts)[1]), levels = names(groups.samples))
for(group.name in names(groups.samples)) {
for(sample in groups.samples[[group.name]]) {
groups_all[sample == counts$sample] <- group.name
Expand Down Expand Up @@ -341,4 +341,4 @@ as.exstra_score <- function(x, copy = FALSE) {
x <- copy.exstra_score(x)
}
structure(list(data = x$data, db = x$db, input_type = x$input_type, samples = x$samples), class = c("exstra_score", "exstra_db"))
}
}
15 changes: 12 additions & 3 deletions R/p_values.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,18 @@
#' @param tsum An exstra_tsum object.
#' @param correction Correction method to use. Use "bf" or TRUE for Bonferroni correction, and
#' "uncorrected" or FALSE for no correction. ("bonferroni" is also acceptable).
#' "locus" is Bonferroni correction by locus.
#' "samples" is Bonferroni correction by the number of tests (samples) at each locus.
#' "loci" is Bonferroni correction by the number of loci.
#'
#' @param alpha Significance level alpha.
#' @param only.sig If TRUE, only return significant results.
#' @param modify If TRUE, will modify the tsum$stats table. Effectively ignored if only.sig == TRUE.
#' @param p.matrix Matrix of p-values for internal use. Should only be used without tsum.
#' @return A data.table
#' @return A \code{data.table} keyed by "locus" then "sample".
#' Other columns are \code{tsum} as calculated by \code{\link{tsum_test}}, \code{p.value} (uncorrected),
#' \code{signif} (TRUE if significant after given correction), and
#' \code{p.value.sd}, giving the standard deviation of the p-value estimate from the
#' simulation.
#'
#' @import magrittr
#' @import testit
Expand Down Expand Up @@ -60,10 +65,14 @@ p_values <- function(
out.table[, signif := p.value <= alpha / n_tests ]
} else if ((is.logical(correction[1]) && ! correction[1]) || correction[1] == "uncorrected") {
out.table[, signif := p.value <= alpha ]
} else if (correction[1] == "locus") {
} else if (correction[1] == "samples") {
out.table[!is.na(p.value), N := .N, by = locus]
out.table[, signif := p.value <= alpha / N ]
out.table[, N := NULL]
} else if (correction[1] == "loci") {
out.table[!is.na(p.value), N := .N, by = sample]
out.table[, signif := p.value <= alpha / N ]
out.table[, N := NULL]
} else {
stop("Unknown correction method ", correction[1])
}
Expand Down
79 changes: 60 additions & 19 deletions R/private_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -558,11 +558,15 @@ quant_statistic <- function(qmmat, sample = 1, quant = 0.5, trim = 0.15,
cumsum(rev(ts))[qs] / qs # we divide by the number of t statistics being summed
}

quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, ...) {
quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL,
case_samples = NULL,
...) {
# get the quantile statistic for multiple samples
# qmmat: a quantile matrix from the make_quantiles_matrix() function
# sample: samples to get the statistic of. If NULL, give all samples in qmmat
# qs: keeps the top number of quantiles, you probably do not want to use this
# case_samples: if not NULL, only calculate for these samples in case-control setting.
# Other cases are excluded in each calculation.
# ... further arguments to quant_statistic(), most interesting is:
# quant keeps quantiles above its value only when qs is not specified. The default
# keeps all values above the median (quant = 0.5)
Expand All @@ -575,16 +579,35 @@ quant_statistic_sampp <- function(qmmat, sample = NULL, qs = NULL, ...) {
assert("sample is not a character, numeric or null", is.null(sample) || is.character(sample) || is.numeric(sample))
assert("qs is not numeric", is.null(qs) || is.numeric(qs))
assert("qs is not single", is.null(qs) || length(qs) == 1)
if(is.null(sample)) {
sample <- seq_len(dim(qmmat)[1])
}
t_out <- rep(NA, length(sample))

ti <- 0
for(samp in sample) {
ti <- ti + 1
t_out[ti] <- quant_statistic(qmmat, sample = samp, qs = qs, ...)
if(!is.null(case_samples)) {
if(is.null(sample)) {
sample <- case_samples
}
assert("All case samples should be in qmmat", all(case_samples %in% rownames(qmmat)))
t_out <- rep(NA, length(sample) - length(case_samples))
control_samples <- rownames(qmmat)[! rownames(qmmat) %in% case_samples]
for(samp in sample) {
ti <- ti + 1
# Get only the correct qmmat columns
qmmat0 <- qmmat[c(samp, control_samples),]
t_out[ti] <- quant_statistic(qmmat0, sample = samp, qs = qs,
subject_in_background = FALSE, ...) # TODO: maybe trim = 0?
}
names(t_out) <- sample
} else {
# No samples marked explicitly as cases
if(is.null(sample)) {
sample <- seq_len(dim(qmmat)[1])
}
t_out <- rep(NA, length(sample))
for(samp in sample) {
ti <- ti + 1
t_out[ti] <- quant_statistic(qmmat, sample = samp, qs = qs, ...)
}
names(t_out) <- rownames(qmmat) # may be a bug if only some samples are required
}
names(t_out) <- rownames(qmmat)
t_out
}

Expand Down Expand Up @@ -643,6 +666,7 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
parallel = FALSE, # perform in parallel
cluster = NULL, # cluster for parallel. If NULL, then make one
cluster_n = 4, # cluster size if cl is NULL
T_stat = NULL, # T_stat named vector precomputed
...) {
# Derive a simulated ECDF, and return an "ecdf" class object
# qmmat: a quantile matrix from the make_quantiles_matrix() function
Expand Down Expand Up @@ -729,6 +753,10 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
simu <- simulate_quantile_matrix()
do.call(quant_statistic, c(list(simu), triple_dots))
}
simulate_quant_statistic_sampp <- function() {
simu <- simulate_quantile_matrix()
do.call(quant_statistic_sampp, c(list(simu), triple_dots))
}

triple_dots <- c(list(...), list(subject_in_background = subject_in_background,
use_truncated_sd = use_truncated_sd_in_quant_statistic,
Expand Down Expand Up @@ -761,6 +789,7 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
"sort_sim_qm",
"N", "M", "mu_vec", "se_vec",
"simulate_quant_statistic",
"simulate_quant_statistic_sampp",
"simulate_quantile_matrix",
"quant_statistic",
"trim_vector",
Expand All @@ -770,34 +799,41 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
envir = environment()
)
# Run replicate
sim_T <- parReplicate(cluster, B, simulate_quant_statistic())
sim_T <- as.vector(
parReplicate(cluster, B, simulate_quant_statistic_sampp())
)
if(cluster_stop) {
stopCluster(cluster)
}
} else {
# not performing in parallel
sim_T <- rep(0, B) # Don't let p-value get to zero
# TODO here:
sim_T <- rep(0, B*N) # Don't let p-value get to zero
for(i in seq_len(B)) {
sim_T[i] <- simulate_quant_statistic()
sim_T[((i-1)*N + 1) : (i*N) ] <- simulate_quant_statistic_sampp()
# TODO: insert stopping-code here
}
}



comp_p_function <- ecdf(sim_T)
# Following is not relevant to output
# comp_p_function <- ecdf(sim_T)

# p-values
T_stat <- quant_statistic_sampp(qmmat, trim = trim, subject_in_background = subject_in_background, use_truncated = use_truncated_sd_in_quant_statistic) # TODO add ...
if(is.null(T_stat)) {
T_stat <- quant_statistic_sampp(qmmat, trim = trim, subject_in_background = subject_in_background, use_truncated = use_truncated_sd_in_quant_statistic) # TODO add ...
}
# p <- 1 - comp_p_function(T_stat) + 1 / (B + 1)
p <- rep(1.1, N)
# calculate p-values
for(i in seq_len(N)) {
p[i] = c(sum(sim_T > T_stat[i]) + 1) / (B + 1)
p[i] = (sum(sim_T > T_stat[i]) + 1) / (length(sim_T) + 1)
}
names(p) <- names(T_stat)

# generate ECDF and output
list(ecdf = comp_p_function, p = p, sim_T = sim_T, T_stat = T_stat)
# list(ecdf = comp_p_function, p = p, sim_T = sim_T, T_stat = T_stat)
list(p = p, sim_T = sim_T, T_stat = T_stat)
}


Expand Down Expand Up @@ -989,7 +1025,8 @@ sd_of_trimmed <- function(...) {
# This runs the simulation with the given paramets
# Use ... to pass options to simulate_ecdf_quant_statistic()
# This also passes remaining options onto quant_statistic() and quant_statistic_sampp()
simulation_run <- function(data, B = 99, trim = 0.15, ...) {
simulation_run <- function(data, B = 99, trim = 0.15,
T_stats = NULL, ...) {
N <- data$samples[, .N]
L <- data$db[, .N]
p.matrix <- matrix(nrow = N, ncol = L)
Expand All @@ -1001,7 +1038,11 @@ simulation_run <- function(data, B = 99, trim = 0.15, ...) {
message("Simulating distribution for ", loc)
qm_loop <- make_quantiles_matrix(data, loc, read_count_quant = 1,
method = "quantile7", min.n = 3)
xec <- simulate_ecdf_quant_statistic(qm_loop, B, ...)
T_stat_loc <- T_stats[locus == loc, ]
T_stat <- T_stat_loc[, tsum]
names(T_stat) <- T_stat_loc[, sample]
xec <- simulate_ecdf_quant_statistic(qm_loop, B,
T_stat = T_stat, ...)
p.matrix[names(xec$p), loc] <- xec$p
qmmats[[loc]] <- qm_loop
xecs[[loc]] <- xec
Expand Down
46 changes: 40 additions & 6 deletions R/tsum_test.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#' @param trim Trim this proportion of data points at each quantile
#' level (rounded up). Must be at least 0 and less than 0.5, but values close
#' to 0.5 may remove all samples and hence result in an error.
#' @param trim.cc Trim value used in case-control analysis. Default of 0.
#' @param min.quant Only quantiles above this value are used in constructing the statistic.
#' @param give.pvalue Whether to calculate the p-value. As this can be slow it can be
#' useful to turn off if only the t sum statistics are required.
Expand All @@ -16,6 +17,8 @@
#' decimal fractions.
#' @param correction Correction method of p_value() function.
#' @param alpha Signficance level of p_value() function.
#' @param case_control If TRUE, only calculate for samples designated cases. Otherwise
#' all samples are used to calculate the background distribution.
#' @param parallel Use the parallel package when simulating the distribution, creating the
#' required cluster.
#' If cluster is specified then this option makes no difference.
Expand All @@ -27,6 +30,9 @@
#' If cluster is specified then this option makes no difference.
#' @param cluster A cluster object from the parallel package. Use if you wish to set up
#' the cluster yourself or reuse an existing cluster.
#' @param keep.sim.tsum For inspection of simulations.
#' If TRUE, keep all simulation Tsum statistics in output$xecs (default FALSE).
#'
#' @return An exstra_tsum object with T statistics and p-values (if calculated).
#'
#' @seealso \code{\link{tsum_p_value_summary}}
Expand Down Expand Up @@ -56,15 +62,18 @@
#' @export
tsum_test <- function(strscore,
trim = 0.15,
trim.cc = 0,
min.quant = 0.5,
give.pvalue = TRUE,
B = 9999,
correction = c("bf", "locus", "uncorrected"),
B = 999,
correction = c("bf", "loci", "samples", "uncorrected"),
alpha = 0.05,
case_control = FALSE,
parallel = FALSE, # TRUE for cluster
cluster_n = NULL, # Cluster size if cluster == NULL. When NULL, #threads - 1 (but always at least 1)
cluster = NULL # As created by the parallel package. If cluster == NULL and parallel == TRUE, then a
cluster = NULL, # As created by the parallel package. If cluster == NULL and parallel == TRUE, then a
# PSOCK cluster is automatically created with the parallel package.
keep.sim.tsum = FALSE
)
{
# Check inputs
Expand All @@ -79,6 +88,11 @@ tsum_test <- function(strscore,
is.null(cluster) || inherits(cluster, "cluster"))
assert("cluster_n should be at least 1 and a whole-number.", is.null(cluster_n) || cluster_n >= 1)

# temp code until this is developed
if(case_control && give.pvalue) {
stop("tsum_test() cannot yet use cases and controls to generate p-values from simulation.")
}

if(parallel) {
n_cores <- detectCores(all.tests = FALSE, logical = TRUE)
if(is.null(cluster_n)) {
Expand All @@ -105,7 +119,14 @@ tsum_test <- function(strscore,
# message("Generating T sum statistics for ", loc)
qm <- make_quantiles_matrix(strscore, loc = loc, sample = NULL, read_count_quant = 1,
method = "quantile7", min.n = 3)
T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim) # quant at default of 0.5
if(case_control) {
# Only calculating for case samples
T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim.cc,
case_samples = strscore$samples[group == "case", sample])
} else {
# Using all samples as the background:
T_stats_loc <- quant_statistic_sampp(qm, quant = min.quant, trim = trim) # quant at default of 0.5
}
T_stats_list[[loc]] <- data.table(sample = names(T_stats_loc), tsum = T_stats_loc)
}
T_stats <- rbindlist(T_stats_list, idcol = "locus")
Expand All @@ -132,7 +153,8 @@ tsum_test <- function(strscore,
parallel = parallel, # TRUE for cluster
cluster = cluster,
cluster_n = cluster_n,
trim = trim
trim = trim,
T_stats = T_stats
),
error = function(x) {
# list(cohort = this.cohort, p.matrix = c(0, 1), error = x)
Expand All @@ -151,11 +173,23 @@ tsum_test <- function(strscore,
pvals <- NULL
}
# Prepare output
exstra_tsum_new_(strscore, tsum = T_stats, p.values = pvals,
outtsum <- exstra_tsum_new_(strscore, tsum = T_stats, p.values = pvals,
qmats = sim.results$qmmats, xecs = sim.results$xecs,
correction = correction,
alpha = alpha,
args = list(trim = trim, min.quant = min.quant, B = B))
if(give.pvalue) {
Nsim <- B * strscore$samples[, .N]
outtsum$stats[, p.value.sd :=
sqrt(p.value * ((Nsim + 2)/(Nsim + 1) - p.value) / Nsim)
]
}
if(! keep.sim.tsum) {
for(i in seq_along(outtsum$xecs)) {
outtsum$xecs[[i]]$sim_T <- NULL
}
}
outtsum
}

# test code:
Expand Down
11 changes: 7 additions & 4 deletions examples/exSTRa_score_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,19 @@ plot_cols

par(mfrow = c(2, 2))
# Bonferroni correction is too severe here, so we use Bonferroni correction only on each
# locus.
plot(tsum, sample_col = plot_cols, correction = "locus")
# locus for the number of samples.
plot(tsum, sample_col = plot_cols, correction = "samples")

# Or Bonferroni correction for the number of loci tested:
plot(tsum, sample_col = plot_cols, correction = "loci")

# Give a table of each sample and locus with the p-value, and if it is significant:
(ps <- p_values(tsum, correction = "locus"))
(ps <- p_values(tsum, correction = "samples"))

# this may be acted on directly:
ps[identity(signif)]
# or with the only.signif option:
p_values(tsum, only.signif = TRUE, correction = "locus")
p_values(tsum, only.signif = TRUE, correction = "samples")

# Give the best hit(s) for each sample:
# TODO: what is best for display may not be the best for internal representation.
Expand Down
9 changes: 7 additions & 2 deletions man/p_values.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 47b45cb

Please sign in to comment.