diff --git a/R/cancerSeqStudy.R b/R/cancerSeqStudy.R index b7d496e..cfdc1ed 100644 --- a/R/cancerSeqStudy.R +++ b/R/cancerSeqStudy.R @@ -24,295 +24,9 @@ suppressPackageStartupMessages(library(VGAM)) suppressPackageStartupMessages(library(reshape2)) suppressPackageStartupMessages(library(parallel)) -#' calculates the power in a binomial power model -#' for significantly mutated genes -#' -#' @param my.mu per base rate of mutation for binomial -#' @param N vector of sample sizes -#' @param r effect size for power analysis -#' @param signif.level alpha level for power analysis -#' @return vector containing power for each sample size -smg.binom.power <- function(my.mu, - N, - Leff=1500*3/4, - r=.02, - signif.level=5e-6){ - # examine power of binomial test - # first find critical value based on binomial distribution - # Calculate power for various sizes with different effects - muEffect <- 1 - ((1-my.mu)^Leff - r)^(1/Leff) - power <- c() - falsePositives <- c() - for(i in N){ - # step one, find critical threshold - j <- 1 - while(j){ - pval <- 1-pbinom(j-1, Leff*i, my.mu) - if(pval <= signif.level){ - Xc <- j - break - } - j <- j+1 - } - - # step two, calculate power - p <- 1-pbinom(Xc-1, Leff*i, muEffect) - power <- c(power, p) - - } - - return(power) -} - - -#' calculates the false positives in a binomial model -#' for identifying significantly mutated genes if -#' there is over-diserspion. -#' -#' @param my.alpha alpha parameter for beta binomial -#' @param my.beta beta parameter for beta binomial -#' @param N vector of # samples to calculate power for -#' @param Leff effective gene length in bases -#' @param num.genes number of genes that are tested -#' @param signif.level alpha level for power analysis -smg.binom.false.pos <- function(my.alpha, my.beta, - N, Leff=1500*3/4, - num.genes=18500, - signif.level=5e-6){ - # calculate mutation rate from alpha/beta - my.mu <- my.alpha / (my.alpha + my.beta) - # examine power of binomial test - # first find critical value based on binomial distribution - power <- c() - falsePositives <- c() - for(i in N){ - # step one, find critical threshold - j <- 1 - while(j){ - pval <- 1-pbinom(j-1, Leff*i, my.mu) - if(pval <= signif.level){ - Xc <- j - break - } - j <- j+1 - } - - # step two, calculate false positives if overdispersion - fp <- 1 - pbetabinom.ab(Xc-1, Leff*i, my.alpha, my.beta) - falsePositives <- c(falsePositives, num.genes*fp) - } - return(falsePositives) -} - -#' calculates the false positives for a binomial model of -#' a ratio-metric feature. -#' -#' @param my.alpha alpha parameter for beta binomial -#' @param my.beta beta parameter for beta binomial -#' @param N vector of # samples to calculate power for -#' @param mu mutation rate per base -#' @param L gene CDS length in bases -#' @param num.genes number of genes that are tested -#' @param signif.level alpha level for power analysis -ratiometric.binom.false.pos <- function(my.alpha, my.beta, - N, mu, L=1500, - num.genes=18500, - signif.level=5e-6){ - # calculate the ratio-metric fraction from alpha and beta - p <- my.alpha / (my.alpha + my.beta) - # examine power of binomial test - # first find critical value based on binomial distribution - power <- c() - falsePositives <- c() - for(i in N){ - # step one, find the # of mutations where - # it is expected to occur at least 90% of the time - #j <- 1 - #while(j){ - # prob <- pbinom(j-1, L*i, mu) - # if(prob >= .1){ - # mutEff <- j - # break - # } - # j <- j+1 - #} - mutEff <- ceiling(L*i*mu) - - # step one, find critical threshold - j <- 1 - while(j){ - pval <- 1-pbinom(j-1, mutEff, p) - if(pval <= signif.level){ - Xc <- j - break - } - j <- j+1 - } - - # step two, calculate false positives if overdispersion - fp <- 1 - pbetabinom.ab(Xc-1, mutEff, my.alpha, my.beta) - falsePositives <- c(falsePositives, num.genes*fp) - } - return(falsePositives) -} - -#' calculates the power in a beta-binomial model for -#' significantly mutated genes. -#' -#' @param my.alpha alpha parameter for beta binomial -#' @param my.beta beta parameter for beta binomial -#' @param N maximum number of sample to calculate power for -#' @param Leff effective gene length in bases -#' @param r effect size for power analysis -#' @param signif.level alpha level for power analysis -smg.bbd.power <- function(my.alpha, my.beta, - N, - Leff=1500*3/4, - r=.02, - signif.level=5e-6){ - # calc the mutation rate from alpha/beta - my.mu <- my.alpha / (my.alpha + my.beta) - # examine power of binomial test - # first find critical value based on binomial distribution - # Calculate power for various sizes with different effects - muEffect <- 1 - ((1-my.mu)^Leff - r)^(1/Leff) - power <- c() - falsePositives <- c() - for(i in N){ - # step one, find critical threshold - j <- 1 - while(j){ - pval <- 1-pbetabinom.ab(j-1, Leff*i, my.alpha, my.beta) - if(pval <= signif.level){ - Xc <- j - break - } - j <- j+1 - } - - # step two, calculate power - p <- 1-pbinom(Xc-1, Leff*i, muEffect) - power <- c(power, p) - - } - return(power) -} - -#' calculates the power in a binomial power model -#' for ratio-metric approach -#' -#' @param p background proportion of total mutations falling into specific category -#' @param N vector of sample sizes -#' @param mu per base rate of mutation -#' @param r effect size for power analysis -#' @param signif.level alpha level for power analysis -#' @return vector containing power for each sample size -ratiometric.binom.power <- function(p, N, mu, - L=1500, r=.02, - signif.level=5e-6){ - # figure out the target mutation rate for effect size is - muEffect <- 1 - ((1-mu)^(L) - r)^(1/L) - # Calculate the discrepancy between the background and - # target effect size - muDiff <- muEffect - mu - # given the mutation rates calculate the target effect - # size for a ratio-metric method - pEffect <- (mu*p + muDiff) / muEffect - - # iterate over the number of samples - power <- c() - for(i in N){ - # step one, find the # of mutations where - # it is expected to occur at least 90% of the time - j <- 1 - while(j){ - prob <- pbinom(j-1, L*i, muEffect) - if(prob >= .1){ - mutEff <- j - break - } - j <- j+1 - } - - # step two, find critical threshold - j <- 1 - while(j){ - pval <- 1-pbinom(j-1, mutEff, p) - if(pval <= signif.level){ - Xc <- j - break - } - j <- j+1 - } - - # step three, calculate power - prob <- 1-pbinom(Xc-1, mutEff, pEffect) - power <- c(power, prob) - } - return(power) -} - -#' Calculates the power in a ratio-metric approach using -#' a beta-binomial power model. -#' -#' The alpha and beta parameterize a proportion out of the -#' total mutations in a gene, rather than a mutation rate per base. -#' -#' @param my.alpha alpha parameter for beta binomial -#' @param my.beta beta parameter for beta binomial -#' @param N vector of sample sizes -#' @param mu per base rate of mutation -#' @param r effect size for power analysis -#' @param signif.level alpha level for power analysis -#' @return vector containing power for each sample size -ratiometric.bbd.power <- function(my.alpha, my.beta, - N, mu, - L=1500, r=.02, - signif.level=5e-6){ - # figure out what the ratio-metric probability is from - # the alpha and beta parameters - p <- my.alpha / (my.alpha + my.beta) - # figure out the target mutation rate for effect size is - muEffect <- 1 - ((1-mu)^(L) - r)^(1/L) - # Calculate the discrepancy between the background and - # target effect size - muDiff <- muEffect - mu - # given the mutation rates calculate the target effect - # size for a ratio-metric method - pEffect <- (mu*p + muDiff) / muEffect - - # iterate over the number of samples - power <- c() - for(i in N){ - # step one, find the # of mutations where - # it is expected to occur at least 90% of the time - j <- 1 - while(j){ - prob <- pbinom(j-1, L*i, muEffect) - if(prob >= .1){ - mutEff <- j - break - } - j <- j+1 - } - - # step two, find critical threshold - j <- 1 - while(j){ - pval <- 1-pbetabinom.ab(j-1, mutEff, my.alpha, my.beta) - if(pval <= signif.level){ - Xc <- j - break - } - j <- j+1 - } - - # step three, calculate power - prob <- 1-pbinom(Xc-1, mutEff, pEffect) - power <- c(power, prob) - } - return(power) -} +# load other R files +source("smg.R") +source("ratioMetric.R") ############################# # Convert a rate and coefficient @@ -333,290 +47,6 @@ rateCvToAlphaBeta <- function(rate, cv) { return(list(alpha=my.alpha, beta=my.beta)) } -################### -# Functions to calculate the required sample size -################## - -#' Calculates the smallest sample size to detect driver genes for which -#' there is sufficient power using a beta-binomial model. -#' -#' Effect size is measures as the fraction of sample/patient cancers with a non-silent -#' mutation in a driver gene above the background mutation rate. -#' -#' @param desired.power A floating point number indicating desired power -#' @param mu Mutation rate per base -#' @param cv Coefficient of Variation surrounding the uncertaintly in mutation rate -#' @param possible.samp.sizes vector of possible number of cancer samples in study -#' @param effect.size fraction of samples above background mutation rate -#' @param signif.level significance level for binomial test -#' @param Leff effective gene length of CDS in bases for an average gene -#' @return List containing the smallest effect size with sufficient power -smgBbdRequiredSampleSize <- function(desired.power, mu, cv, possible.samp.sizes, - effect.size, signif.level=5e-6, Leff=1500*3/4){ - # get alpha and beta parameterization - # for beta-binomial - params <- rateCvToAlphaBeta(mu, cv) - - # calc power - power.result.bbd <- smg.bbd.power(params$alpha, params$beta, possible.samp.sizes, Leff, - signif.level=signif.level, r=effect.size) - - # find min/max samples to achieve desired power - bbd.samp.size.min <- possible.samp.sizes[min(which(power.result.bbd>=desired.power))] - bbd.samp.size.max <- possible.samp.sizes[max(which(power.result.bbd=desired.power))] - binom.samp.size.max <- possible.samp.sizes[max(which(power.result.binom=desired.power))] - ratiometric.samp.size.max <- possible.samp.sizes[max(which(power.result.ratio=desired.power))] - ratiometric.samp.size.max <- possible.samp.sizes[max(which(power.result.ratio=desired.power))] - bbd.eff.size.max <- possible.effect.sizes[max(which(pow.vec=desired.power))] - binom.eff.size.max <- possible.effect.sizes[max(which(pow.vec=desired.power))] - binom.eff.size.max <- possible.effect.sizes[max(which(pow.vec=desired.power))] - bbd.eff.size.max <- possible.effect.sizes[max(which(pow.vec= .1){ + mutEff <- j + break + } + j <- j+1 + } + + # step two, find critical threshold + j <- 1 + while(j){ + pval <- 1-pbinom(j-1, mutEff, p) + if(pval <= signif.level){ + Xc <- j + break + } + j <- j+1 + } + + # step three, calculate power + prob <- 1-pbinom(Xc-1, mutEff, pEffect) + power <- c(power, prob) + } + return(power) +} + +#' Calculates the power in a ratio-metric approach using +#' a beta-binomial power model. +#' +#' The alpha and beta parameterize a proportion out of the +#' total mutations in a gene, rather than a mutation rate per base. +#' +#' @param my.alpha alpha parameter for beta binomial +#' @param my.beta beta parameter for beta binomial +#' @param N vector of sample sizes +#' @param mu per base rate of mutation +#' @param r effect size for power analysis +#' @param signif.level alpha level for power analysis +#' @return vector containing power for each sample size +ratiometric.bbd.power <- function(my.alpha, my.beta, + N, mu, + L=1500, r=.02, + signif.level=5e-6){ + # figure out what the ratio-metric probability is from + # the alpha and beta parameters + p <- my.alpha / (my.alpha + my.beta) + # figure out the target mutation rate for effect size is + muEffect <- 1 - ((1-mu)^(L) - r)^(1/L) + # Calculate the discrepancy between the background and + # target effect size + muDiff <- muEffect - mu + # given the mutation rates calculate the target effect + # size for a ratio-metric method + pEffect <- (mu*p + muDiff) / muEffect + + # iterate over the number of samples + power <- c() + for(i in N){ + # step one, find the # of mutations where + # it is expected to occur at least 90% of the time + j <- 1 + while(j){ + prob <- pbinom(j-1, L*i, muEffect) + if(prob >= .1){ + mutEff <- j + break + } + j <- j+1 + } + + # step two, find critical threshold + j <- 1 + while(j){ + pval <- 1-pbetabinom.ab(j-1, mutEff, my.alpha, my.beta) + if(pval <= signif.level){ + Xc <- j + break + } + j <- j+1 + } + + # step three, calculate power + prob <- 1-pbinom(Xc-1, mutEff, pEffect) + power <- c(power, prob) + } + return(power) +} + +################################## +# Estimated false postives +################################## + +#' calculates the false positives for a binomial model of +#' a ratio-metric feature. +#' +#' @param my.alpha alpha parameter for beta binomial +#' @param my.beta beta parameter for beta binomial +#' @param N vector of # samples to calculate power for +#' @param mu mutation rate per base +#' @param L gene CDS length in bases +#' @param num.genes number of genes that are tested +#' @param signif.level alpha level for power analysis +ratiometric.binom.false.pos <- function(my.alpha, my.beta, + N, mu, L=1500, + num.genes=18500, + signif.level=5e-6){ + # calculate the ratio-metric fraction from alpha and beta + p <- my.alpha / (my.alpha + my.beta) + # examine power of binomial test + # first find critical value based on binomial distribution + power <- c() + falsePositives <- c() + for(i in N){ + # step one, find the # of mutations where + # it is expected to occur at least 90% of the time + #j <- 1 + #while(j){ + # prob <- pbinom(j-1, L*i, mu) + # if(prob >= .1){ + # mutEff <- j + # break + # } + # j <- j+1 + #} + mutEff <- ceiling(L*i*mu) + + # step one, find critical threshold + j <- 1 + while(j){ + pval <- 1-pbinom(j-1, mutEff, p) + if(pval <= signif.level){ + Xc <- j + break + } + j <- j+1 + } + + # step two, calculate false positives if overdispersion + fp <- 1 - pbetabinom.ab(Xc-1, mutEff, my.alpha, my.beta) + falsePositives <- c(falsePositives, num.genes*fp) + } + return(falsePositives) +} + +############################ +# Calculate required samples size +############################ + +#' Calculates the smallest sample size to detect driver genes for which +#' there is sufficient power using a binomial model for ratio-metric features. +#' +#' Effect size is measures as the fraction of sample/patient cancers with a non-silent +#' mutation in a driver gene above the background mutation rate. +#' +#' @param p the background fraction of total mutations represented by the ratio-metric feature (e.g. inactivating mutations / total) +#' @param desired.power A floating point number indicating desired power +#' @param possible.samp.sizes vector of possible number of cancer samples in study +#' @param mu mutation rate per base +#' @param effect.size fraction of samples above background mutation rate +#' @param signif.level significance level for binomial test +#' @param L gene length of CDS in bases for an average gene +#' @return List containing the smallest effect size with sufficient power +ratiometricBinomRequiredSampleSize <- function(p, desired.power, possible.samp.sizes, mu, + effect.size, signif.lvl=5e-6, L=1500){ + # calculate power + power.result.ratio <- ratiometric.binom.power(p, possible.samp.sizes, mu, L, + signif.level=signif.lvl, + r=effect.size) + ratiometric.samp.size.min <- possible.samp.sizes[min(which(power.result.ratio>=desired.power))] + ratiometric.samp.size.max <- possible.samp.sizes[max(which(power.result.ratio=desired.power))] + ratiometric.samp.size.max <- possible.samp.sizes[max(which(power.result.ratio=desired.power))] + binom.eff.size.max <- possible.effect.sizes[max(which(pow.vec=desired.power))] + bbd.eff.size.max <- possible.effect.sizes[max(which(pow.vec=desired.power))] + bbd.samp.size.max <- possible.samp.sizes[max(which(power.result.bbd=desired.power))] + binom.samp.size.max <- possible.samp.sizes[max(which(power.result.binom=desired.power))] + bbd.eff.size.max <- possible.effect.sizes[max(which(pow.vec=desired.power))] + binom.eff.size.max <- possible.effect.sizes[max(which(pow.vec