diff --git a/R/cancerSeqStudy.R b/R/cancerSeqStudy.R index 0231a15..d154550 100644 --- a/R/cancerSeqStudy.R +++ b/R/cancerSeqStudy.R @@ -6,6 +6,7 @@ if ("getopt" %in% rownames(installed.packages())){ 'mcores', 'c', 1, 'integer', 'output', 'o', 1, 'character', 'ratioMetric', 'r', 2, 'double', + 'driverFrac', 'd', 2, 'double', 'help', 'h', 0, 'logical' ), byrow=TRUE, ncol=4) opt = getopt(spec) @@ -24,7 +25,6 @@ suppressPackageStartupMessages(library(VGAM)) suppressPackageStartupMessages(library(reshape2)) suppressPackageStartupMessages(library(parallel)) - ############################# # Convert a rate and coefficient # of variation parameter into @@ -44,7 +44,6 @@ rateCvToAlphaBeta <- function(rate, cv) { return(list(alpha=my.alpha, beta=my.beta)) } - ############################# # Analyze power and false positives # when using a beta-binomial model @@ -54,7 +53,7 @@ smgBbdFullAnalysis <- function(mu, cv, Leff, signif.level, effect.size, # find the power and numer of samples needed for a desired power powerResult <- smgBbdRequiredSampleSize(desired.power, mu, cv, samp.sizes, - effect.size, signif.level, Leff) + effect.size, signif.level, Leff) bbd.samp.size.min <- powerResult$samp.size.min bbd.samp.size.max <- powerResult$samp.size.max power.result.bbd <- powerResult$power @@ -82,12 +81,12 @@ smgBbdFullAnalysis <- function(mu, cv, Leff, signif.level, effect.size, } -ratiometricBbdFullAnalysis <- function(p, cv, mu, L, signif.level, effect.size, - desired.power, samp.sizes){ +ratiometricBbdFullAnalysis <- function(p, cv, mu, Leff, signif.level, effect.size, + desired.power, samp.sizes, Df){ # find the power and numer of samples needed for a desired power powerResult <- ratiometricBbdRequiredSampleSize(p, cv, desired.power, samp.sizes, mu, - effect.size, signif.level, L) + effect.size, Df, signif.level, Leff) bbd.samp.size.min <- powerResult$samp.size.min bbd.samp.size.max <- powerResult$samp.size.max power.result.bbd <- powerResult$power @@ -98,7 +97,7 @@ ratiometricBbdFullAnalysis <- function(p, cv, mu, L, signif.level, effect.size, # find expected number of false positives fp.result <- ratiometric.binom.false.pos(params$alpha, params$beta, samp.sizes, - mu, L, signif.level=signif.level) + mu, Leff, signif.level=signif.level) # save binomial data tmp.df <- data.frame(sample.size=samp.sizes) @@ -109,7 +108,8 @@ ratiometricBbdFullAnalysis <- function(p, cv, mu, L, signif.level, effect.size, tmp.df['signif.level'] <- signif.level tmp.df['effect.size'] <- effect.size tmp.df['mutation.rate'] <- mu - tmp.df["Ratio-metric.P"] <- p + tmp.df["Ratio-metric P"] <- p + tmp.df["Driver fraction"] <- Df tmp.df["FP"] <- fp.result return(tmp.df) @@ -142,11 +142,11 @@ smgBinomFullAnalysis <- function(mu, Leff, signif.level, effect.size, return(tmp.df) } -ratiometricBinomFullAnalysis <- function(p, mu, L, signif.level, effect.size, - desired.power, samp.sizes){ +ratiometricBinomFullAnalysis <- function(p, mu, Leff, signif.level, effect.size, + desired.power, samp.sizes, Df){ # calculate power - power.result.binom <- ratiometric.binom.power(p, samp.sizes, mu, L, - signif.level=signif.level, + power.result.binom <- ratiometric.binom.power(p, samp.sizes, mu, Leff=Leff, + Df=Df, signif.level=signif.level, r=effect.size) binom.samp.size.min <- samp.sizes[min(which(power.result.binom>=desired.power))] binom.samp.size.max <- samp.sizes[max(which(power.result.binom 0) { + # Rscript + return(normalizePath(sub(needle, "", cmdArgs[match]))) + } else { + ls_vars = ls(sys.frames()[[1]]) + if ("fileName" %in% ls_vars) { + # Source'd via RStudio + return(normalizePath(sys.frames()[[1]]$fileName)) + } else { + # Source'd via R console + return(normalizePath(sys.frames()[[1]]$ofile)) + } + } +} + # Run as a script if arguments provided if (!is.null(opt$ARGS)){ # load other R files - source("smg.R") - source("ratioMetric.R") + script.path <- thisfile() + script.dir <- dirname(script.path) + source(file.path(script.dir, "smg.R")) + source(file.path(script.dir, "ratioMetric.R")) ############################# # define the model params @@ -285,10 +310,14 @@ if (!is.null(opt$ARGS)){ possible.cvs <- c(.05, .1, .2) # coefficient of variation for mutation rate per base effect.sizes <- c(.01, .02, .05) # fraction of samples above background alpha.levels <- c(5e-6) # list for level of significance + + # fraction of driver mutations being a certain + # category of interest for ratio-metric methods + driver.frac <- opt$driverFrac # setting up the sample sizes to check N <- 25000 - by.step <- 25 + by.step <- 10 samp.sizes <- seq(by.step, N, by=by.step) # grid of sample sizes to check ################################## @@ -316,8 +345,8 @@ if (!is.null(opt$ARGS)){ ############################ result.list <- mclapply(param.list, runAnalysisList, mc.cores=opt$mcores, analysisType=cmdType, samp.sizes=samp.sizes, - desired.power=desired.power, - Leff=Leff, L=L, possible.cvs=possible.cvs) + desired.power=desired.power, Df=driver.frac, + Leff=Leff, possible.cvs=possible.cvs) result.df <- do.call("rbind", result.list) # adjust mutation rates back to the average diff --git a/R/ratioMetric.R b/R/ratioMetric.R index 0f604e7..7c873ed 100644 --- a/R/ratioMetric.R +++ b/R/ratioMetric.R @@ -1,5 +1,4 @@ suppressPackageStartupMessages(library(VGAM)) -suppressPackageStartupMessages(library(reshape2)) ################################## # Power calculatations @@ -11,20 +10,22 @@ suppressPackageStartupMessages(library(reshape2)) #' @param p background proportion of total mutations falling into specific category #' @param N vector of sample sizes #' @param mu per base rate of mutation +#' @param Df fraction of driver mutations that are the specific one of interest +#' @param Leff gene length in bases #' @param r effect size for power analysis #' @param signif.level alpha level for power analysis #' @return vector containing power for each sample size -ratiometric.binom.power <- function(p, N, mu, - L=1500, r=.02, +ratiometric.binom.power <- function(p, N, mu, + Df=1.0, Leff=1500*3/4, r=.02, signif.level=5e-6){ # figure out the target mutation rate for effect size is - muEffect <- 1 - ((1-mu)^(L) - r)^(1/L) + muEffect <- 1 - ((1-mu)^(Leff) - r)^(1/Leff) # Calculate the discrepancy between the background and # target effect size muDiff <- muEffect - mu # given the mutation rates calculate the target effect # size for a ratio-metric method - pEffect <- (mu*p + muDiff) / muEffect + pEffect <- (mu*p + Df*muDiff) / muEffect # iterate over the number of samples power <- c() @@ -33,7 +34,7 @@ ratiometric.binom.power <- function(p, N, mu, # it is expected to occur at least 90% of the time j <- 1 while(j){ - prob <- pbinom(j-1, L*i, muEffect) + prob <- pbinom(j-1, Leff*i, muEffect) if(prob >= .1){ mutEff <- j break @@ -69,24 +70,25 @@ ratiometric.binom.power <- function(p, N, mu, #' @param my.beta beta parameter for beta binomial #' @param N vector of sample sizes #' @param mu per base rate of mutation +#' @param Leff length of gene in bases #' @param r effect size for power analysis #' @param signif.level alpha level for power analysis #' @return vector containing power for each sample size ratiometric.bbd.power <- function(my.alpha, my.beta, - N, mu, - L=1500, r=.02, + N, mu, Df=1.0, + Leff=1500*3/4, r=.02, signif.level=5e-6){ # figure out what the ratio-metric probability is from # the alpha and beta parameters p <- my.alpha / (my.alpha + my.beta) # figure out the target mutation rate for effect size is - muEffect <- 1 - ((1-mu)^(L) - r)^(1/L) + muEffect <- 1 - ((1-mu)^(Leff) - r)^(1/Leff) # Calculate the discrepancy between the background and # target effect size muDiff <- muEffect - mu # given the mutation rates calculate the target effect # size for a ratio-metric method - pEffect <- (mu*p + muDiff) / muEffect + pEffect <- (mu*p + Df*muDiff) / muEffect # iterate over the number of samples power <- c() @@ -95,7 +97,7 @@ ratiometric.bbd.power <- function(my.alpha, my.beta, # it is expected to occur at least 90% of the time j <- 1 while(j){ - prob <- pbinom(j-1, L*i, muEffect) + prob <- pbinom(j-1, Leff*i, muEffect) if(prob >= .1){ mutEff <- j break @@ -136,7 +138,7 @@ ratiometric.bbd.power <- function(my.alpha, my.beta, #' @param num.genes number of genes that are tested #' @param signif.level alpha level for power analysis ratiometric.binom.false.pos <- function(my.alpha, my.beta, - N, mu, L=1500, + N, mu, Leff=1500*3/4, num.genes=18500, signif.level=5e-6){ # calculate the ratio-metric fraction from alpha and beta @@ -157,7 +159,7 @@ ratiometric.binom.false.pos <- function(my.alpha, my.beta, # } # j <- j+1 #} - mutEff <- ceiling(L*i*mu) + mutEff <- ceiling(Leff*i*mu) # step one, find critical threshold j <- 1 @@ -196,10 +198,10 @@ ratiometric.binom.false.pos <- function(my.alpha, my.beta, #' @param L gene length of CDS in bases for an average gene #' @return List containing the smallest effect size with sufficient power ratiometricBinomRequiredSampleSize <- function(p, desired.power, possible.samp.sizes, mu, - effect.size, signif.lvl=5e-6, L=1500){ + effect.size, Df=1.0, signif.lvl=5e-6, Leff=1500*3/4){ # calculate power - power.result.ratio <- ratiometric.binom.power(p, possible.samp.sizes, mu, L, - signif.level=signif.lvl, + power.result.ratio <- ratiometric.binom.power(p, possible.samp.sizes, mu, Leff, + Df=Df, signif.level=signif.lvl, r=effect.size) ratiometric.samp.size.min <- possible.samp.sizes[min(which(power.result.ratio>=desired.power))] ratiometric.samp.size.max <- possible.samp.sizes[max(which(power.result.ratio=desired.power))] @@ -266,12 +268,12 @@ ratiometricBbdRequiredSampleSize <- function(p, cv, desired.power, possible.samp #' @param Leff effective gene length of CDS in bases for an average gene #' @return List containing the smallest effect size with sufficient power ratiometricBinomPoweredEffectSize <- function(possible.effect.sizes, desired.power, p, mu, - samp.size, signif.level=5e-6, L=1500) { + samp.size, Df=1.0, signif.level=5e-6, Leff=1500*3/4) { # calculate the power for each effect size pow.vec <- c() for(effect.size in possible.effect.sizes){ - pow <- ratiometric.binom.power(p, samp.size, mu, L, - signif.level=signif.level, + pow <- ratiometric.binom.power(p, samp.size, mu, Leff, + Df=Df, signif.level=signif.level, r=effect.size) pow.vec <- c(pow.vec, pow) } @@ -302,7 +304,7 @@ ratiometricBinomPoweredEffectSize <- function(possible.effect.sizes, desired.pow #' @param Leff effective gene length of CDS in bases for an average gene #' @return List containing the smallest effect size with sufficient power ratiometricBbdPoweredEffectSize <- function(possible.effect.sizes, desired.power, p, cv, mu, - samp.size, signif.level=5e-6, L=1500) { + samp.size, Df=1.0, signif.level=5e-6, Leff=1500*3/4) { # figure out alpha/beta for beta-binomial params <- rateCvToAlphaBeta(p, cv) @@ -310,8 +312,8 @@ ratiometricBbdPoweredEffectSize <- function(possible.effect.sizes, desired.power pow.vec <- c() for(effect.size in possible.effect.sizes){ pow <- ratiometric.bbd.power(params$alpha, params$beta, - samp.size, mu, L, - signif.level=signif.level, + samp.size, mu, Leff, + Df=Df, signif.level=signif.level, r=effect.size) pow.vec <- c(pow.vec, pow) } diff --git a/R/smg.R b/R/smg.R index 063499b..35c3793 100644 --- a/R/smg.R +++ b/R/smg.R @@ -1,5 +1,4 @@ suppressPackageStartupMessages(library(VGAM)) -suppressPackageStartupMessages(library(reshape2)) ################################## # Power calculatations diff --git a/README.md b/README.md index 5ae8797..beb56e9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # cancerSeqStudy -Identifying genes with more mutations then expected has been central methodology for identifying putative cancer driver genes in exome sequencing studies of cancer samples. Identifying significantly mutated genes (SMG) fundamentally relies on estimating a background mutation rate. Mutation rate varies over more than 2 orders of magnitude providing a substantial statistical estimation challenge. Analysis not accounting for the uncertainty in mutation rate yields overly optimistic assessments. In this package, we examine statistical power (either with known or uncertain mutation rate) and false positives induced by unaccounted variation in mutation rate. +Identifying genes with more mutations then expected has been central methodology for identifying putative cancer driver genes in exome sequencing studies of cancer samples. Identifying significantly mutated genes (SMG) fundamentally relies on estimating a background mutation rate. Mutation rate varies over more than 2 orders of magnitude providing a substantial statistical estimation challenge. However, recent methods have taken an alternative approach known as "ratio-metric" method. Ratio-metric methods examine specific compositions of mutations normalized by the total number of mutations occurring in the gene. Regardless of methodology, analysis not accounting for the uncertainty in mutation parameters yields overly optimistic assessments. In this package, we examine statistical power (either with known or uncertain mutation rate) and false positives induced by unaccounted variation in mutation rate. ## Documentation