From 111d744937e65e42ab867b74e629e27421e9cf81 Mon Sep 17 00:00:00 2001 From: ctokheim Date: Sat, 14 May 2016 15:54:11 -0400 Subject: [PATCH 1/6] Fixed running from Rscript by appropriate sourcing of other files in package --- R/cancerSeqStudy.R | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/R/cancerSeqStudy.R b/R/cancerSeqStudy.R index 0231a15..ce5e4e4 100644 --- a/R/cancerSeqStudy.R +++ b/R/cancerSeqStudy.R @@ -24,7 +24,6 @@ suppressPackageStartupMessages(library(VGAM)) suppressPackageStartupMessages(library(reshape2)) suppressPackageStartupMessages(library(parallel)) - ############################# # Convert a rate and coefficient # of variation parameter into @@ -255,11 +254,34 @@ runRatiometricAnalysis <- function(p, mu, effect.size, signif.level, return(result.df) } +#' Utility function to get location of script when +#' executing Rscript. +thisfile <- function() { + cmdArgs = commandArgs(trailingOnly = FALSE) + needle = "--file=" + match = grep(needle, cmdArgs) + if (length(match) > 0) { + # Rscript + return(normalizePath(sub(needle, "", cmdArgs[match]))) + } else { + ls_vars = ls(sys.frames()[[1]]) + if ("fileName" %in% ls_vars) { + # Source'd via RStudio + return(normalizePath(sys.frames()[[1]]$fileName)) + } else { + # Source'd via R console + return(normalizePath(sys.frames()[[1]]$ofile)) + } + } +} + # Run as a script if arguments provided if (!is.null(opt$ARGS)){ # load other R files - source("smg.R") - source("ratioMetric.R") + script.path <- thisfile() + script.dir <- dirname(script.path) + source(file.path(script.dir, "smg.R")) + source(file.path(script.dir, "ratioMetric.R")) ############################# # define the model params From 3cd89760bd068f301edad08893c436eac63cc044 Mon Sep 17 00:00:00 2001 From: ctokheim Date: Sat, 14 May 2016 16:41:47 -0400 Subject: [PATCH 2/6] Got rid of unneccesary import --- R/smg.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/smg.R b/R/smg.R index 063499b..35c3793 100644 --- a/R/smg.R +++ b/R/smg.R @@ -1,5 +1,4 @@ suppressPackageStartupMessages(library(VGAM)) -suppressPackageStartupMessages(library(reshape2)) ################################## # Power calculatations From b93764cb0676959b00d39277c21a996ad6d267ca Mon Sep 17 00:00:00 2001 From: ctokheim Date: Sat, 14 May 2016 16:58:54 -0400 Subject: [PATCH 3/6] Added parameter for fraction of driver mutations which fit the ratio-metric feature of interest --- R/cancerSeqStudy.R | 41 ++++++++++++++++++++++++----------------- R/ratioMetric.R | 28 ++++++++++++++-------------- 2 files changed, 38 insertions(+), 31 deletions(-) diff --git a/R/cancerSeqStudy.R b/R/cancerSeqStudy.R index ce5e4e4..98d4f23 100644 --- a/R/cancerSeqStudy.R +++ b/R/cancerSeqStudy.R @@ -6,6 +6,7 @@ if ("getopt" %in% rownames(installed.packages())){ 'mcores', 'c', 1, 'integer', 'output', 'o', 1, 'character', 'ratioMetric', 'r', 2, 'double', + 'driverFrac', 'd', 2, 'double', 'help', 'h', 0, 'logical' ), byrow=TRUE, ncol=4) opt = getopt(spec) @@ -43,7 +44,6 @@ rateCvToAlphaBeta <- function(rate, cv) { return(list(alpha=my.alpha, beta=my.beta)) } - ############################# # Analyze power and false positives # when using a beta-binomial model @@ -53,7 +53,7 @@ smgBbdFullAnalysis <- function(mu, cv, Leff, signif.level, effect.size, # find the power and numer of samples needed for a desired power powerResult <- smgBbdRequiredSampleSize(desired.power, mu, cv, samp.sizes, - effect.size, signif.level, Leff) + effect.size, signif.level, Leff) bbd.samp.size.min <- powerResult$samp.size.min bbd.samp.size.max <- powerResult$samp.size.max power.result.bbd <- powerResult$power @@ -82,11 +82,11 @@ smgBbdFullAnalysis <- function(mu, cv, Leff, signif.level, effect.size, ratiometricBbdFullAnalysis <- function(p, cv, mu, L, signif.level, effect.size, - desired.power, samp.sizes){ + desired.power, samp.sizes, Df){ # find the power and numer of samples needed for a desired power powerResult <- ratiometricBbdRequiredSampleSize(p, cv, desired.power, samp.sizes, mu, - effect.size, signif.level, L) + effect.size, Df, signif.level, L) bbd.samp.size.min <- powerResult$samp.size.min bbd.samp.size.max <- powerResult$samp.size.max power.result.bbd <- powerResult$power @@ -108,7 +108,8 @@ ratiometricBbdFullAnalysis <- function(p, cv, mu, L, signif.level, effect.size, tmp.df['signif.level'] <- signif.level tmp.df['effect.size'] <- effect.size tmp.df['mutation.rate'] <- mu - tmp.df["Ratio-metric.P"] <- p + tmp.df["Ratio-metric P"] <- p + tmp.df["Driver fraction"] <- Df tmp.df["FP"] <- fp.result return(tmp.df) @@ -142,10 +143,10 @@ smgBinomFullAnalysis <- function(mu, Leff, signif.level, effect.size, } ratiometricBinomFullAnalysis <- function(p, mu, L, signif.level, effect.size, - desired.power, samp.sizes){ + desired.power, samp.sizes, Df){ # calculate power power.result.binom <- ratiometric.binom.power(p, samp.sizes, mu, L, - signif.level=signif.level, + Df=Df, signif.level=signif.level, r=effect.size) binom.samp.size.min <- samp.sizes[min(which(power.result.binom>=desired.power))] binom.samp.size.max <- samp.sizes[max(which(power.result.binom=desired.power))] ratiometric.samp.size.max <- possible.samp.sizes[max(which(power.result.ratio=desired.power))] @@ -266,12 +266,12 @@ ratiometricBbdRequiredSampleSize <- function(p, cv, desired.power, possible.samp #' @param Leff effective gene length of CDS in bases for an average gene #' @return List containing the smallest effect size with sufficient power ratiometricBinomPoweredEffectSize <- function(possible.effect.sizes, desired.power, p, mu, - samp.size, signif.level=5e-6, L=1500) { + samp.size, Df=1.0, signif.level=5e-6, L=1500) { # calculate the power for each effect size pow.vec <- c() for(effect.size in possible.effect.sizes){ pow <- ratiometric.binom.power(p, samp.size, mu, L, - signif.level=signif.level, + Df=Df, signif.level=signif.level, r=effect.size) pow.vec <- c(pow.vec, pow) } @@ -302,7 +302,7 @@ ratiometricBinomPoweredEffectSize <- function(possible.effect.sizes, desired.pow #' @param Leff effective gene length of CDS in bases for an average gene #' @return List containing the smallest effect size with sufficient power ratiometricBbdPoweredEffectSize <- function(possible.effect.sizes, desired.power, p, cv, mu, - samp.size, signif.level=5e-6, L=1500) { + samp.size, Df=1.0, signif.level=5e-6, L=1500) { # figure out alpha/beta for beta-binomial params <- rateCvToAlphaBeta(p, cv) @@ -311,7 +311,7 @@ ratiometricBbdPoweredEffectSize <- function(possible.effect.sizes, desired.power for(effect.size in possible.effect.sizes){ pow <- ratiometric.bbd.power(params$alpha, params$beta, samp.size, mu, L, - signif.level=signif.level, + Df=Df, signif.level=signif.level, r=effect.size) pow.vec <- c(pow.vec, pow) } From 9b88b39d7f6c3a8f12c3c71cbf2e137b26905c7d Mon Sep 17 00:00:00 2001 From: ctokheim Date: Mon, 16 May 2016 14:47:22 -0400 Subject: [PATCH 4/6] Fix in parameter passing --- R/cancerSeqStudy.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/cancerSeqStudy.R b/R/cancerSeqStudy.R index 98d4f23..63ef33b 100644 --- a/R/cancerSeqStudy.R +++ b/R/cancerSeqStudy.R @@ -145,7 +145,7 @@ smgBinomFullAnalysis <- function(mu, Leff, signif.level, effect.size, ratiometricBinomFullAnalysis <- function(p, mu, L, signif.level, effect.size, desired.power, samp.sizes, Df){ # calculate power - power.result.binom <- ratiometric.binom.power(p, samp.sizes, mu, L, + power.result.binom <- ratiometric.binom.power(p, samp.sizes, mu, L=L, Df=Df, signif.level=signif.level, r=effect.size) binom.samp.size.min <- samp.sizes[min(which(power.result.binom>=desired.power))] @@ -317,7 +317,7 @@ if (!is.null(opt$ARGS)){ # setting up the sample sizes to check N <- 25000 - by.step <- 25 + by.step <- 10 samp.sizes <- seq(by.step, N, by=by.step) # grid of sample sizes to check ################################## From 1be5ae73c4026bfa12a0f68a91ecf761f3e29bda Mon Sep 17 00:00:00 2001 From: ctokheim Date: Tue, 17 May 2016 09:55:15 -0400 Subject: [PATCH 5/6] Minor change --- R/cancerSeqStudy.R | 26 +++++++++++++------------- R/ratioMetric.R | 36 +++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/R/cancerSeqStudy.R b/R/cancerSeqStudy.R index 63ef33b..d154550 100644 --- a/R/cancerSeqStudy.R +++ b/R/cancerSeqStudy.R @@ -81,12 +81,12 @@ smgBbdFullAnalysis <- function(mu, cv, Leff, signif.level, effect.size, } -ratiometricBbdFullAnalysis <- function(p, cv, mu, L, signif.level, effect.size, +ratiometricBbdFullAnalysis <- function(p, cv, mu, Leff, signif.level, effect.size, desired.power, samp.sizes, Df){ # find the power and numer of samples needed for a desired power powerResult <- ratiometricBbdRequiredSampleSize(p, cv, desired.power, samp.sizes, mu, - effect.size, Df, signif.level, L) + effect.size, Df, signif.level, Leff) bbd.samp.size.min <- powerResult$samp.size.min bbd.samp.size.max <- powerResult$samp.size.max power.result.bbd <- powerResult$power @@ -97,7 +97,7 @@ ratiometricBbdFullAnalysis <- function(p, cv, mu, L, signif.level, effect.size, # find expected number of false positives fp.result <- ratiometric.binom.false.pos(params$alpha, params$beta, samp.sizes, - mu, L, signif.level=signif.level) + mu, Leff, signif.level=signif.level) # save binomial data tmp.df <- data.frame(sample.size=samp.sizes) @@ -142,10 +142,10 @@ smgBinomFullAnalysis <- function(mu, Leff, signif.level, effect.size, return(tmp.df) } -ratiometricBinomFullAnalysis <- function(p, mu, L, signif.level, effect.size, +ratiometricBinomFullAnalysis <- function(p, mu, Leff, signif.level, effect.size, desired.power, samp.sizes, Df){ # calculate power - power.result.binom <- ratiometric.binom.power(p, samp.sizes, mu, L=L, + power.result.binom <- ratiometric.binom.power(p, samp.sizes, mu, Leff=Leff, Df=Df, signif.level=signif.level, r=effect.size) binom.samp.size.min <- samp.sizes[min(which(power.result.binom>=desired.power))] @@ -187,7 +187,7 @@ runSmgAnalysisList <- function(x, samp.sizes, } runRatiometricAnalysisList <- function(x, samp.sizes, - Df=1.0, desired.power=.9, L=1500, + Df=1.0, desired.power=.9, Leff=1500*3/4, possible.cvs=c()){ # unpack the parameters myp <- x[1] @@ -197,7 +197,7 @@ runRatiometricAnalysisList <- function(x, samp.sizes, # run analysis result.df <- runRatiometricAnalysis(myp, mymu, myeffect.size, myalpha.level, - samp.sizes, desired.power, L, Df, possible.cvs) + samp.sizes, desired.power, Leff, Df, possible.cvs) return(result.df) } @@ -206,11 +206,11 @@ runRatiometricAnalysisList <- function(x, samp.sizes, #' rate, effect.size, and significance level. The purpose of this function is parallelized #' code running over a list of parameters. If you are not parallelizing, then use the #' runAnalysis function. -runAnalysisList <- function(x, analysisType="smg", Leff=1500*3/4, L=1500, Df=1.0, ...){ +runAnalysisList <- function(x, analysisType="smg", Leff=1500*3/4, Df=1.0, ...){ if (analysisType=="smg"){ result <- runSmgAnalysisList(x, Leff=Leff, ...) } else{ - result <- runRatiometricAnalysisList(x, L=L, Df=Df, ...) + result <- runRatiometricAnalysisList(x, Leff=Leff, Df=Df, ...) } return(result) } @@ -238,18 +238,18 @@ runSmgAnalysis <- function(mu, effect.size, signif.level, #' Runs the entire power and false positive analysis pipeline. runRatiometricAnalysis <- function(p, mu, effect.size, signif.level, samp.sizes, desired.power=.9, - L=1500, Df=1.0, possible.cvs=c()){ + Leff=1500*3/4, Df=1.0, possible.cvs=c()){ # run beta-binomial model result.df <- data.frame() for (mycv in possible.cvs){ # calculate false positives and power - tmp.df <- ratiometricBbdFullAnalysis(p, mycv, mu, L, signif.level, effect.size, + tmp.df <- ratiometricBbdFullAnalysis(p, mycv, mu, Leff, signif.level, effect.size, desired.power, samp.sizes, Df) result.df <- rbind(result.df, tmp.df) } # save binomial data - tmp.df <- ratiometricBinomFullAnalysis(p, mu, L, signif.level, + tmp.df <- ratiometricBinomFullAnalysis(p, mu, Leff, signif.level, effect.size, desired.power, samp.sizes, Df) result.df <- rbind(result.df, tmp.df) @@ -346,7 +346,7 @@ if (!is.null(opt$ARGS)){ result.list <- mclapply(param.list, runAnalysisList, mc.cores=opt$mcores, analysisType=cmdType, samp.sizes=samp.sizes, desired.power=desired.power, Df=driver.frac, - Leff=Leff, L=L, possible.cvs=possible.cvs) + Leff=Leff, possible.cvs=possible.cvs) result.df <- do.call("rbind", result.list) # adjust mutation rates back to the average diff --git a/R/ratioMetric.R b/R/ratioMetric.R index 2c05964..7c873ed 100644 --- a/R/ratioMetric.R +++ b/R/ratioMetric.R @@ -10,14 +10,16 @@ suppressPackageStartupMessages(library(VGAM)) #' @param p background proportion of total mutations falling into specific category #' @param N vector of sample sizes #' @param mu per base rate of mutation +#' @param Df fraction of driver mutations that are the specific one of interest +#' @param Leff gene length in bases #' @param r effect size for power analysis #' @param signif.level alpha level for power analysis #' @return vector containing power for each sample size ratiometric.binom.power <- function(p, N, mu, - Df=1.0, L=1500, r=.02, + Df=1.0, Leff=1500*3/4, r=.02, signif.level=5e-6){ # figure out the target mutation rate for effect size is - muEffect <- 1 - ((1-mu)^(L) - r)^(1/L) + muEffect <- 1 - ((1-mu)^(Leff) - r)^(1/Leff) # Calculate the discrepancy between the background and # target effect size muDiff <- muEffect - mu @@ -32,7 +34,7 @@ ratiometric.binom.power <- function(p, N, mu, # it is expected to occur at least 90% of the time j <- 1 while(j){ - prob <- pbinom(j-1, L*i, muEffect) + prob <- pbinom(j-1, Leff*i, muEffect) if(prob >= .1){ mutEff <- j break @@ -68,19 +70,19 @@ ratiometric.binom.power <- function(p, N, mu, #' @param my.beta beta parameter for beta binomial #' @param N vector of sample sizes #' @param mu per base rate of mutation -#' @param L length of gene in bases +#' @param Leff length of gene in bases #' @param r effect size for power analysis #' @param signif.level alpha level for power analysis #' @return vector containing power for each sample size ratiometric.bbd.power <- function(my.alpha, my.beta, N, mu, Df=1.0, - L=1500, r=.02, + Leff=1500*3/4, r=.02, signif.level=5e-6){ # figure out what the ratio-metric probability is from # the alpha and beta parameters p <- my.alpha / (my.alpha + my.beta) # figure out the target mutation rate for effect size is - muEffect <- 1 - ((1-mu)^(L) - r)^(1/L) + muEffect <- 1 - ((1-mu)^(Leff) - r)^(1/Leff) # Calculate the discrepancy between the background and # target effect size muDiff <- muEffect - mu @@ -95,7 +97,7 @@ ratiometric.bbd.power <- function(my.alpha, my.beta, # it is expected to occur at least 90% of the time j <- 1 while(j){ - prob <- pbinom(j-1, L*i, muEffect) + prob <- pbinom(j-1, Leff*i, muEffect) if(prob >= .1){ mutEff <- j break @@ -136,7 +138,7 @@ ratiometric.bbd.power <- function(my.alpha, my.beta, #' @param num.genes number of genes that are tested #' @param signif.level alpha level for power analysis ratiometric.binom.false.pos <- function(my.alpha, my.beta, - N, mu, L=1500, + N, mu, Leff=1500*3/4, num.genes=18500, signif.level=5e-6){ # calculate the ratio-metric fraction from alpha and beta @@ -157,7 +159,7 @@ ratiometric.binom.false.pos <- function(my.alpha, my.beta, # } # j <- j+1 #} - mutEff <- ceiling(L*i*mu) + mutEff <- ceiling(Leff*i*mu) # step one, find critical threshold j <- 1 @@ -196,9 +198,9 @@ ratiometric.binom.false.pos <- function(my.alpha, my.beta, #' @param L gene length of CDS in bases for an average gene #' @return List containing the smallest effect size with sufficient power ratiometricBinomRequiredSampleSize <- function(p, desired.power, possible.samp.sizes, mu, - effect.size, Df=1.0, signif.lvl=5e-6, L=1500){ + effect.size, Df=1.0, signif.lvl=5e-6, Leff=1500*3/4){ # calculate power - power.result.ratio <- ratiometric.binom.power(p, possible.samp.sizes, mu, L, + power.result.ratio <- ratiometric.binom.power(p, possible.samp.sizes, mu, Leff, Df=Df, signif.level=signif.lvl, r=effect.size) ratiometric.samp.size.min <- possible.samp.sizes[min(which(power.result.ratio>=desired.power))] @@ -226,7 +228,7 @@ ratiometricBinomRequiredSampleSize <- function(p, desired.power, possible.samp.s #' @param L gene length of CDS in bases for an average gene #' @return List containing the smallest effect size with sufficient power ratiometricBbdRequiredSampleSize <- function(p, cv, desired.power, possible.samp.sizes, mu, - effect.size, Df=1.0, signif.lvl=5e-6, L=1500){ + effect.size, Df=1.0, signif.lvl=5e-6, Leff=1500*3/4){ # get alpha and beta parameterization # for beta-binomial params <- rateCvToAlphaBeta(p, cv) @@ -234,7 +236,7 @@ ratiometricBbdRequiredSampleSize <- function(p, cv, desired.power, possible.samp # calculate power power.result.ratio <- ratiometric.bbd.power(params$alpha, params$beta, possible.samp.sizes, - mu, L, Df=Df, + mu, Leff, Df=Df, signif.level=signif.lvl, r=effect.size) ratiometric.samp.size.min <- possible.samp.sizes[min(which(power.result.ratio>=desired.power))] @@ -266,11 +268,11 @@ ratiometricBbdRequiredSampleSize <- function(p, cv, desired.power, possible.samp #' @param Leff effective gene length of CDS in bases for an average gene #' @return List containing the smallest effect size with sufficient power ratiometricBinomPoweredEffectSize <- function(possible.effect.sizes, desired.power, p, mu, - samp.size, Df=1.0, signif.level=5e-6, L=1500) { + samp.size, Df=1.0, signif.level=5e-6, Leff=1500*3/4) { # calculate the power for each effect size pow.vec <- c() for(effect.size in possible.effect.sizes){ - pow <- ratiometric.binom.power(p, samp.size, mu, L, + pow <- ratiometric.binom.power(p, samp.size, mu, Leff, Df=Df, signif.level=signif.level, r=effect.size) pow.vec <- c(pow.vec, pow) @@ -302,7 +304,7 @@ ratiometricBinomPoweredEffectSize <- function(possible.effect.sizes, desired.pow #' @param Leff effective gene length of CDS in bases for an average gene #' @return List containing the smallest effect size with sufficient power ratiometricBbdPoweredEffectSize <- function(possible.effect.sizes, desired.power, p, cv, mu, - samp.size, Df=1.0, signif.level=5e-6, L=1500) { + samp.size, Df=1.0, signif.level=5e-6, Leff=1500*3/4) { # figure out alpha/beta for beta-binomial params <- rateCvToAlphaBeta(p, cv) @@ -310,7 +312,7 @@ ratiometricBbdPoweredEffectSize <- function(possible.effect.sizes, desired.power pow.vec <- c() for(effect.size in possible.effect.sizes){ pow <- ratiometric.bbd.power(params$alpha, params$beta, - samp.size, mu, L, + samp.size, mu, Leff, Df=Df, signif.level=signif.level, r=effect.size) pow.vec <- c(pow.vec, pow) From 1fc9f9d996d21b4af4e303151a47d6c10c7b8249 Mon Sep 17 00:00:00 2001 From: ctokheim Date: Tue, 17 May 2016 09:57:49 -0400 Subject: [PATCH 6/6] Updated README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5ae8797..beb56e9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # cancerSeqStudy -Identifying genes with more mutations then expected has been central methodology for identifying putative cancer driver genes in exome sequencing studies of cancer samples. Identifying significantly mutated genes (SMG) fundamentally relies on estimating a background mutation rate. Mutation rate varies over more than 2 orders of magnitude providing a substantial statistical estimation challenge. Analysis not accounting for the uncertainty in mutation rate yields overly optimistic assessments. In this package, we examine statistical power (either with known or uncertain mutation rate) and false positives induced by unaccounted variation in mutation rate. +Identifying genes with more mutations then expected has been central methodology for identifying putative cancer driver genes in exome sequencing studies of cancer samples. Identifying significantly mutated genes (SMG) fundamentally relies on estimating a background mutation rate. Mutation rate varies over more than 2 orders of magnitude providing a substantial statistical estimation challenge. However, recent methods have taken an alternative approach known as "ratio-metric" method. Ratio-metric methods examine specific compositions of mutations normalized by the total number of mutations occurring in the gene. Regardless of methodology, analysis not accounting for the uncertainty in mutation parameters yields overly optimistic assessments. In this package, we examine statistical power (either with known or uncertain mutation rate) and false positives induced by unaccounted variation in mutation rate. ## Documentation