diff --git a/scripts/snakemake/README.md b/scripts/snakemake/README.md index f69d9f8..ffe4553 100644 --- a/scripts/snakemake/README.md +++ b/scripts/snakemake/README.md @@ -7,13 +7,13 @@ This workflow will run the TITAN a set of tumour-normal pairs, starting from the Gavin Ha Fred Hutchinson Cancer Research Center contact: or -Date: January 30, 2019 +Date: May 11, 2019 Website: [GavinHaLab.org](https://gavinhalab.org/) ## Requirements ### Software packages or libraries - R-3.5 - - TitanCNA (v1.15.0) + - TitanCNA (v1.15.0+) - TitanCNA imports: GenomicRanges, dplyr, data.table, doMC - [ichorCNA]() (v0.1.0) - HMMcopy @@ -118,8 +118,9 @@ Users can run the snakemake files individually. This can be helpful for testing ``` ### c. [TitanCNA.snakefile](TitanCNA.snakefile) i. Run the [TitanCNA](https://github.com/gavinha/TitanCNA) analysis and generates solutions for different ploidy initializations and each clonal cluster. - ii. Merge results with ichorCNA output generate by [ichorCNA.snakefile](ichorCNA.snakefile) and post-processes copy number results. + ii. Merge results with ichorCNA output generate by [ichorCNA.snakefile](ichorCNA.snakefile) and post-processes copy number results. In particular, it combines chrX results from ichorCNA for male samples. iii. Select optimal solution for each samples and copies these to a new folder. The parameters are compiled in a text file. + iv. Creates a new directory containing symbolic links to result files for optimal solutions of each sample. # Configuration and settings All settings for the workflow are contained in [config/config.yaml](config/config.yaml). The settings are organized by paths to scripts and reference files and then by each step in the workflow. @@ -137,6 +138,7 @@ readCounterScript: /path/to/readCounter ichorCNA_rscript: /path/to/ichorCNA.R pyCountScript: code/countPysam.py TitanCNA_rscript: ../R_scripts/titanCNA.R +TitanCNA_combineTitanIchorCNA: code/combineTITAN-ichor.R TitanCNA_selectSolutionRscript: ../R_scripts/selectSolution.R ``` See the [ichorCNA](https://github.com/broadinstitute/ichorCNA/blob/master/scripts/runIchorCNA.R) repo for the ichorCNA R script. diff --git a/scripts/snakemake/TitanCNA.snakefile b/scripts/snakemake/TitanCNA.snakefile index f711c00..9dd1474 100644 --- a/scripts/snakemake/TitanCNA.snakefile +++ b/scripts/snakemake/TitanCNA.snakefile @@ -12,15 +12,10 @@ PLOIDY = {2:[2], 3:[2,3], 4:[2,3,4]} rule all: input: expand("results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.txt", tumor=config["pairings"], clustNum=CLUST[config["TitanCNA_maxNumClonalClusters"]], ploidy=PLOIDY[config["TitanCNA_maxPloidy"]]), - #expand("results/titan/hmm/titanCNA_ploidy{ploidy}/", ploidy=PLOIDY[config["TitanCNA_maxPloidy"]]), - "results/titan/hmm/optimalClusterSolution.txt" - -#rule makeOutDir: -# output: -# "results/titan/hmm/titanCNA_ploidy{ploidy}/" -# params: -# shell: -# "mkdir -p {output}" + expand("results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.ichor.seg.txt", tumor=config["pairings"], clustNum=CLUST[config["TitanCNA_maxNumClonalClusters"]], ploidy=PLOIDY[config["TitanCNA_maxPloidy"]]), + expand("results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.ichor.cna.txt", tumor=config["pairings"], clustNum=CLUST[config["TitanCNA_maxNumClonalClusters"]], ploidy=PLOIDY[config["TitanCNA_maxPloidy"]]), + "results/titan/hmm/optimalClusterSolution.txt", + "results/titan/hmm/optimalClusterSolution/" rule runTitanCNA: input: @@ -56,9 +51,27 @@ rule runTitanCNA: shell: "Rscript {params.titanRscript} --hetFile {input.alleleCounts} --cnFile {input.corrDepth} --outFile {output.titan} --outSeg {output.segTxt} --outParam {output.param} --outIGV {output.seg} --outPlotDir {params.outRoot} --libdir {params.libdir} --id {wildcards.tumor} --numClusters {wildcards.clustNum} --numCores {params.numCores} --normal_0 {params.normal} --ploidy_0 {wildcards.ploidy} --genomeStyle {params.genomeStyle} --genomeBuild {params.genomeBuild} --cytobandFile {params.cytobandFile} --chrs \"{params.chrs}\" --gender {params.sex} --estimateNormal {params.estimateNormal} --estimatePloidy {params.estimatePloidy} --estimateClonality {params.estimateClonality} --centromere {params.centromere} --alphaK {params.alphaK} --txnExpLen {params.txnExpLen} --plotYlim \"{params.plotYlim}\" > {log} 2> {log}" -#--alleleModel {params.alleleModel} --alphaR {params.alphaR} +rule combineTitanAndIchorCNA: + input: + titanSeg="results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.segs.txt", + titanBin="results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.txt", + titanParam="results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.params.txt", + ichorSeg="results/ichorCNA/{tumor}/{tumor}.seg.txt", + ichorBin="results/ichorCNA/{tumor}/{tumor}.cna.seg", + ichorParam="results/ichorCNA/{tumor}/{tumor}.params.txt" + output: + segFile="results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.ichor.seg.txt", + binFile="results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.ichor.cna.txt", + params: + combineScript=config["TitanCNA_combineTitanIchorCNA"], + libdir=config["TitanCNA_libdir"], + centromere=config["centromere"], + sex=config["sex"] + log: + "logs/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.combineTitanIchorCNA.log" + shell: + "Rscript {params.combineScript} --libdir {params.libdir} --titanSeg {input.titanSeg} --titanBin {input.titanBin} --titanParam {input.titanParam} --ichorSeg {input.ichorSeg} --ichorBin {input.ichorBin} --ichorParam {input.ichorParam} --sex {params.sex} --outSegFile {output.segFile} --outBinFile {output.binFile} --centromere {params.centromere} > {log} 2> {log}" - rule selectSolution: input: #ploidyDirs=expand("results/titan/hmm/titanCNA_ploidy{ploidy}/", ploidy=PLOIDY[config["TitanCNA_maxPloidy"]]), @@ -85,4 +98,23 @@ rule selectSolution: fi Rscript {params.solutionRscript} --ploidyRun2 $ploidyRun2 --ploidyRun3 $ploidyRun3 --ploidyRun4 $ploidyRun4 --threshold {params.threshold} --outFile {output} > {log} 2> {log} """ - + +rule copyOptSolution: + input: + "results/titan/hmm/optimalClusterSolution.txt" + output: + directory("results/titan/hmm/optimalClusterSolution/") + params: + log: + "logs/titan/hmm/optSolution/copyOptSolution.log" + shell: + """ + curDir=`pwd` + for i in `cut -f11 {input} | grep -v "path"`; + do + echo -e "Creating sym links for $curDir/${{i}} to {output}" + ln -s ${{curDir}}/${{i}}* {output} + done + """ + + \ No newline at end of file diff --git a/scripts/snakemake/code/combineTITAN-ichor.R b/scripts/snakemake/code/combineTITAN-ichor.R new file mode 100644 index 0000000..a3d1dbb --- /dev/null +++ b/scripts/snakemake/code/combineTITAN-ichor.R @@ -0,0 +1,220 @@ +#' combineTITAN-ichor.R +#' author: Gavin Ha +#' institution: Fred Hutchinson Cancer Research Center +#' contact: +#' date: July 23, 2018 + +#' requires R-3.3+ +#' @import data.table +#' @import GenomicRanges +#' @import stringr +#' @import optparse + +library(optparse) + +option_list <- list( + make_option(c("--titanSeg"), type="character", help="TitanCNA segs.txt file. Required."), + make_option(c("--titanBin"), type="character", help="TitanCNA titan.txt file. Required."), + make_option(c("--titanParams"), type="character", help="TitanCNA params.txt file. Required."), + make_option(c("--ichorSeg"), type="character", help="ichorCNA segs.txt file. Required."), + make_option(c("--ichorBin"), type="character", help="ichorCNA cna.seg file. Required."), + make_option(c("--ichorParams"), type="character", help="ichorCNA params.txt file. Required."), + make_option(c("--ichorNormPanel"), type="character", help="Panel of normals; bin-level black list."), + make_option(c("--sex"), type="character", default="female", help="female or male. Default [%default]."), + make_option(c("--libdir"), type="character", help="TitanCNA directory path to source R files if custom changes made."), + make_option(c("--outSegFile"), type="character", help="New combined segment file. Required"), + make_option(c("--outBinFile"), type="character", help="New combined bin-level file. Required"), + make_option(c("--centromere"), type="character", default=NULL, help="Centromere table.") + ) + +parseobj <- OptionParser(option_list=option_list, usage = "usage: Rscript %prog [options]") +opt <- parse_args(parseobj) +print(opt) + +titanSeg <- opt$titanSeg +titanBin <- opt$titanBin +titanParams <- opt$titanParams +ichorSeg <- opt$ichorSeg +ichorBin <- opt$ichorBin +ichorParams <- opt$ichorParams +ichorNormPanel <- opt$ichorNormPanel +gender <- opt$sex +outSegFile <- opt$outSegFile +outBinFile <- opt$outBinFile +centromere <- opt$centromere +libdir <- opt$libdir +outImageFile <- gsub(".seg.txt", ".RData", outSegFile) + +library(TitanCNA) +library(stringr) +library(data.table) +library(GenomicRanges) + +if (!is.null(libdir) && libdir != "None"){ + source(paste0(libdir, "/R/utils.R")) +} + +options(stringsAsFactors=F, width=150, scipen=999) +save.image(outImageFile) +## copy number state mappings ## +ichorCNmap <- list("0"="HOMD", "1"="DLOH", "2"="NEUT", "3"="GAIN", "4"="AMP", "5"="AMP") +maxichorcn <- 5 + +## load segments +titan <- fread(titanSeg) +if (length(titan[["Cellular_Frequency"]] == 0)){ + setnames(titan, "Cellular_Frequency", "Cellular_Prevalence") +} +ichor <- fread(ichorSeg) +setnames(ichor, c("ID", "chrom", "start", "end", "num.mark", "seg.median.logR", "copy.number", "call"), + c("Sample", "Chromosome", "Start_Position.bp.", "End_Position.bp.", + "Length.snp.", "Median_logR", "Copy_Number", "TITAN_call")) + +## load data points ## +titan.cn <- fread(titanBin) +titan.cn <- cbind(Sample=titan[1,Sample], titan.cn) +#titan.cn[, chr := as.character(Chr)] +id <- titan[1, Sample] +ichor.cn <- fread(ichorBin) +ichor.cn <- cbind(Sample = id, ichor.cn) +#ichor.cn[, CopyNumber := state - 1] +ichor.cn[, Position := start] +setnames(ichor.cn, c("chr", "start", paste0(id,".copy.number"), paste0(id,".event"), paste0(id,".logR"), "end"), + c("Chr", "Start", "CopyNumber", "TITANcall", "LogRatio", "End")) + + +## get chromosome style +titan$Chromosome <- as.character(titan$Chromosome) +titan.cn$Chr <- as.character(titan.cn$Chr) +genomeStyle <- seqlevelsStyle(titan.cn$Chr)[1] +chrs <- c(1:22, "X") +seqlevelsStyle(chrs) <- genomeStyle + +## load parameters ## +params <- read.delim(titanParams, header=F, as.is=T) +purity <- 1 - as.numeric(params[1,2]) +ploidyT <- as.numeric(params[2,2]) +ploidy <- purity * ploidyT + (1-purity) * 2 +params.ichor <- read.delim(ichorParams, header=T, as.is=T) +homd.var <- as.numeric(strsplit(params[params[,1]=="logRatio Gaussian variance:",2], " ")[[1]][1]) +homd.sd <- sqrt(homd.var) + +## get gender +if (is.null(gender) || gender == "None"){ + gender <- params.ichor[3, 2] +} + +## get bin overlap with SNPs - include ichor bins even if no SNPs overlap +titan.gr <- titan.cn[, .(Chr, Position)] +titan.gr[, Start := Position]; titan.gr[, End := Position] +titan.gr <- as(titan.gr, "GRanges") +ichor.gr <- as(ichor.cn, "GRanges") +hits <- findOverlaps(query = titan.gr, subject = ichor.gr) +titan.cn[queryHits(hits), Start := ichor.cn[subjectHits(hits), Start]] +titan.cn[queryHits(hits), End := ichor.cn[subjectHits(hits), End]] +titan.ichor.cn <- merge(titan.cn, ichor.cn, by=c("Sample", "Chr", "Start", "End"), all=T, suffix=c("",".ichor")) +titan.ichor.cn[is.na(LogRatio), LogRatio := LogRatio.ichor] # assign ichor log ratio to missing titan SNPs +titan.ichor.cn <- titan.ichor.cn[, -c(grep("ichor", colnames(titan.ichor.cn),value=T)), with=F] + +## combine TITAN (chr1-22) and ichorCNA (chrX) segments and bin/SNP level data ## +## if male only ## +if (gender == "male"){ + cn <- rbind(titan.ichor.cn[Chr %in% chrs[1:22]], ichor.cn[Chr == chrs[grep("X", chrs)]], fill = TRUE) + segs <- rbind(titan[Chromosome %in% chrs[1:22]], ichor[Chromosome == chrs[grep("X", chrs)]], fill = TRUE) +}else{ + cn <- titan.ichor.cn + segs <- titan +} + +## sort column order +setnames(segs, c("Start_Position.bp.", "End_Position.bp."), c("Start", "End")) +cols <- c("Sample", "Chr", "Position", "Start", "End") +setcolorder(cn, c(cols, colnames(cn)[!colnames(cn) %in% cols])) + +## get major/minor CN from segs and place in SNP/level data ## +#cn.gr <- cn[, .(Chr, Start, End)] +#cn.gr <- as(na.omit(cn.gr), "GRanges") +#segs.gr <- as(segs, "GRanges") +#hits <- findOverlaps(query = cn.gr, subject = segs.gr) +#cn[queryHits(hits), MajorCN := segs[subjectHits(hits), MajorCN]] +#cn[queryHits(hits), MinorCN := segs[subjectHits(hits), MinorCN]] +#cn[is.na(CopyNumber), CopyNumber := MajorCN + MinorCN] + +## remove LogRatio for bins that do not overlap segments (i.e. remaining rows with CopyNumber == NA +# these regions that are not part of segments are usually noisy +#cn[is.na(CopyNumber), LogRatio := NA] + +## filter outlier HOMD data points +message("Filtering bins with outlier negative log ratios...") +homdLenThres <- 10000 +homdNumSNPThres <- 40 +homdLogRThres.auto <- log2((2*(1-purity)) / (2*(1-purity) + ploidyT*purity)) - 1*homd.sd +## OUTLIERS IN CHROMOSOME X SHOULD BE AGNOISTIC OF SEX +#if (gender == "male"){ +# homdLogRThres.X <- round(log2((1*(1-purity)) / (1*(1-purity) + ploidyT*purity)), digits = 1) - 0.1 +#}else{ +# homdLogRThres.X <- homdLogRThres.auto +#} +ind.homd.cn <- cn[LogRatio < homdLogRThres.auto, which = TRUE] +message("Removing ", length(ind.homd.cn), " negative log ratio outlier bins ...") +ind.segs.remove <- segs[((End-Start < homdLenThres | Length.snp. < homdNumSNPThres) & Corrected_Call == "HOMD") | Median_logR < homdLogRThres.auto, which=T] +message("Removing ", length(ind.segs.remove), " negative log ratio outlier segments ...") +if (length(ind.segs.remove) > 0){ + segsToRemove <- segs[ind.segs.remove] + hits <- findOverlaps(query = as(as.data.frame(segsToRemove), "GRanges"), subject = as(as.data.frame(cn), "GRanges")) + ind.homd.segs <- union(subjectHits(hits), ind.homd.cn) +}else{ + ind.homd.segs <- ind.homd.cn +} + +#message("Loading panel of normals: ", ichorNormPanel) +#panel <- readRDS(ichorNormPanel) +#panel.dt <- as.data.table(as.data.frame(panel)) +#panel.colNames <- colnames(panel.dt[, 11:(ncol(panel.dt)-1)]) +#panel.dt[, variance := apply(.SD, 1, var, na.rm=TRUE), .SDcols = panel.colNames] +#panel.sd <- 2 * sd(panel.dt$variance, na.rm=T) +#panel.dt[, outlier := variance < -panel.sd | variance > panel.sd] +#ind.panel <- which(panel$Median < -2*sd(x$panel,na.rm=T)) +#ind.panel <- which(panel.dt$outlier == TRUE) +#hits <- findOverlaps(query = as(as.data.frame(cn), "GRanges"), subject = as(panel[ind.panel,], "GRanges")) +#ind.homd.panel <- queryHits(hits) +#ind.homd.all <- union(ind.homd.panel, ind.homd.segs) +# remove the elements +if (length(ind.homd.segs) > 0){ + cn <- cn[-ind.homd.segs] +} +#cn <- cn[ind.homd.segs, FILTER := "EXCLUDE"] +if (length(ind.segs.remove) > 0){ + segs <- segs[-ind.segs.remove] +} +#segs <- segs[ind.segs.remove, FILTER := "EXCLUDE"] + +## correct copy number beyond maximum CN state based on purity and logR +correctCN <- correctIntegerCN(cn, segs, purity, ploidyT, maxCNtoCorrect.autosomes = NULL, + maxCNtoCorrect.X = NULL, correctHOMD = TRUE, minPurityToCorrect = 0.05, gender = gender, chrs = chrs) +segs <- correctCN$segs +cn <- correctCN$cn +## extend segments to remove gaps +centromeres <- fread(centromere) +segs <- extendSegments(segs, removeCentromeres = TRUE, centromeres = centromeres, extendToTelomeres = FALSE, + chrs = chrs, genomeStyle = genomeStyle) + +## write segments to file ## +write.table(segs, file = outSegFile, col.names=T, row.names=F, quote=F, sep="\t") +write.table(cn, file = outBinFile, col.names=T, row.names=F, quote=F, sep="\t") +## write segments without germline SNPs +outSegNoSNPFile <- gsub(".txt", ".noSNPs.txt", outSegFile) +write.table(segs[, -c("Start.snp", "End.snp")], file = outSegNoSNPFile, col.names=T, row.names=F, quote=F, sep="\t") + +## write segments in IGV / GISTIC format ## +igv <- segs[, .(Sample, Chromosome, Start.snp, End.snp, Length.snp., logR_Copy_Number)] +igv[Chromosome %in% chrs[1:22], Corrected.logR := log2(logR_Copy_Number / 2)] +igv[Chromosome == chrs[grep("X", chrs)], Corrected.logR := log2(logR_Copy_Number / 1)] +igv[, logR_Copy_Number := NULL] +outIGVFile <- gsub("seg.txt", "segIGV.txt", outSegFile) +write.table(igv, file = outIGVFile, col.names=T, row.names=F, quote=F, sep="\t") + +save.image(outImageFile) + + + diff --git a/scripts/snakemake/config/config.yaml b/scripts/snakemake/config/config.yaml index 8d21c4e..c7aacdf 100644 --- a/scripts/snakemake/config/config.yaml +++ b/scripts/snakemake/config/config.yaml @@ -7,6 +7,7 @@ ichorCNA_rscript: /path/to/ichorCNA.R ichorCNA_libdir: /path/to/ichorCNA_code pyCountScript: code/countPysam.py TitanCNA_rscript: ../R_scripts/titanCNA.R +TitanCNA_combineTitanIchorCNA: code/combineTITAN-ichor.R TitanCNA_selectSolutionRscript: ../R_scripts/selectSolution.R TitanCNA_libdir: ../../R/