combine titan and ichorCNA results in snakemake

An additional rule is added to TitanCNA.snakefile to combine the results of TITAN with ichorCNA. In particular, chrX results from ichorCNA is include for male samples because TitanCNA does not include chrX for males. Addresses issues #53 To some extent is related to #72
gavinha · May 11, 2019 · de43255 · de43255
1 parent 9470fdc
commit de43255
Show file tree

Hide file tree

Showing 4 changed files with 270 additions and 15 deletions.
diff --git a/scripts/snakemake/README.md b/scripts/snakemake/README.md
@@ -7,13 +7,13 @@ This workflow will run the TITAN a set of tumour-normal pairs, starting from the
 Gavin Ha  
 Fred Hutchinson Cancer Research Center  
 contact: <[email protected]> or <[email protected]>  
-Date: January 30, 2019  
+Date: May 11, 2019  
 Website: [GavinHaLab.org](https://gavinhalab.org/)
 
 ## Requirements
 ### Software packages or libraries
  - R-3.5
-   - TitanCNA (v1.15.0)
+   - TitanCNA (v1.15.0+)
    		- TitanCNA imports: GenomicRanges, dplyr, data.table, doMC
    - [ichorCNA](<https://github.com/broadinstitute/ichorCNA>) (v0.1.0) 
    - HMMcopy
@@ -118,8 +118,9 @@ Users can run the snakemake files individually. This can be helpful for testing
   ``` 
   ### c. [TitanCNA.snakefile](TitanCNA.snakefile)
   i.   Run the [TitanCNA](https://github.com/gavinha/TitanCNA) analysis and generates solutions for different ploidy initializations and each clonal cluster.  
-  ii.  Merge results with ichorCNA output generate by [ichorCNA.snakefile](ichorCNA.snakefile) and post-processes copy number results.  
+  ii.  Merge results with ichorCNA output generate by [ichorCNA.snakefile](ichorCNA.snakefile) and post-processes copy number results. In particular, it combines chrX results from ichorCNA for male samples.
   iii. Select optimal solution for each samples and copies these to a new folder. The parameters are compiled in a text file.  
+  iv. Creates a new directory containing symbolic links to result files for optimal solutions of each sample.
 
 # Configuration and settings
 All settings for the workflow are contained in [config/config.yaml](config/config.yaml). The settings are organized by paths to scripts and reference files and then by each step in the workflow.
@@ -137,6 +138,7 @@ readCounterScript:  /path/to/readCounter
 ichorCNA_rscript:  /path/to/ichorCNA.R
 pyCountScript:  code/countPysam.py
 TitanCNA_rscript: ../R_scripts/titanCNA.R
+TitanCNA_combineTitanIchorCNA:  code/combineTITAN-ichor.R
 TitanCNA_selectSolutionRscript: ../R_scripts/selectSolution.R
 ```
 See the [ichorCNA](https://github.com/broadinstitute/ichorCNA/blob/master/scripts/runIchorCNA.R) repo for the ichorCNA R script.

diff --git a/scripts/snakemake/TitanCNA.snakefile b/scripts/snakemake/TitanCNA.snakefile
@@ -12,15 +12,10 @@ PLOIDY = {2:[2], 3:[2,3], 4:[2,3,4]}
 rule all:
 	input: 
 		expand("results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.txt", tumor=config["pairings"], clustNum=CLUST[config["TitanCNA_maxNumClonalClusters"]], ploidy=PLOIDY[config["TitanCNA_maxPloidy"]]),
-		#expand("results/titan/hmm/titanCNA_ploidy{ploidy}/", ploidy=PLOIDY[config["TitanCNA_maxPloidy"]]),
-		"results/titan/hmm/optimalClusterSolution.txt"
-
-#rule makeOutDir:
-#	output:
-#		"results/titan/hmm/titanCNA_ploidy{ploidy}/"
-#	params:
-#	shell:
-#		"mkdir -p {output}"
+		expand("results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.ichor.seg.txt", tumor=config["pairings"], clustNum=CLUST[config["TitanCNA_maxNumClonalClusters"]], ploidy=PLOIDY[config["TitanCNA_maxPloidy"]]),
+		expand("results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.ichor.cna.txt", tumor=config["pairings"], clustNum=CLUST[config["TitanCNA_maxNumClonalClusters"]], ploidy=PLOIDY[config["TitanCNA_maxPloidy"]]),
+		"results/titan/hmm/optimalClusterSolution.txt",
+		"results/titan/hmm/optimalClusterSolution/"
 
 rule runTitanCNA:
 	input:
@@ -56,9 +51,27 @@ rule runTitanCNA:
 	shell:
 		"Rscript {params.titanRscript} --hetFile {input.alleleCounts} --cnFile {input.corrDepth} --outFile {output.titan} --outSeg {output.segTxt} --outParam {output.param} --outIGV {output.seg} --outPlotDir {params.outRoot} --libdir {params.libdir} --id {wildcards.tumor} --numClusters {wildcards.clustNum} --numCores {params.numCores} --normal_0 {params.normal} --ploidy_0 {wildcards.ploidy} --genomeStyle {params.genomeStyle} --genomeBuild {params.genomeBuild} --cytobandFile {params.cytobandFile} --chrs \"{params.chrs}\" --gender {params.sex} --estimateNormal {params.estimateNormal} --estimatePloidy {params.estimatePloidy} --estimateClonality {params.estimateClonality}  --centromere {params.centromere} --alphaK {params.alphaK} --txnExpLen {params.txnExpLen} --plotYlim \"{params.plotYlim}\" > {log} 2> {log}"
 
-#--alleleModel {params.alleleModel} --alphaR {params.alphaR}
+rule combineTitanAndIchorCNA:
+	input:
+		titanSeg="results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.segs.txt", 
+		titanBin="results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.txt",
+		titanParam="results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.params.txt",
+		ichorSeg="results/ichorCNA/{tumor}/{tumor}.seg.txt",
+		ichorBin="results/ichorCNA/{tumor}/{tumor}.cna.seg",
+		ichorParam="results/ichorCNA/{tumor}/{tumor}.params.txt"
+	output:
+		segFile="results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.ichor.seg.txt",
+		binFile="results/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.titan.ichor.cna.txt",
+	params:
+		combineScript=config["TitanCNA_combineTitanIchorCNA"],
+		libdir=config["TitanCNA_libdir"],
+		centromere=config["centromere"],
+		sex=config["sex"]
+	log:
+		"logs/titan/hmm/titanCNA_ploidy{ploidy}/{tumor}_cluster{clustNum}.combineTitanIchorCNA.log"
+	shell:
+		"Rscript {params.combineScript} --libdir {params.libdir} --titanSeg {input.titanSeg} --titanBin {input.titanBin} --titanParam {input.titanParam} --ichorSeg {input.ichorSeg} --ichorBin {input.ichorBin} --ichorParam {input.ichorParam} --sex {params.sex} --outSegFile {output.segFile} --outBinFile {output.binFile} --centromere {params.centromere} > {log} 2> {log}"	
 
-
 rule selectSolution:
 	input:
 		#ploidyDirs=expand("results/titan/hmm/titanCNA_ploidy{ploidy}/", ploidy=PLOIDY[config["TitanCNA_maxPloidy"]]),
@@ -85,4 +98,23 @@ rule selectSolution:
 		fi
 		Rscript {params.solutionRscript} --ploidyRun2 $ploidyRun2 --ploidyRun3 $ploidyRun3 --ploidyRun4 $ploidyRun4 --threshold {params.threshold} --outFile {output} > {log} 2> {log}
 		"""
-
+
+rule copyOptSolution:
+	input:
+		"results/titan/hmm/optimalClusterSolution.txt"
+	output:
+		directory("results/titan/hmm/optimalClusterSolution/")
+	params:
+	log:
+		"logs/titan/hmm/optSolution/copyOptSolution.log"
+	shell:
+		"""
+		curDir=`pwd`
+		for i in `cut -f11 {input} | grep -v "path"`;
+		do
+			echo -e "Creating sym links for $curDir/${{i}} to {output}"
+			ln -s ${{curDir}}/${{i}}* {output}
+		done		
+		"""
+
+
diff --git a/scripts/snakemake/code/combineTITAN-ichor.R b/scripts/snakemake/code/combineTITAN-ichor.R
@@ -0,0 +1,220 @@
+#' combineTITAN-ichor.R
+#' author: Gavin Ha 
+#' institution: Fred Hutchinson Cancer Research Center
+#' contact: <[email protected]>
+#' date: July 23, 2018
+
+#' requires R-3.3+
+#' @import data.table
+#' @import GenomicRanges
+#' @import stringr
+#' @import optparse
+
+library(optparse)
+
+option_list <- list(
+	make_option(c("--titanSeg"), type="character", help="TitanCNA segs.txt file. Required."),
+	make_option(c("--titanBin"), type="character", help="TitanCNA titan.txt file. Required."),
+	make_option(c("--titanParams"), type="character", help="TitanCNA params.txt file. Required."),
+  	make_option(c("--ichorSeg"), type="character", help="ichorCNA segs.txt file. Required."),
+	make_option(c("--ichorBin"), type="character", help="ichorCNA cna.seg file. Required."),
+	make_option(c("--ichorParams"), type="character", help="ichorCNA params.txt file. Required."),
+	make_option(c("--ichorNormPanel"), type="character", help="Panel of normals; bin-level black list."),
+	make_option(c("--sex"), type="character", default="female", help="female or male. Default [%default]."),
+	make_option(c("--libdir"), type="character", help="TitanCNA directory path to source R files if custom changes made."),
+  	make_option(c("--outSegFile"), type="character", help="New combined segment file. Required"),
+  	make_option(c("--outBinFile"), type="character", help="New combined bin-level file. Required"),
+  	make_option(c("--centromere"), type="character", default=NULL, help="Centromere table.")  
+  )
+
+parseobj <- OptionParser(option_list=option_list, usage = "usage: Rscript %prog [options]")
+opt <- parse_args(parseobj)
+print(opt)
+
+titanSeg <- opt$titanSeg
+titanBin <- opt$titanBin
+titanParams <- opt$titanParams
+ichorSeg <- opt$ichorSeg
+ichorBin <- opt$ichorBin
+ichorParams <- opt$ichorParams
+ichorNormPanel <- opt$ichorNormPanel
+gender <- opt$sex
+outSegFile <- opt$outSegFile
+outBinFile <- opt$outBinFile
+centromere <- opt$centromere
+libdir <- opt$libdir
+outImageFile <- gsub(".seg.txt", ".RData", outSegFile)
+
+library(TitanCNA)
+library(stringr)
+library(data.table)
+library(GenomicRanges)
+
+if (!is.null(libdir) && libdir != "None"){
+	source(paste0(libdir, "/R/utils.R"))
+}
+
+options(stringsAsFactors=F, width=150, scipen=999)
+save.image(outImageFile)
+## copy number state mappings ##
+ichorCNmap <- list("0"="HOMD", "1"="DLOH", "2"="NEUT", "3"="GAIN", "4"="AMP", "5"="AMP")
+maxichorcn <- 5
+
+## load segments 
+titan <- fread(titanSeg)
+if (length(titan[["Cellular_Frequency"]] == 0)){
+	setnames(titan, "Cellular_Frequency", "Cellular_Prevalence")
+}
+ichor <- fread(ichorSeg)
+setnames(ichor, c("ID", "chrom", "start", "end", "num.mark", "seg.median.logR", "copy.number", "call"), 
+		c("Sample", "Chromosome", "Start_Position.bp.", "End_Position.bp.", 
+		  "Length.snp.", "Median_logR", "Copy_Number", "TITAN_call"))
+
+## load data points ##
+titan.cn <- fread(titanBin)
+titan.cn <- cbind(Sample=titan[1,Sample], titan.cn)
+#titan.cn[, chr := as.character(Chr)]
+id <- titan[1, Sample]
+ichor.cn <- fread(ichorBin)
+ichor.cn <- cbind(Sample = id, ichor.cn)
+#ichor.cn[, CopyNumber := state - 1]
+ichor.cn[, Position := start]
+setnames(ichor.cn, c("chr", "start", paste0(id,".copy.number"), paste0(id,".event"), paste0(id,".logR"), "end"), 
+		c("Chr", "Start", "CopyNumber", "TITANcall", "LogRatio", "End"))
+
+
+## get chromosome style
+titan$Chromosome <- as.character(titan$Chromosome)
+titan.cn$Chr <- as.character(titan.cn$Chr)
+genomeStyle <- seqlevelsStyle(titan.cn$Chr)[1]
+chrs <- c(1:22, "X")
+seqlevelsStyle(chrs) <- genomeStyle
+
+## load parameters ##
+params <- read.delim(titanParams, header=F, as.is=T)
+purity <- 1 - as.numeric(params[1,2])
+ploidyT <- as.numeric(params[2,2])
+ploidy <- purity * ploidyT + (1-purity) * 2
+params.ichor <- read.delim(ichorParams, header=T, as.is=T)
+homd.var <- as.numeric(strsplit(params[params[,1]=="logRatio Gaussian variance:",2], " ")[[1]][1])
+homd.sd <- sqrt(homd.var)
+
+## get gender
+if (is.null(gender) || gender == "None"){
+	gender <- params.ichor[3, 2]
+}
+
+## get bin overlap with SNPs - include ichor bins even if no SNPs overlap 
+titan.gr <- titan.cn[, .(Chr, Position)]
+titan.gr[, Start := Position]; titan.gr[, End := Position]
+titan.gr <- as(titan.gr, "GRanges")
+ichor.gr <- as(ichor.cn, "GRanges")
+hits <- findOverlaps(query = titan.gr, subject = ichor.gr)
+titan.cn[queryHits(hits), Start := ichor.cn[subjectHits(hits), Start]]
+titan.cn[queryHits(hits), End := ichor.cn[subjectHits(hits), End]]
+titan.ichor.cn <- merge(titan.cn, ichor.cn, by=c("Sample", "Chr", "Start", "End"), all=T, suffix=c("",".ichor"))
+titan.ichor.cn[is.na(LogRatio), LogRatio := LogRatio.ichor] # assign ichor log ratio to missing titan SNPs
+titan.ichor.cn <- titan.ichor.cn[, -c(grep("ichor", colnames(titan.ichor.cn),value=T)), with=F] 
+
+## combine TITAN (chr1-22) and ichorCNA (chrX) segments and bin/SNP level data ##
+## if male only ##
+if (gender == "male"){
+	cn <- rbind(titan.ichor.cn[Chr %in% chrs[1:22]], ichor.cn[Chr == chrs[grep("X", chrs)]], fill = TRUE)
+	segs <- rbind(titan[Chromosome %in% chrs[1:22]], ichor[Chromosome == chrs[grep("X", chrs)]], fill = TRUE)
+}else{
+	cn <- titan.ichor.cn
+	segs <- titan
+}
+
+## sort column order
+setnames(segs, c("Start_Position.bp.", "End_Position.bp."), c("Start", "End"))
+cols <- c("Sample", "Chr", "Position", "Start", "End")
+setcolorder(cn, c(cols, colnames(cn)[!colnames(cn) %in% cols]))
+
+## get major/minor CN from segs and place in SNP/level data ##
+#cn.gr <- cn[, .(Chr, Start, End)]
+#cn.gr <- as(na.omit(cn.gr), "GRanges")
+#segs.gr <- as(segs, "GRanges")
+#hits <- findOverlaps(query = cn.gr, subject = segs.gr)
+#cn[queryHits(hits), MajorCN := segs[subjectHits(hits), MajorCN]]
+#cn[queryHits(hits), MinorCN := segs[subjectHits(hits), MinorCN]]
+#cn[is.na(CopyNumber), CopyNumber := MajorCN + MinorCN]
+
+## remove LogRatio for bins that do not overlap segments (i.e. remaining rows with CopyNumber == NA
+# these regions that are not part of segments are usually noisy
+#cn[is.na(CopyNumber), LogRatio := NA]
+
+## filter outlier HOMD data points
+message("Filtering bins with outlier negative log ratios...")
+homdLenThres <- 10000
+homdNumSNPThres <- 40
+homdLogRThres.auto <- log2((2*(1-purity)) / (2*(1-purity) + ploidyT*purity)) - 1*homd.sd
+## OUTLIERS IN CHROMOSOME X SHOULD BE AGNOISTIC OF SEX 
+#if (gender == "male"){
+#	homdLogRThres.X <- round(log2((1*(1-purity)) / (1*(1-purity) + ploidyT*purity)), digits = 1) - 0.1
+#}else{
+#	homdLogRThres.X <- homdLogRThres.auto
+#}
+ind.homd.cn <- cn[LogRatio < homdLogRThres.auto, which = TRUE]
+message("Removing ", length(ind.homd.cn), " negative log ratio outlier bins ...")
+ind.segs.remove <- segs[((End-Start < homdLenThres | Length.snp. < homdNumSNPThres) & Corrected_Call == "HOMD") | Median_logR < homdLogRThres.auto, which=T]
+message("Removing ", length(ind.segs.remove), " negative log ratio outlier segments ...")
+if (length(ind.segs.remove) > 0){
+	segsToRemove <- segs[ind.segs.remove]
+	hits <- findOverlaps(query = as(as.data.frame(segsToRemove), "GRanges"), subject = as(as.data.frame(cn), "GRanges"))
+	ind.homd.segs <- union(subjectHits(hits), ind.homd.cn)
+}else{
+	ind.homd.segs <- ind.homd.cn
+}
+
+#message("Loading panel of normals: ", ichorNormPanel)
+#panel <- readRDS(ichorNormPanel)
+#panel.dt <- as.data.table(as.data.frame(panel))
+#panel.colNames <- colnames(panel.dt[, 11:(ncol(panel.dt)-1)])
+#panel.dt[, variance := apply(.SD, 1, var, na.rm=TRUE), .SDcols = panel.colNames]
+#panel.sd <- 2 * sd(panel.dt$variance, na.rm=T)
+#panel.dt[, outlier := variance < -panel.sd | variance > panel.sd]
+#ind.panel <- which(panel$Median < -2*sd(x$panel,na.rm=T))
+#ind.panel <- which(panel.dt$outlier == TRUE)
+#hits <- findOverlaps(query = as(as.data.frame(cn), "GRanges"), subject = as(panel[ind.panel,], "GRanges"))
+#ind.homd.panel <- queryHits(hits)
+#ind.homd.all <- union(ind.homd.panel, ind.homd.segs)
+# remove the elements 
+if (length(ind.homd.segs) > 0){
+	cn <- cn[-ind.homd.segs]
+}
+#cn <- cn[ind.homd.segs, FILTER := "EXCLUDE"]
+if (length(ind.segs.remove) > 0){
+	segs <- segs[-ind.segs.remove]
+}
+#segs <- segs[ind.segs.remove, FILTER := "EXCLUDE"]
+
+## correct copy number beyond maximum CN state based on purity and logR
+correctCN <- correctIntegerCN(cn, segs, purity, ploidyT, maxCNtoCorrect.autosomes = NULL, 
+		maxCNtoCorrect.X = NULL, correctHOMD = TRUE, minPurityToCorrect = 0.05, gender = gender, chrs = chrs)
+segs <- correctCN$segs
+cn <- correctCN$cn
+## extend segments to remove gaps
+centromeres <- fread(centromere)
+segs <- extendSegments(segs, removeCentromeres = TRUE, centromeres = centromeres, extendToTelomeres = FALSE,
+	chrs = chrs, genomeStyle = genomeStyle)
+
+## write segments to file ##
+write.table(segs, file = outSegFile, col.names=T, row.names=F, quote=F, sep="\t")
+write.table(cn, file = outBinFile, col.names=T, row.names=F, quote=F, sep="\t")
+## write segments without germline SNPs
+outSegNoSNPFile <- gsub(".txt", ".noSNPs.txt", outSegFile)
+write.table(segs[, -c("Start.snp", "End.snp")], file = outSegNoSNPFile, col.names=T, row.names=F, quote=F, sep="\t")
+
+## write segments in IGV / GISTIC format ##
+igv <- segs[, .(Sample, Chromosome, Start.snp, End.snp, Length.snp., logR_Copy_Number)]
+igv[Chromosome %in% chrs[1:22], Corrected.logR := log2(logR_Copy_Number / 2)]
+igv[Chromosome == chrs[grep("X", chrs)], Corrected.logR := log2(logR_Copy_Number / 1)]
+igv[, logR_Copy_Number := NULL]
+outIGVFile <- gsub("seg.txt", "segIGV.txt", outSegFile)
+write.table(igv, file = outIGVFile, col.names=T, row.names=F, quote=F, sep="\t")
+
+save.image(outImageFile)
+
+
+
diff --git a/scripts/snakemake/config/config.yaml b/scripts/snakemake/config/config.yaml
@@ -7,6 +7,7 @@ ichorCNA_rscript:  /path/to/ichorCNA.R
 ichorCNA_libdir:  /path/to/ichorCNA_code
 pyCountScript:  code/countPysam.py
 TitanCNA_rscript: ../R_scripts/titanCNA.R
+TitanCNA_combineTitanIchorCNA:  code/combineTITAN-ichor.R
 TitanCNA_selectSolutionRscript: ../R_scripts/selectSolution.R
 TitanCNA_libdir:  ../../R/