diff --git a/demo/Snakefile b/demo/Snakefile index 6f187275..bf53d090 100755 --- a/demo/Snakefile +++ b/demo/Snakefile @@ -41,7 +41,7 @@ configfile: "../modules/lofreq/1.0/config/default.yaml" configfile: "../modules/starfish/2.0/config/default.yaml" configfile: "../modules/sage/1.0/config/default.yaml" configfile: "../modules/slms_3/1.0/config/default.yaml" - +configfile: "../modules/ichorcna/1.0/config/default.yaml" # Load project-specific config, which includes the shared # configuration and some module-specific config updates @@ -76,7 +76,7 @@ include: "../modules/controlfreec/1.1/controlfreec.smk" include: "../modules/lofreq/1.0/lofreq.smk" include: "../modules/starfish/2.0/starfish.smk" include: "../modules/sage/1.0/sage.smk" - +include: "../modules/ichorcna/1.0/ichorcna.smk" ##### TARGETS ###### @@ -91,9 +91,13 @@ rule all: rules._lofreq_all.input, rules._strelka_all.input, rules._bwa_mem_all.input, + rules._liftover_all.input, + rules._controlfreec_all.input, rules._gridss_all.input, rules._controlfreec_all.input, rules._starfish_all.input, rules._vcf2maf_all.input, rules._sage_all.input, - rules._slms_3_all.input + rules._slms_3_all.input, + rules._ichorcna_all.input + diff --git a/demo/config.yaml b/demo/config.yaml index 011540c2..5069451c 100755 --- a/demo/config.yaml +++ b/demo/config.yaml @@ -139,6 +139,17 @@ lcr-modules: inputs: sample_bam: "data/{sample_id}.bam" sample_bai: "data/{sample_id}.bam.bai" + + liftover: + tool: "battenberg" + inputs: + sample_seg: "data/{tool}/hg38/{tumour_sample_id}--{normal_sample_id}_subclones.igv.seg" + + ichorcna: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + scratch_subdirectories: [] # mpileup should be in scratch space starfish: diff --git a/modules/ichorcna/1.0/config/default.yaml b/modules/ichorcna/1.0/config/default.yaml new file mode 100644 index 00000000..37a33898 --- /dev/null +++ b/modules/ichorcna/1.0/config/default.yaml @@ -0,0 +1,107 @@ +lcr-modules: + + ichorcna: + + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + sample_bam: "__UPDATE__" + sample_bai: "__UPDATE__" + + + scratch_subdirectories: [] + + options: + readcounter: + qual: 20 # only includes reads with mapping quality greater than 20 + binSize: 1000000 # set window size to compute coverage + # available binSizes are: 1000000, 500000, 50000, 10000 + run: + ichorCNA_libdir: "" + ichorCNA_rscript: "{MODSDIR}/src/runIchorCNA.R" + # use panel matching same bin size (optional) + ichorCNA_normalPanel: + "1000000": "inst/extdata/HD_ULP_PoN_{genome_build}_1Mb_median_normAutosome_median.rds" + "500000": "inst/extdata/HD_ULP_PoN_{genome_build}_500kb_median_normAutosome_median.rds" + # must use gc wig file corresponding to same binSize (required) + ichorCNA_gcWig: + "1000000": "inst/extdata/gc_{genome_build}_1000kb.wig" + "500000": "inst/extdata/gc_{genome_build}_500kb.wig" + "50000": "inst/extdata/gc_{genome_build}_50kb.wig" + "10000": "inst/extdata/gc_{genome_build}_10kb.wig" + # must use map wig file corresponding to same binSize (required) + ichorCNA_mapWig: + "1000000": "inst/extdata/map_{genome_build}_1000kb.wig" + "500000": "inst/extdata/map_{genome_build}_500kb.wig" + "50000": "inst/extdata/map_{genome_build}_50kb.wig" + "10000": "inst/extdata/map_{genome_build}_10kb.wig" + # use bed file if sample has targeted regions, eg. exome data (optional) + ichorCNA_exons: NULL + ichorCNA_centromere: + grch37: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + hg19: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + hs37d5: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + grch38: "inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt" + hg38: "inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt" + ichorCNA_minMapScore: 0.75 + ichorCNA_fracReadsInChrYForMale: 0.002 # Threshold for fraction of reads in chrY to assign as male + ichorCNA_genomeStyle: # can set this to UCSC or NCBI + grch37: "NCBI" + hg19: "NCBI" + hs37d5: "NCBI" + grch38: "UCSC" + hg38: "UCSC" + # chrs used for training ichorCNA parameters, e.g. tumor fraction. + ichorCNA_chrTrain: + grch37: "c(1:22)" + hg19: "c(1:22)" + hs37d5: "c(1:22)" + grch38: "paste0('chr', c(1:22))" + hg38: "paste0('chr', c(1:22))" + # non-tumor fraction parameter restart values; higher values should be included for cfDNA + ichorCNA_normal: "c(0.5,0.6,0.7,0.8,0.9,0.95)" + # ploidy parameter restart values + ichorCNA_ploidy: "c(2,3,4)" + ichorCNA_estimateNormal: TRUE + ichorCNA_estimatePloidy: TRUE + ichorCNA_estimateClonality: TRUE + # states to use for subclonal CN + ichorCNA_scStates: "c(1,3)" + # set maximum copy number to use + ichorCNA_maxCN: 5 + # TRUE/FALSE to include homozygous deletion state # FALSE for low coverage libraries (ex. 0.1x) ; can turn on for higher coverage data (ex. >10x) + ichorCNA_includeHOMD: FALSE + # Exclude solutions if total length of subclonal CNAs > this fraction of the genome + ichorCNA_maxFracGenomeSubclone: 0.5 + # Exclude solutions if total length of subclonal CNAs > this fraction of total CNA length + ichorCNA_maxFracCNASubclone: 0.7 + # control segmentation - higher (e.g. 0.9999999) leads to higher specificity and fewer segments + # lower (e.g. 0.99) leads to higher sensitivity and more segments + ichorCNA_txnE: 0.9399999 + # control segmentation - higher (e.g. 10000000) leads to higher specificity and fewer segments + # lower (e.g. 100) leads to higher sensitivity and more segments + ichorCNA_txnStrength: 10000 + ichorCNA_plotFileType: "pdf" + ichorCNA_plotYlim: "c(-2,2)" + + + conda_envs: + ichorcna: "{MODSDIR}/envs/ichorcna.env.yaml" + hmmcopy_utils: "{MODSDIR}/envs/hmmcopy_utils.env.yaml" + + threads: + readcounter: 4 + run: 4 + + resources: + readcounter: + mem_mb: 2000 + bam: 1 + run: + mem_mb: 2000 + bam: 1 + + pairing_config: + genome: + run_paired_tumours: False + run_unpaired_tumours_with: "no_normal" + run_paired_tumours_as_unpaired: True diff --git a/modules/ichorcna/1.0/envs/ichorcna.env.yaml b/modules/ichorcna/1.0/envs/ichorcna.env.yaml new file mode 100644 index 00000000..208fd501 --- /dev/null +++ b/modules/ichorcna/1.0/envs/ichorcna.env.yaml @@ -0,0 +1,108 @@ +name: null +channels: + - conda-forge + - dranew + - bioconda + - defaults + - r +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bioconductor-biocgenerics=0.36.0 + - bioconductor-genomeinfodb=1.26.0 + - bioconductor-genomeinfodbdata=1.2.4 + - bioconductor-genomicranges=1.42.0 + - bioconductor-hmmcopy=1.32.0 + - bioconductor-iranges=2.24.0 + - bioconductor-s4vectors=0.28.0 + - bioconductor-xvector=0.30.0 + - bioconductor-zlibbioc=1.36.0 + - bwidget=1.9.14 + - bzip2=1.0.8 + - ca-certificates=2020.12.5 + - cairo=1.16.0 + - curl=7.71.1 + - fontconfig=2.13.1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - graphite2=1.3.13 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.0 + - hmmcopy_utils=0.0.1 + - icu=68.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.17.2 + - ld_impl_linux-64=2.35.1 + - libblas=3.8.0 + - libcblas=3.8.0 + - libcurl=7.71.1 + - libedit=3.1.20191231 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.66.7 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.8.0 + - libopenblas=0.3.10 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.2.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.13 + - libxml2=2.9.10 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1j + - pango=1.42.4 + - pcre=8.44 + - pcre2=10.36 + - pixman=0.40.0 + - pthread-stubs=0.4 + - r-base=4.0.3 + - r-bitops=1.0_6 + - r-data.table=1.14.0 + - r-getopt=1.20.3 + - r-ichorcna=0.2.0 + - r-optparse=1.6.6 + - r-plyr=1.8.6 + - r-rcpp=1.0.6 + - r-rcurl=1.98_1.2 + - readline=8.0 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.0 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.4.9 +prefix: /projects/rmorin/projects/tumour_contam/envs/ichorcna diff --git a/modules/ichorcna/1.0/ichorcna.smk b/modules/ichorcna/1.0/ichorcna.smk new file mode 100644 index 00000000..74c54f7d --- /dev/null +++ b/modules/ichorcna/1.0/ichorcna.smk @@ -0,0 +1,335 @@ +#!/usr/bin/env snakemake + + +# ---------------------------------------------------------------------------- # +##### ATTRIBUTION ##### +# ---------------------------------------------------------------------------- # + +# Original snakemake author: Jasper Wong +# Module author: Jasper Wong +# Additional contributors: N/A + + +# ---------------------------------------------------------------------------- # +##### SETUP ##### +# ---------------------------------------------------------------------------- # + +### Modules ### + +import pandas as pd +import numpy as np +import oncopipe as op +import glob +import os + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + +### Directories ### +# Setup module and store module-specific configuration in `CFG`. +CFG = op.setup_module( + name = "ichorcna", + version = "1.0", + subdirectories = ["inputs", "readDepth", "seg", "outputs"] +) + +localrules: + _ichorcna_input_bam, + _ichorcna_output, + _ichorcna_all + +# ---------------------------------------------------------------------------- # +##### RULES ##### +# ---------------------------------------------------------------------------- # + +### Set-up dependencies and packages ### +# Download github and all external files for ichorCNA: (needed since their extdata is not complete for all genome builds) +rule _install_ichorcna: + output: + complete = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success" + params: + outdir = CFG["dirs"]["inputs"] + "ichorCNA/" + conda: + CFG["conda_envs"]["ichorcna"] + shell: + op.as_one_line(""" + git clone git@github.com:broadinstitute/ichorCNA.git {params.outdir} && + touch {output.complete}""") + +# This defines the script/extdata directory used by ichorCNA in the subsequent rules: +ichorDir = CFG["dirs"]["inputs"] + "ichorCNA/inst/extdata/" + +# Symlinks the extdata appropriately +rule _setup_ichorcna_extdata: + input: + complete = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success" + params: + hg19_1Mb_rds = ichorDir + "HD_ULP_PoN_1Mb_median_normAutosome_mapScoreFiltered_median.rds", + hg19_500kb_rds = ichorDir + "HD_ULP_PoN_500kb_median_normAutosome_mapScoreFiltered_median.rds", + hg38_1Mb_rds = ichorDir + "HD_ULP_PoN_hg38_1Mb_median_normAutosome_median.rds", + hg38_500kb_rds = ichorDir + "HD_ULP_PoN_hg38_500kb_median_normAutosome_median.rds", + hg19_1000kb_gc = ichorDir + "gc_hg19_1000kb.wig", + hg19_500kb_gc = ichorDir + "gc_hg19_500kb.wig", + hg19_50kb_gc = ichorDir + "gc_hg19_50kb.wig", + hg19_10kb_gc = ichorDir + "gc_hg19_10kb.wig", + hg38_1000kb_gc = ichorDir + "gc_hg38_1000kb.wig", + hg38_500kb_gc = ichorDir + "gc_hg38_500kb.wig", + hg38_50kb_gc = ichorDir + "gc_hg38_50kb.wig", + hg38_10kb_gc = ichorDir + "gc_hg38_10kb.wig", + hg19_1000kb_map = ichorDir + "map_hg19_1000kb.wig", + hg19_500kb_map = ichorDir + "map_hg19_500kb.wig", + hg19_50kb_map = ichorDir + "map_hg19_50kb.wig", + hg19_10kb_map = ichorDir + "map_hg19_10kb.wig", + hg38_1000kb_map = ichorDir + "map_hg38_1000kb.wig", + hg38_500kb_map = ichorDir + "map_hg38_500kb.wig", + hg38_50kb_map = ichorDir + "map_hg38_50kb.wig", + hg38_10kb_map = ichorDir + "map_hg38_10kb.wig", + output: + hg19_1Mb_rds = ichorDir + "HD_ULP_PoN_hg19_1Mb_median_normAutosome_median.rds", + hg19_500kb_rds = ichorDir + "HD_ULP_PoN_hg19_500kb_median_normAutosome_median.rds", + grch37_1Mb_rds = ichorDir + "HD_ULP_PoN_grch37_1Mb_median_normAutosome_median.rds", + grch37_500kb_rds = ichorDir + "HD_ULP_PoN_grch37_500kb_normAutosome_median.rds", + hs37d5_1Mb_rds = ichorDir + "HD_ULP_PoN_hs37d5_1Mb_median_normAutosome_median.rds", + hs37d5_500kb_rds = ichorDir + "HD_ULP_PoN_hs37d5_500kb_normAutosome_median.rds", + grch38_1Mb_rds = ichorDir + "HD_ULP_PoN_grch38_1Mb_median_normAutosome_median.rds", + grch38_500kb_rds = ichorDir + "HD_ULP_PoN_grch38_500kb_median_normAutosome_median.rds", + grch37_1000kb_gc = ichorDir + "gc_grch37_1000kb.wig", + grch37_500kb_gc = ichorDir + "gc_grch37_500kb.wig", + grch37_50kb_gc = ichorDir + "gc_grch37_50kb.wig", + grch37_10kb_gc = ichorDir + "gc_grch37_10kb.wig", + hs37d5_1000kb_gc = ichorDir + "gc_hs37d5_1000kb.wig", + hs37d5_500kb_gc = ichorDir + "gc_hs37d5_500kb.wig", + hs37d5_50kb_gc = ichorDir + "gc_hs37d5_50kb.wig", + hs37d5_10kb_gc = ichorDir + "gc_hs37d5_10kb.wig", + grch38_1000kb_gc = ichorDir + "gc_grch38_1000kb.wig", + grch38_500kb_gc = ichorDir + "gc_grch38_500kb.wig", + grch38_50kb_gc = ichorDir + "gc_grch38_50kb.wig", + grch38_10kb_gc = ichorDir + "gc_grch38_10kb.wig", + grch37_1000kb_map = ichorDir + "map_grch37_1000kb.wig", + grch37_500kb_map = ichorDir + "map_grch37_500kb.wig", + grch37_50kb_map = ichorDir + "map_grch37_50kb.wig", + grch37_10kb_map = ichorDir + "map_grch37_10kb.wig", + hs37d5_1000kb_map = ichorDir + "map_hs37d5_1000kb.wig", + hs37d5_500kb_map = ichorDir + "map_hs37d5_500kb.wig", + hs37d5_50kb_map = ichorDir + "map_hs37d5_50kb.wig", + hs37d5_10kb_map = ichorDir + "map_hs37d5_10kb.wig", + grch38_1000kb_map = ichorDir + "map_grch38_1000kb.wig", + grch38_500kb_map = ichorDir + "map_grch38_500kb.wig", + grch38_50kb_map = ichorDir + "map_grch38_50kb.wig", + grch38_10kb_map = ichorDir + "map_grch38_10kb.wig", + complete = touch(ichorDir + "symlink.done") + run: + op.relative_symlink(params.hg19_1Mb_rds, output.hg19_1Mb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.hg19_500kb_rds) + op.relative_symlink(params.hg19_1Mb_rds, output.grch37_1Mb_rds) + op.relative_symlink(params.hg19_1Mb_rds, output.hs37d5_1Mb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.grch37_500kb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.hs37d5_500kb_rds) + op.relative_symlink(params.hg38_1Mb_rds, output.grch38_1Mb_rds) + op.relative_symlink(params.hg38_500kb_rds, output.grch38_500kb_rds) + op.relative_symlink(params.hg19_1000kb_gc, output.grch37_1000kb_gc) + op.relative_symlink(params.hg19_500kb_gc, output.grch37_500kb_gc) + op.relative_symlink(params.hg19_50kb_gc, output.grch37_50kb_gc) + op.relative_symlink(params.hg19_10kb_gc, output.grch37_10kb_gc) + op.relative_symlink(params.hg19_1000kb_gc, output.hs37d5_1000kb_gc) + op.relative_symlink(params.hg19_500kb_gc, output.hs37d5_500kb_gc) + op.relative_symlink(params.hg19_50kb_gc, output.hs37d5_50kb_gc) + op.relative_symlink(params.hg19_10kb_gc, output.hs37d5_10kb_gc) + op.relative_symlink(params.hg38_1000kb_gc, output.grch38_1000kb_gc) + op.relative_symlink(params.hg38_500kb_gc, output.grch38_500kb_gc) + op.relative_symlink(params.hg38_50kb_gc, output.grch38_50kb_gc) + op.relative_symlink(params.hg38_10kb_gc, output.grch38_10kb_gc) + op.relative_symlink(params.hg19_1000kb_map, output.grch37_1000kb_map) + op.relative_symlink(params.hg19_500kb_map, output.grch37_500kb_map) + op.relative_symlink(params.hg19_50kb_map, output.grch37_50kb_map) + op.relative_symlink(params.hg19_10kb_map, output.grch37_10kb_map) + op.relative_symlink(params.hg19_1000kb_map, output.hs37d5_1000kb_map) + op.relative_symlink(params.hg19_500kb_map, output.hs37d5_500kb_map) + op.relative_symlink(params.hg19_50kb_map, output.hs37d5_50kb_map) + op.relative_symlink(params.hg19_10kb_map, output.hs37d5_10kb_map) + op.relative_symlink(params.hg38_1000kb_map, output.grch38_1000kb_map) + op.relative_symlink(params.hg38_500kb_map, output.grch38_500kb_map) + op.relative_symlink(params.hg38_50kb_map, output.grch38_50kb_map) + op.relative_symlink(params.hg38_10kb_map, output.grch38_10kb_map) + +### Run ichorCNA ### +# Symlinks the input files into the module results directory (under '00-inputs/') +rule _ichorcna_input_bam: + input: + bam = CFG["inputs"]["sample_bam"], + bai = CFG["inputs"]["sample_bai"] + output: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai", # specific to readCounter + run: + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) + +# This function will return a comma-separated list of chromosomes to include in readCounter +def get_chromosomes(wildcards): + chromosomes=[] + for i in range(1,23): + chromosomes.append(str(i)) + chromosomes.append("X") + chromosomes.append("Y") + if "38" in str(wildcards.genome_build): + chromosomes = ["chr" + x for x in chromosomes] + chromosomes= ",".join(chromosomes) + return chromosomes + +rule _ichorcna_read_counter: + input: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai", + ichorcna_package = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success", + symlink_complete = ichorDir + "symlink.done" + output: + CFG["dirs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/{sample_id}.bin{binSize}.wig" + params: + binSize = CFG["options"]["readcounter"]["binSize"], + qual = CFG["options"]["readcounter"]["qual"], + chrs = get_chromosomes + conda: CFG["conda_envs"]["ichorcna"] + threads: CFG["threads"]["readcounter"] + resources: + **CFG["resources"]["readcounter"] + log: + CFG["logs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/{sample_id}.bin{binSize}.log" + shell: + "readCounter {input.bam} -c {params.chrs} -w {params.binSize} -q {params.qual} > {output} 2> {log}" + + +# This function will return a comma-separated list of chromosomes to include in readCounter +def get_chromosomes_R(wildcards): + chromosomesR=[] + stringStart="c('" + for i in range(1,23): + chromosomesR.append(str(i)) + chromosomesR.append("X") + chromosomesR.append("Y") + if "38" in str(wildcards.genome_build): + chromosomesR = ["chr" + x for x in chromosomesR] + chromosomesR= "','".join(chromosomesR) + stringEnd="')" + return stringStart + chromosomesR + stringEnd + +rule _run_ichorcna: + input: + tum = CFG["dirs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}.bin{binSize}.wig", + output: + corrDepth = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.correctedDepth.txt", + param = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.params.txt", + cna = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.cna.seg", + segTxt = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.seg.txt", + seg = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.seg", + plot = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}/{tumour_id}_genomeWide.pdf", + params: + ichorDir = CFG["dirs"]["inputs"] + "ichorCNA/", + outDir = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/", + rscript = CFG["options"]["run"]["ichorCNA_rscript"], + name = "{tumour_id}", + ploidy = CFG["options"]["run"]["ichorCNA_ploidy"], + normal = CFG["options"]["run"]["ichorCNA_normal"], + gcwig = op.switch_on_wildcard("binSize", CFG["options"]["run"]["ichorCNA_gcWig"]), + mapwig = op.switch_on_wildcard("binSize", CFG["options"]["run"]["ichorCNA_mapWig"]), + normalpanel = op.switch_on_wildcard("binSize", CFG["options"]["run"]["ichorCNA_normalPanel"]), + estimateNormal = CFG["options"]["run"]["ichorCNA_estimateNormal"], + estimatePloidy = CFG["options"]["run"]["ichorCNA_estimatePloidy"], + estimateClonality = CFG["options"]["run"]["ichorCNA_estimateClonality"], + scStates = CFG["options"]["run"]["ichorCNA_scStates"], + maxCN = CFG["options"]["run"]["ichorCNA_maxCN"], + includeHOMD = CFG["options"]["run"]["ichorCNA_includeHOMD"], + chrs = get_chromosomes_R, + chrTrain = op.switch_on_wildcard("genome_build", CFG["options"]["run"]["ichorCNA_chrTrain"]), + genomeBuild = "{genome_build}", + genomeStyle = op.switch_on_wildcard("genome_build", CFG["options"]["run"]["ichorCNA_genomeStyle"]), + centromere = op.switch_on_wildcard("genome_build", CFG["options"]["run"]["ichorCNA_centromere"]), + fracReadsChrYMale = CFG["options"]["run"]["ichorCNA_fracReadsInChrYForMale"], + minMapScore = CFG["options"]["run"]["ichorCNA_minMapScore"], + maxFracGenomeSubclone = CFG["options"]["run"]["ichorCNA_maxFracGenomeSubclone"], + maxFracCNASubclone = CFG["options"]["run"]["ichorCNA_maxFracCNASubclone"], + exons = CFG["options"]["run"]["ichorCNA_exons"], + txnE = CFG["options"]["run"]["ichorCNA_txnE"], + txnStrength = CFG["options"]["run"]["ichorCNA_txnStrength"], + plotFileType = CFG["options"]["run"]["ichorCNA_plotFileType"], + plotYlim = CFG["options"]["run"]["ichorCNA_plotYlim"], + libdir = CFG["dirs"]["inputs"] + "ichorCNA/" + CFG["options"]["run"]["ichorCNA_libdir"] + conda: CFG["conda_envs"]["ichorcna"] + threads: CFG["threads"]["run"] + resources: + **CFG["resources"]["run"] + log: + stdout = CFG["logs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}.stdout.log", + stderr = CFG["logs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}.stderr.log" + shell: + "Rscript {params.rscript} --id {params.name} --libdir {params.libdir} --WIG {input.tum} --gcWig {params.ichorDir}{params.gcwig} --mapWig {params.ichorDir}{params.mapwig} --normalPanel {params.ichorDir}{params.normalpanel} --ploidy \"{params.ploidy}\" --normal \"{params.normal}\" --maxCN {params.maxCN} --includeHOMD {params.includeHOMD} --chrs \"{params.chrs}\" --chrTrain \"{params.chrTrain}\" --genomeStyle {params.genomeStyle} --genomeBuild {params.genomeBuild} --estimateNormal {params.estimateNormal} --estimatePloidy {params.estimatePloidy} --estimateScPrevalence {params.estimateClonality} --scStates \"{params.scStates}\" --centromere {params.ichorDir}{params.centromere} --exons.bed {params.exons} --txnE {params.txnE} --txnStrength {params.txnStrength} --minMapScore {params.minMapScore} --fracReadsInChrYForMale {params.fracReadsChrYMale} --maxFracGenomeSubclone {params.maxFracGenomeSubclone} --maxFracCNASubclone {params.maxFracCNASubclone} --plotFileType {params.plotFileType} --plotYLim \"{params.plotYlim}\" --outDir {params.outDir} > {log.stdout} 2> {log.stderr}" + +# Symlinks the final output files into the module results directory (under '99-outputs/') +rule _ichorcna_output: + input: + corrDepth = str(rules._run_ichorcna.output.corrDepth), + param = str(rules._run_ichorcna.output.param), + cna = str(rules._run_ichorcna.output.cna), + segTxt = str(rules._run_ichorcna.output.segTxt), + seg = str(rules._run_ichorcna.output.seg), + plot = str(rules._run_ichorcna.output.plot) + output: + corrDepth = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/corrDepth/{binSize}/{tumour_id}--{normal_id}--{pair_status}.corrDepth.txt", + param = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/param/{binSize}/{tumour_id}--{normal_id}--{pair_status}.param.txt", + cna = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/binCNA/{binSize}/{tumour_id}--{normal_id}--{pair_status}.cna.seg", + segTxt = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/seg_txt/{binSize}/{tumour_id}--{normal_id}--{pair_status}.seg.txt", + seg = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/seg/{binSize}/{tumour_id}--{normal_id}--{pair_status}.seg", + plot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/plot/{binSize}/{tumour_id}--{normal_id}--{pair_status}_genomeWide.pdf" + run: + op.relative_symlink(input.corrDepth, output.corrDepth, in_module=True) + op.relative_symlink(input.param, output.param, in_module=True) + op.relative_symlink(input.cna, output.cna, in_module=True) + op.relative_symlink(input.segTxt, output.segTxt, in_module=True) + op.relative_symlink(input.seg, output.seg, in_module=True) + op.relative_symlink(input.plot, output.plot, in_module=True) + +# Generates the target sentinels for each run, which generate the symlinks +rule _ichorcna_all: + input: + expand( + [ + str(rules._ichorcna_output.output.corrDepth), + str(rules._ichorcna_output.output.param), + str(rules._ichorcna_output.output.cna), + str(rules._ichorcna_output.output.segTxt), + str(rules._ichorcna_output.output.seg), + str(rules._ichorcna_output.output.plot) + ], + zip, # Run expand() with zip(), not product() + seq_type=CFG["runs"]["tumour_seq_type"], + genome_build=CFG["runs"]["tumour_genome_build"], + pair_status=CFG["runs"]["pair_status"], + tumour_id=CFG["runs"]["tumour_sample_id"], + normal_id=CFG["runs"]["normal_sample_id"], + binSize=[CFG["options"]["readcounter"]["binSize"]] * len(CFG["runs"]["tumour_sample_id"])) + + + +##### CLEANUP ##### + + +# Perform some clean-up tasks, including storing the module-specific +# configuration on disk and deleting the `CFG` variable +op.cleanup_module(CFG) diff --git a/modules/ichorcna/1.0/schemas/base-1.0.yaml b/modules/ichorcna/1.0/schemas/base-1.0.yaml new file mode 120000 index 00000000..0a69d1ce --- /dev/null +++ b/modules/ichorcna/1.0/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/ichorcna/1.0/src/runIchorCNA.R b/modules/ichorcna/1.0/src/runIchorCNA.R new file mode 100755 index 00000000..ec1d7384 --- /dev/null +++ b/modules/ichorcna/1.0/src/runIchorCNA.R @@ -0,0 +1,423 @@ +# file: ichorCNA.R +# authors: Gavin Ha, Ph.D. +# Fred Hutch +# contact: <gha@fredhutch.org> +# +# Justin Rhoades +# Broad Institute +# contact: <rhoades@broadinstitute.org> + +# ichorCNA: https://github.com/broadinstitute/ichorCNA +# date: July 24, 2019 +# description: Hidden Markov model (HMM) to analyze Ultra-low pass whole genome sequencing (ULP-WGS) data. +# This script is the main script to run the HMM. + +library(optparse) + +option_list <- list( + make_option(c("--WIG"), type = "character", help = "Path to tumor WIG file. Required."), + make_option(c("--NORMWIG"), type = "character", default=NULL, help = "Path to normal WIG file. Default: [%default]"), + make_option(c("--gcWig"), type = "character", help = "Path to GC-content WIG file; Required"), + make_option(c("--mapWig"), type = "character", default=NULL, help = "Path to mappability score WIG file. Default: [%default]"), + make_option(c("--normalPanel"), type="character", default=NULL, help="Median corrected depth from panel of normals. Default: [%default]"), + make_option(c("--exons.bed"), type = "character", default=NULL, help = "Path to bed file containing exon regions. Default: [%default]"), + make_option(c("--id"), type = "character", default="test", help = "Patient ID. Default: [%default]"), + make_option(c("--centromere"), type="character", default=NULL, help = "File containing Centromere locations; if not provided then will use hg19 version from ichorCNA package. Default: [%default]"), + make_option(c("--minMapScore"), type = "numeric", default=0.9, help="Include bins with a minimum mappability score of this value. Default: [%default]."), + make_option(c("--rmCentromereFlankLength"), type="numeric", default=1e5, help="Length of region flanking centromere to remove. Default: [%default]"), + make_option(c("--normal"), type="character", default="0.5", help = "Initial normal contamination; can be more than one value if additional normal initializations are desired. Default: [%default]"), + make_option(c("--scStates"), type="character", default="NULL", help = "Subclonal states to consider. Default: [%default]"), + make_option(c("--coverage"), type="numeric", default=NULL, help = "PICARD sequencing coverage. Default: [%default]"), + make_option(c("--lambda"), type="character", default="NULL", help="Initial Student's t precision; must contain 4 values (e.g. c(1500,1500,1500,1500)); if not provided then will automatically use based on variance of data. Default: [%default]"), + make_option(c("--lambdaScaleHyperParam"), type="numeric", default=3, help="Hyperparameter (scale) for Gamma prior on Student's-t precision. Default: [%default]"), + # make_option(c("--kappa"), type="character", default=50, help="Initial state distribution"), + make_option(c("--ploidy"), type="character", default="2", help = "Initial tumour ploidy; can be more than one value if additional ploidy initializations are desired. Default: [%default]"), + make_option(c("--maxCN"), type="numeric", default=7, help = "Total clonal CN states. Default: [%default]"), + make_option(c("--estimateNormal"), type="logical", default=TRUE, help = "Estimate normal. Default: [%default]"), + make_option(c("--estimateScPrevalence"), type="logical", default=TRUE, help = "Estimate subclonal prevalence. Default: [%default]"), + make_option(c("--estimatePloidy"), type="logical", default=TRUE, help = "Estimate tumour ploidy. Default: [%default]"), + make_option(c("--maxFracCNASubclone"), type="numeric", default=0.7, help="Exclude solutions with fraction of subclonal events greater than this value. Default: [%default]"), + make_option(c("--maxFracGenomeSubclone"), type="numeric", default=0.5, help="Exclude solutions with subclonal genome fraction greater than this value. Default: [%default]"), + make_option(c("--minSegmentBins"), type="numeric", default=50, help="Minimum number of bins for largest segment threshold required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction."), + make_option(c("--altFracThreshold"), type="numeric", default=0.05, help="Minimum proportion of bins altered required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction. Default: [%default]"), + make_option(c("--chrNormalize"), type="character", default="c(1:22)", help = "Specify chromosomes to normalize GC/mappability biases. Default: [%default]"), + make_option(c("--chrTrain"), type="character", default="c(1:22)", help = "Specify chromosomes to estimate params. Default: [%default]"), + make_option(c("--chrs"), type="character", default="c(1:22,\"X\")", help = "Specify chromosomes to analyze. Default: [%default]"), + make_option(c("--genomeBuild"), type="character", default="hg19", help="Geome build. Default: [%default]"), + make_option(c("--genomeStyle"), type = "character", default = "NCBI", help = "NCBI or UCSC chromosome naming convention; use UCSC if desired output is to have \"chr\" string. [Default: %default]"), + make_option(c("--normalizeMaleX"), type="logical", default=TRUE, help = "If male, then normalize chrX by median. Default: [%default]"), + make_option(c("--minTumFracToCorrect"), type="numeric", default=0.1, help = "Tumor-fraction correction of bin and segment-level CNA if sample has minimum estimated tumor fraction. [Default: %default]"), + make_option(c("--fracReadsInChrYForMale"), type="numeric", default=0.001, help = "Threshold for fraction of reads in chrY to assign as male. Default: [%default]"), + make_option(c("--includeHOMD"), type="logical", default=FALSE, help="If FALSE, then exclude HOMD state. Useful when using large bins (e.g. 1Mb). Default: [%default]"), + make_option(c("--txnE"), type="numeric", default=0.9999999, help = "Self-transition probability. Increase to decrease number of segments. Default: [%default]"), + make_option(c("--txnStrength"), type="numeric", default=1e7, help = "Transition pseudo-counts. Exponent should be the same as the number of decimal places of --txnE. Default: [%default]"), + make_option(c("--plotFileType"), type="character", default="pdf", help = "File format for output plots. Default: [%default]"), + make_option(c("--plotYLim"), type="character", default="c(-2,2)", help = "ylim to use for chromosome plots. Default: [%default]"), + make_option(c("--outDir"), type="character", default="./", help = "Output Directory. Default: [%default]"), + make_option(c("--libdir"), type = "character", default=NULL, help = "Script library path. Usually exclude this argument unless custom modifications have been made to the ichorCNA R package code and the user would like to source those R files. Default: [%default]") +) +parseobj <- OptionParser(option_list=option_list) +opt <- parse_args(parseobj) +print(opt) +options(scipen=0, stringsAsFactors=F) + +library(HMMcopy) +library(GenomicRanges) +library(GenomeInfoDb) +options(stringsAsFactors=FALSE) +options(bitmapType='cairo') + +patientID <- opt$id +tumour_file <- opt$WIG +normal_file <- opt$NORMWIG +gcWig <- opt$gcWig +mapWig <- opt$mapWig +normal_panel <- opt$normalPanel +exons.bed <- opt$exons.bed # "0" if none specified +centromere <- opt$centromere +minMapScore <- opt$minMapScore +flankLength <- opt$rmCentromereFlankLength +normal <- eval(parse(text = opt$normal)) +scStates <- eval(parse(text = opt$scStates)) +lambda <- eval(parse(text = opt$lambda)) +lambdaScaleHyperParam <- opt$lambdaScaleHyperParam +estimateNormal <- opt$estimateNormal +estimatePloidy <- opt$estimatePloidy +estimateScPrevalence <- opt$estimateScPrevalence +maxFracCNASubclone <- opt$maxFracCNASubclone +maxFracGenomeSubclone <- opt$maxFracGenomeSubclone +minSegmentBins <- opt$minSegmentBins +altFracThreshold <- opt$altFracThreshold +ploidy <- eval(parse(text = opt$ploidy)) +coverage <- opt$coverage +maxCN <- opt$maxCN +txnE <- opt$txnE +txnStrength <- opt$txnStrength +normalizeMaleX <- as.logical(opt$normalizeMaleX) +includeHOMD <- as.logical(opt$includeHOMD) +minTumFracToCorrect <- opt$minTumFracToCorrect +fracReadsInChrYForMale <- opt$fracReadsInChrYForMale +chrXMedianForMale <- -0.1 +outDir <- opt$outDir +libdir <- opt$libdir +plotFileType <- opt$plotFileType +plotYLim <- eval(parse(text=opt$plotYLim)) +gender <- NULL +outImage <- paste0(outDir,"/", patientID,".RData") +genomeBuild <- opt$genomeBuild +genomeStyle <- opt$genomeStyle +chrs <- as.character(eval(parse(text = opt$chrs))) +chrTrain <- as.character(eval(parse(text=opt$chrTrain))); +chrNormalize <- as.character(eval(parse(text=opt$chrNormalize))); +seqlevelsStyle(chrs) <- genomeStyle +seqlevelsStyle(chrNormalize) <- genomeStyle +seqlevelsStyle(chrTrain) <- genomeStyle + +## load ichorCNA library or source R scripts +if (!is.null(libdir) && libdir != "None"){ + source(paste0(libdir,"/R/utils.R")) + source(paste0(libdir,"/R/segmentation.R")) + source(paste0(libdir,"/R/EM.R")) + source(paste0(libdir,"/R/output.R")) + source(paste0(libdir,"/R/plotting.R")) +} else { + library(ichorCNA) +} + +## load seqinfo +# seqinfo <- getSeqInfo(genomeBuild, genomeStyle) +seqinfo <- NULL + +if (substr(tumour_file,nchar(tumour_file)-2,nchar(tumour_file)) == "wig") { + wigFiles <- data.frame(cbind(patientID, tumour_file)) +} else { + wigFiles <- read.delim(tumour_file, header=F, as.is=T) +} + +## FILTER BY EXONS IF PROVIDED ## +## add gc and map to GRanges object ## +if (is.null(exons.bed) || exons.bed == "None" || exons.bed == "NULL"){ + targetedSequences <- NULL +}else{ + targetedSequences <- read.delim(exons.bed, header=T, sep="\t") +} + +## load PoN +if (is.null(normal_panel) || normal_panel == "None" || normal_panel == "NULL"){ + normal_panel <- NULL +} + +if (is.null(centromere) || centromere == "None" || centromere == "NULL"){ # no centromere file provided + centromere <- system.file("extdata", "GRCh37.p13_centromere_UCSC-gapTable.txt", + package = "ichorCNA") +} +centromere <- read.delim(centromere,header=T,stringsAsFactors=F,sep="\t") +save.image(outImage) +## LOAD IN WIG FILES ## +numSamples <- nrow(wigFiles) + +tumour_copy <- list() +for (i in 1:numSamples) { + id <- wigFiles[i,1] + ## create output directories for each sample ## + dir.create(paste0(outDir, "/", id, "/"), recursive = TRUE) + ### LOAD TUMOUR AND NORMAL FILES ### + message("Loading tumour file:", wigFiles[i,1]) + tumour_reads <- wigToGRanges(wigFiles[i,2]) + + ## LOAD GC/MAP WIG FILES ### + # find the bin size and load corresponding wig files # + binSize <- as.data.frame(tumour_reads[1,])$width + message("Reading GC and mappability files") + if (is.null(gcWig) || gcWig == "None" || gcWig == "NULL"){ + stop("GC wig file is required") + } + gc <- wigToGRanges(gcWig) + if (is.null(mapWig) || mapWig == "None" || mapWig == "NULL"){ + message("No mappability wig file input, excluding from correction") + map <- NULL + } else { + map <- wigToGRanges(mapWig) + } + message("Correcting Tumour") + + counts <- loadReadCountsFromWig(tumour_reads, chrs = chrs, gc = gc, map = map, + centromere = centromere, flankLength = flankLength, + targetedSequences = targetedSequences, chrXMedianForMale = chrXMedianForMale, + genomeStyle = genomeStyle, fracReadsInChrYForMale = fracReadsInChrYForMale, + chrNormalize = chrNormalize, mapScoreThres = minMapScore) + tumour_copy[[id]] <- counts$counts #as(counts$counts, "GRanges") + gender <- counts$gender + ## load in normal file if provided + if (!is.null(normal_file) && normal_file != "None" && normal_file != "NULL"){ + message("Loading normal file:", normal_file) + normal_reads <- wigToGRanges(normal_file) + message("Correcting Normal") + counts <- loadReadCountsFromWig(normal_reads, chrs=chrs, gc=gc, map=map, + centromere=centromere, flankLength = flankLength, targetedSequences=targetedSequences, + genomeStyle = genomeStyle, chrNormalize = chrNormalize, mapScoreThres = minMapScore) + normal_copy <- counts$counts #as(counts$counts, "GRanges") + gender.normal <- counts$gender + }else{ + normal_copy <- NULL + } + + ### DETERMINE GENDER ### + ## if normal file not given, use chrY, else use chrX + message("Determining gender...", appendLF = FALSE) + gender.mismatch <- FALSE + if (!is.null(normal_copy)){ + if (gender$gender != gender.normal$gender){ #use tumour # use normal if given + # check if normal is same gender as tumour + gender.mismatch <- TRUE + } + } + message("Gender ", gender$gender) + + ## NORMALIZE GENOME-WIDE BY MATCHED NORMAL OR NORMAL PANEL (MEDIAN) ## + tumour_copy[[id]] <- normalizeByPanelOrMatchedNormal(tumour_copy[[id]], chrs = chrs, + normal_panel = normal_panel, normal_copy = normal_copy, + gender = gender$gender, normalizeMaleX = normalizeMaleX) + + ### OUTPUT FILE ### + ### PUTTING TOGETHER THE COLUMNS IN THE OUTPUT ### + outMat <- as.data.frame(tumour_copy[[id]]) + #outMat <- outMat[,c(1,2,3,12)] + outMat <- outMat[,c("seqnames","start","end","copy")] + colnames(outMat) <- c("chr","start","end","log2_TNratio_corrected") + outFile <- paste0(outDir,"/",id,".correctedDepth.txt") + message(paste("Outputting to:", outFile)) + write.table(outMat, file=outFile, row.names=F, col.names=T, quote=F, sep="\t") + +} ## end of for each sample + +chrInd <- as.character(seqnames(tumour_copy[[1]])) %in% chrTrain +## get positions that are valid +valid <- tumour_copy[[1]]$valid +if (length(tumour_copy) >= 2) { + for (i in 2:length(tumour_copy)){ + valid <- valid & tumour_copy[[i]]$valid + } +} +save.image(outImage) + +### RUN HMM ### +## store the results for different normal and ploidy solutions ## +ptmTotalSolutions <- proc.time() # start total timer +results <- list() +loglik <- as.data.frame(matrix(NA, nrow = length(normal) * length(ploidy), ncol = 7, + dimnames = list(c(), c("init", "n_est", "phi_est", "BIC", + "Frac_genome_subclonal", "Frac_CNA_subclonal", "loglik")))) +counter <- 1 +compNames <- rep(NA, nrow(loglik)) +mainName <- rep(NA, length(normal) * length(ploidy)) +#### restart for purity and ploidy values #### +for (n in normal){ + for (p in ploidy){ + if (n == 0.95 & p != 2) { + next + } + logR <- as.data.frame(lapply(tumour_copy, function(x) { x$copy })) # NEED TO EXCLUDE CHR X # + param <- getDefaultParameters(logR[valid & chrInd, , drop=F], maxCN = maxCN, includeHOMD = includeHOMD, + ct.sc=scStates, ploidy = floor(p), e=txnE, e.same = 50, strength=txnStrength) + param$phi_0 <- rep(p, numSamples) + param$n_0 <- rep(n, numSamples) + + ############################################ + ######## CUSTOM PARAMETER SETTINGS ######### + ############################################ + # 0.1x cfDNA # + if (is.null(lambda)){ + logR.var <- 1 / ((apply(logR, 2, sd, na.rm = TRUE) / sqrt(length(param$ct))) ^ 2) + param$lambda <- rep(logR.var, length(param$ct)) + param$lambda[param$ct %in% c(2)] <- logR.var + param$lambda[param$ct %in% c(1,3)] <- logR.var + param$lambda[param$ct >= 4] <- logR.var / 5 + param$lambda[param$ct == max(param$ct)] <- logR.var / 15 + param$lambda[param$ct.sc.status] <- logR.var / 10 + }else{ + param$lambda[param$ct %in% c(2)] <- lambda[2] + param$lambda[param$ct %in% c(1)] <- lambda[1] + param$lambda[param$ct %in% c(3)] <- lambda[3] + param$lambda[param$ct >= 4] <- lambda[4] + param$lambda[param$ct == max(param$ct)] <- lambda[2] / 15 + param$lambda[param$ct.sc.status] <- lambda[2] / 10 + } + param$alphaLambda <- rep(lambdaScaleHyperParam, length(param$ct)) + # 1x bulk tumors # + #param$lambda[param$ct %in% c(2)] <- 2000 + #param$lambda[param$ct %in% c(1)] <- 1750 + #param$lambda[param$ct %in% c(3)] <- 1750 + #param$lambda[param$ct >= 4] <- 1500 + #param$lambda[param$ct == max(param$ct)] <- 1000 / 25 + #param$lambda[param$ct.sc.status] <- 1000 / 75 + #param$alphaLambda[param$ct.sc.status] <- 4 + #param$alphaLambda[param$ct %in% c(1,3)] <- 5 + #param$alphaLambda[param$ct %in% c(2)] <- 5 + #param$alphaLambda[param$ct == max(param$ct)] <- 4 + + ############################################# + ################ RUN HMM #################### + ############################################# + hmmResults.cor <- HMMsegment(tumour_copy, valid, dataType = "copy", + param = param, chrTrain = chrTrain, maxiter = 50, + estimateNormal = estimateNormal, estimatePloidy = estimatePloidy, + estimateSubclone = estimateScPrevalence, verbose = TRUE) + + for (s in 1:numSamples){ + iter <- hmmResults.cor$results$iter + id <- names(hmmResults.cor$cna)[s] + + ## convert full diploid solution (of chrs to train) to have 1.0 normal or 0.0 purity + ## check if there is an altered segment that has at least a minimum # of bins + segsS <- hmmResults.cor$results$segs[[s]] + segsS <- segsS[segsS$chr %in% chrTrain, ] + segAltInd <- which(segsS$event != "NEUT") + maxBinLength = -Inf + if (sum(segAltInd) > 0){ + maxInd <- which.max(segsS$end[segAltInd] - segsS$start[segAltInd] + 1) + maxSegRD <- GRanges(seqnames=segsS$chr[segAltInd[maxInd]], + ranges=IRanges(start=segsS$start[segAltInd[maxInd]], end=segsS$end[segAltInd[maxInd]])) + hits <- findOverlaps(query=maxSegRD, subject=tumour_copy[[s]][valid, ]) + maxBinLength <- length(subjectHits(hits)) + } + ## check if there are proportion of total bins altered + # if segment size smaller than minSegmentBins, but altFrac > altFracThreshold, then still estimate TF + cnaS <- hmmResults.cor$cna[[s]] + altInd <- cnaS[cnaS$chr %in% chrTrain, "event"] == "NEUT" + altFrac <- sum(!altInd, na.rm=TRUE) / length(altInd) + if ((maxBinLength <= minSegmentBins) & (altFrac <= altFracThreshold)){ + hmmResults.cor$results$n[s, iter] <- 1.0 + } + + # correct integer copy number based on estimated purity and ploidy + correctedResults <- correctIntegerCN(cn = hmmResults.cor$cna[[s]], + segs = hmmResults.cor$results$segs[[s]], + purity = 1 - hmmResults.cor$results$n[s, iter], ploidy = hmmResults.cor$results$phi[s, iter], + cellPrev = 1 - hmmResults.cor$results$sp[s, iter], + maxCNtoCorrect.autosomes = maxCN, maxCNtoCorrect.X = maxCN, minPurityToCorrect = minTumFracToCorrect, + gender = gender$gender, chrs = chrs, correctHOMD = includeHOMD) + hmmResults.cor$results$segs[[s]] <- correctedResults$segs + hmmResults.cor$cna[[s]] <- correctedResults$cn + + ## plot solution ## + outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_", "n", n, "-p", p) + mainName[counter] <- paste0(id, ", n: ", n, ", p: ", p, ", log likelihood: ", signif(hmmResults.cor$results$loglik[hmmResults.cor$results$iter], digits = 4)) + plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType=plotFileType, + logR.column = "logR", call.column = "Corrected_Call", + plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, seqinfo=seqinfo, main=mainName[counter]) + } + iter <- hmmResults.cor$results$iter + results[[counter]] <- hmmResults.cor + loglik[counter, "loglik"] <- signif(hmmResults.cor$results$loglik[iter], digits = 4) + subClonalBinCount <- unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$subclone.status) })) + fracGenomeSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ nrow(x) })) + fracAltSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$copy.number != 2) })) + fracAltSub <- lapply(fracAltSub, function(x){if (is.na(x)){0}else{x}}) + loglik[counter, "Frac_genome_subclonal"] <- paste0(signif(fracGenomeSub, digits=2), collapse=",") + loglik[counter, "Frac_CNA_subclonal"] <- paste0(signif(as.numeric(fracAltSub), digits=2), collapse=",") + loglik[counter, "init"] <- paste0("n", n, "-p", p) + loglik[counter, "n_est"] <- paste(signif(hmmResults.cor$results$n[, iter], digits = 2), collapse = ",") + loglik[counter, "phi_est"] <- paste(signif(hmmResults.cor$results$phi[, iter], digits = 4), collapse = ",") + + counter <- counter + 1 + } +} +## get total time for all solutions ## +elapsedTimeSolutions <- proc.time() - ptmTotalSolutions +message("Total ULP-WGS HMM Runtime: ", format(elapsedTimeSolutions[3] / 60, digits = 2), " min.") + +### SAVE R IMAGE ### +save.image(outImage) +#save(tumour_copy, results, loglik, file=paste0(outDir,"/",id,".RData")) + +### SELECT SOLUTION WITH LARGEST LIKELIHOOD ### +loglik <- loglik[!is.na(loglik$init), ] +if (estimateScPrevalence){ ## sort but excluding solutions with too large % subclonal + fracInd <- which(loglik[, "Frac_CNA_subclonal"] <= maxFracCNASubclone & + loglik[, "Frac_genome_subclonal"] <= maxFracGenomeSubclone) + if (length(fracInd) > 0){ ## if there is a solution satisfying % subclonal + ind <- fracInd[order(loglik[fracInd, "loglik"], decreasing=TRUE)] + }else{ # otherwise just take largest likelihood + ind <- order(as.numeric(loglik[, "loglik"]), decreasing=TRUE) + } +}else{#sort by likelihood only + ind <- order(as.numeric(loglik[, "loglik"]), decreasing=TRUE) +} + +#new loop by order of solutions (ind) +outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_all_sols") +for(i in 1:length(ind)) { + hmmResults.cor <- results[[ind[i]]] + turnDevOff <- FALSE + turnDevOn <- FALSE + if (i == 1){ + turnDevOn <- TRUE + } + if (i == length(ind)){ + turnDevOff <- TRUE + } + plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType="pdf", + logR.column = "logR", call.column = "Corrected_Call", + plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, + seqinfo = seqinfo, + turnDevOn = turnDevOn, turnDevOff = turnDevOff, main=mainName[ind[i]]) +} + +hmmResults.cor <- results[[ind[1]]] +hmmResults.cor$results$loglik <- as.data.frame(loglik) +hmmResults.cor$results$gender <- gender$gender +hmmResults.cor$results$chrYCov <- gender$chrYCovRatio +hmmResults.cor$results$chrXMedian <- gender$chrXMedian +hmmResults.cor$results$coverage <- coverage + +outputHMM(cna = hmmResults.cor$cna, segs = hmmResults.cor$results$segs, + results = hmmResults.cor$results, patientID = patientID, outDir=outDir) +outFile <- paste0(outDir, "/", patientID, ".params.txt") +outputParametersToFile(hmmResults.cor, file = outFile) + +## plot solutions for all samples +plotSolutions(hmmResults.cor, tumour_copy, chrs, outDir, numSamples=numSamples, + logR.column = "logR", call.column = "Corrected_Call", + plotFileType=plotFileType, plotYLim=plotYLim, seqinfo = seqinfo, + estimateScPrevalence=estimateScPrevalence, maxCN=maxCN) \ No newline at end of file diff --git a/modules/ichorcna/CHANGELOG.md b/modules/ichorcna/CHANGELOG.md new file mode 100755 index 00000000..2a9de048 --- /dev/null +++ b/modules/ichorcna/CHANGELOG.md @@ -0,0 +1,23 @@ +# Changelog + +All notable changes to the `ichorcna` module will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [1.0] - 2021-03-31 + +This release was authored by jawong. + +IchorCNA requires indices to be ".bam.bai" format. Modified the initial symlink rule appropriately. +Modified runIchorCNA.R - line 128: seqinfo <- NULL - to allow for flexibility in genome builds (i.e. hs37d5). + +IchorCNA is currently set up to run in unpaired mode using a panel of normals. It currently does not work with CRAM files. + +The output files in 99-outputs include: +- .cna.seg - per-bin CNA state and log ratio +- .seg - segments called by the Viterbi algorithm - IGV compatible +- .segTxt - same as .seg but also includes subclonal status of segments (0 = clonal; 1 = subclonal) - not IGV compatible +- .genomeWide.pdf - CNA profile +- .param.txt - parameters including ploidy, tumour fraction, sex +- .corrDepth.txt - per-bin size corrected depth \ No newline at end of file