diff --git a/demo/genome_Snakefile.smk b/demo/genome_Snakefile.smk index 13928f35..7d647e27 100755 --- a/demo/genome_Snakefile.smk +++ b/demo/genome_Snakefile.smk @@ -36,6 +36,8 @@ configfile: "../modules/sequenza/1.4/config/default.yaml" configfile: "../modules/bwa_mem/1.1/config/default.yaml" configfile: "../modules/controlfreec/1.2/config/default.yaml" configfile: "../modules/slms_3/1.0/config/default.yaml" +configfile: "../modules/ichorcna/1.0/config/default.yaml" +configfile: "../modules/gatk_rnaseq/1.0/config/default.yaml" configfile: "../modules/gridss/1.1/config/default.yaml" configfile: "../modules/liftover/1.2/config/default.yaml" configfile: "../modules/battenberg/1.2/config/default.yaml" @@ -66,6 +68,13 @@ include: "../modules/bwa_mem/1.1/bwa_mem.smk" include: "../modules/controlfreec/1.2/controlfreec.smk" include: "../modules/slms_3/1.0/slms_3.smk" include: "../modules/gridss/1.1/gridss.smk" +include: "../modules/bam2fastq/1.2/bam2fastq.smk" +include: "../modules/controlfreec/1.1/controlfreec.smk" +include: "../modules/lofreq/1.0/lofreq.smk" +include: "../modules/starfish/2.0/starfish.smk" +include: "../modules/sage/1.0/sage.smk" +include: "../modules/ichorcna/1.0/ichorcna.smk" +include: "../modules/gatk_rnaseq/1.0/gatk_rnaseq.smk" include: "../modules/liftover/1.2/liftover.smk" include: "../modules/battenberg/1.2/battenberg.smk" include: "../modules/pathseq/1.0/pathseq.smk" @@ -82,6 +91,9 @@ rule all: rules._bwa_mem_all.input, rules._controlfreec_all.input, rules._slms_3_all.input, + rules._ichorcna_all.input, + rules._gatk_rnaseq_all.input + rules._gridss_all.input, rules._liftover_all.input, rules._battenberg_all.input, diff --git a/envs/cnvkit/cnvkit-0.9.9.yaml b/envs/cnvkit/cnvkit-0.9.9.yaml new file mode 100644 index 00000000..4e652f96 --- /dev/null +++ b/envs/cnvkit/cnvkit-0.9.9.yaml @@ -0,0 +1,146 @@ +name: null +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.36.1 + - binutils_linux-64=2.36 + - bioconductor-dnacopy=1.68.0 + - biopython=1.79 + - brotli=1.0.9 + - brotli-bin=1.0.9 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.18.1 + - ca-certificates=2021.10.8 + - cairo=1.16.0 + - certifi=2021.10.8 + - cnvkit=0.9.9 + - curl=7.81.0 + - cycler=0.11.0 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.94 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - fonttools=4.29.1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.4.0 + - gcc_linux-64=9.4.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=9.4.0 + - gfortran_linux-64=9.4.0 + - graphite2=1.3.13 + - gsl=2.7 + - gxx_impl_linux-64=9.4.0 + - gxx_linux-64=9.4.0 + - harfbuzz=3.3.1 + - icu=69.1 + - jbig=2.1 + - joblib=0.17.0 + - jpeg=9e + - kernel-headers_linux-64=2.6.32 + - kiwisolver=1.3.2 + - krb5=1.19.2 + - lcms2=2.12 + - ld_impl_linux-64=2.36.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libbrotlicommon=1.0.9 + - libbrotlidec=1.0.9 + - libbrotlienc=1.0.9 + - libcblas=3.9.0 + - libcurl=7.81.0 + - libdeflate=1.7 + - libedit=3.1.20191231 + - libev=4.33 + - libffi=3.4.2 + - libgcc-devel_linux-64=9.4.0 + - libgcc-ng=11.2.0 + - libgfortran-ng=11.2.0 + - libgfortran5=11.2.0 + - libglib=2.70.2 + - libgomp=11.2.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.46.0 + - libnsl=2.0.0 + - libopenblas=0.3.18 + - libpng=1.6.37 + - libsanitizer=9.4.0 + - libssh2=1.10.0 + - libstdcxx-devel_linux-64=9.4.0 + - libstdcxx-ng=11.2.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.2 + - libxcb=1.13 + - libxml2=2.9.12 + - libzlib=1.2.11 + - lz4-c=1.9.3 + - make=4.3 + - matplotlib-base=3.5.1 + - munkres=1.1.4 + - ncurses=6.3 + - networkx=2.6.3 + - numpy=1.22.1 + - olefile=0.46 + - openjpeg=2.4.0 + - openssl=1.1.1l + - packaging=21.3 + - pandas=1.4.0 + - pango=1.48.10 + - pcre=8.45 + - pcre2=10.37 + - pillow=8.4.0 + - pip=22.0.2 + - pixman=0.40.0 + - pomegranate=0.13.3 + - pthread-stubs=0.4 + - pyfaidx=0.6.4 + - pyparsing=3.0.7 + - pysam=0.17.0 + - python=3.9.10 + - python-dateutil=2.8.2 + - python_abi=3.9 + - pytz=2021.3 + - pyyaml=6.0 + - r-base=4.1.2 + - r-cghflasso=0.2_1 + - readline=8.1 + - reportlab=3.5.68 + - scipy=1.7.3 + - sed=4.8 + - setuptools=60.6.0 + - six=1.16.0 + - sqlite=3.37.0 + - sysroot_linux-64=2.12 + - tk=8.6.11 + - tktable=2.10 + - tzdata=2021e + - unicodedata2=14.0.0 + - wheel=0.37.1 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - yaml=0.2.5 + - zlib=1.2.11 + - zstd=1.5.2 +prefix: /projects/rmorin/projects/tumour_contam/envs/cnvkit diff --git a/modules/gatk_rnaseq/1.0/config/default.yaml b/modules/gatk_rnaseq/1.0/config/default.yaml new file mode 100644 index 00000000..93f6e5f7 --- /dev/null +++ b/modules/gatk_rnaseq/1.0/config/default.yaml @@ -0,0 +1,84 @@ +lcr-modules: + + gatk_rnaseq: + + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + sample_bam: "__UPDATE__" + sample_bai: "__UPDATE__" + + + scratch_subdirectories: [] + + options: + java_opts: "-XX:ConcGCThreads=1" + gatk_splitntrim: " -fixNDN TRUE -RF GoodCigarReadFilter " + gatk_addRG: + platform: "illumina" + unit: "unit1" + stringency: "LENIENT" + gatk_baserecalibrator: "" + gatk_applybqsr: "" + gatk_variant_calling: + min_conf_thres: 20.0 + gatk_opts: "" + gatk_variant_filtration: + window: 35 # window size between SNPs in cluster + cluster_size: 3 # at least 3 SNPs in cluster + # hard filtering (filters OUT) based on metrics: + # FS (FisherStrand): Phred-scale probability that there is a strand bias from a Fisher's test. (default FS > 30.0) + # QD (QualByDepth): variant confidence/unfiltered depth (normalizes variant quality to avoid inflation from deep coverage) (default QD < 2.0) + # DP (depth): minimum depth (default DP < 5.0) + filter_expression: "-filter-expression \"FS > 30.0\" -filter-name FS -filter-expression \"QD < 2.0\" -filter-name QD -filter-expression \"DP < 5.0\" -filter-name DP" + gatk_rnaseq_filter_passed: + params: "-f '.,PASS' " + # Can be modified to filter on additional criteria using bcftools view syntax + # For example, to remove all variants with -log10(POPAF) > 4.0: + #"-f '.,PASS' -i 'INFO/POPAF > 4'" + + + conda_envs: + picard: "{MODSDIR}/envs/picard-2.22.3.yaml" + gatk_rnaseq: "{MODSDIR}/envs/gatk-4.1.8.1.yaml" + bcftools: "{MODSDIR}/envs/bcftools-1.10.2.yaml" + + threads: + gatk_splitntrim: 12 + gatk_addRG: 12 + gatk_base_recalibration: 12 + gatk_applybqsr: 12 + gatk_variant_calling: 24 + gatk_variant_filtration: 24 + merge_vcfs: 10 + gatk_rnaseq_passed: 10 + gnomad_filter: 4 + + resources: + gatk_splitntrim: + mem_mb: 48000 + bam: 1 + gatk_addRG: + mem_mb: 12000 + gatk_base_recalibration: + mem_mb: 12000 + gatk_applybqsr: + mem_mb: 12000 + gatk_variant_calling: + mem_mb: 12000 + gatk_variant_filtration: + mem_mb: 12000 + merge_vcfs: + mem_mb: 10000 + gatk_rnaseq_passed: + mem_mb: 10000 + gnomad_filter: + mem_mb: 2000 + + pairing_config: + mrna: + run_paired_tumours: False + run_unpaired_tumours_with: "no_normal" + run_paired_tumours_as_unpaired: True + + + diff --git a/modules/gatk_rnaseq/1.0/envs/bcftools-1.10.2.yaml b/modules/gatk_rnaseq/1.0/envs/bcftools-1.10.2.yaml new file mode 120000 index 00000000..72959e7b --- /dev/null +++ b/modules/gatk_rnaseq/1.0/envs/bcftools-1.10.2.yaml @@ -0,0 +1 @@ +../../../../envs/bcftools/bcftools-1.10.2.yaml \ No newline at end of file diff --git a/modules/gatk_rnaseq/1.0/envs/gatk-4.1.8.1.yaml b/modules/gatk_rnaseq/1.0/envs/gatk-4.1.8.1.yaml new file mode 120000 index 00000000..fd2445cd --- /dev/null +++ b/modules/gatk_rnaseq/1.0/envs/gatk-4.1.8.1.yaml @@ -0,0 +1 @@ +../../../../envs/gatk/gatk-4.1.8.1.yaml \ No newline at end of file diff --git a/modules/gatk_rnaseq/1.0/envs/picard-2.22.3.yaml b/modules/gatk_rnaseq/1.0/envs/picard-2.22.3.yaml new file mode 120000 index 00000000..8bcf2e66 --- /dev/null +++ b/modules/gatk_rnaseq/1.0/envs/picard-2.22.3.yaml @@ -0,0 +1 @@ +../../../../envs/picard/picard-2.22.3.yaml \ No newline at end of file diff --git a/modules/gatk_rnaseq/1.0/gatk_rnaseq.smk b/modules/gatk_rnaseq/1.0/gatk_rnaseq.smk new file mode 100644 index 00000000..45e1dae9 --- /dev/null +++ b/modules/gatk_rnaseq/1.0/gatk_rnaseq.smk @@ -0,0 +1,454 @@ +#!/usr/bin/env snakemake + + +##### ATTRIBUTION ##### + + +# Original Author: Nicole Thomas +# Module Author: Jasper Wong +# Contributors: N/A + + +##### SETUP ##### + + +# Import package with useful functions for developing analysis modules +import oncopipe as op + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + +# Setup module and store module-specific configuration in `CFG` +# `CFG` is a shortcut to `config["lcr-modules"]["gatk_rnaseq"]` +CFG = op.setup_module( + name = "gatk_rnaseq", + version = "1.0", + subdirectories = ["inputs", "gatk_splitntrim", "base_recal_report", "gatk_applybqsr", "gatk_variant_calling", "merge_vcfs", "gatk_variant_filtration", "passed", "gnomad_filter", "outputs"], +) + +# Define rules to be run locally when using a compute cluster +localrules: + _gatk_rnaseq_input_bam, + _gatk_rnaseq_output_vcf, + _gatk_rnaseq_all, + + +##### RULES ##### + + +# Symlinks the input files into the module results directory (under '00-inputs/') +rule _gatk_rnaseq_input_bam: + input: + bam = CFG["inputs"]["sample_bam"], + bai = CFG["inputs"]["sample_bai"] + output: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai", + crai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.crai", + run: + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) + op.absolute_symlink(input.bai, output.crai) + + +rule _gatk_splitntrim: + input: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") + output: + bam = temp(CFG["dirs"]["gatk_splitntrim"] + "bam/{seq_type}--{genome_build}/{sample_id}.split_reassign_mq.bam"), + bai = temp(CFG["dirs"]["gatk_splitntrim"] + "bam/{seq_type}--{genome_build}/{sample_id}.split_reassign_mq.bai") + log: + stdout = CFG["logs"]["gatk_splitntrim"] + "{seq_type}--{genome_build}/{sample_id}.gatk_splitntrim.stdout.log", + stderr = CFG["logs"]["gatk_splitntrim"] + "{seq_type}--{genome_build}/{sample_id}.gatk_splitntrim.stderr.log" + params: + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), + opts = CFG["options"]["java_opts"], + gatk_opts = CFG["options"]["gatk_splitntrim"] + conda: + CFG["conda_envs"]["gatk_rnaseq"] + group: "split_bam" + threads: + CFG["threads"]["gatk_splitntrim"] + resources: + **CFG["resources"]["gatk_splitntrim"] + shell: + op.as_one_line(""" + gatk --java-options "-Xmx{params.mem_mb}m {params.opts}" SplitNCigarReads {params.gatk_opts} + -R {input.fasta} -I {input.bam} -O {output.bam} + > {log.stdout} 2> {log.stderr} + """) + +rule _gatk_addRG: + input: + bam = str(rules._gatk_splitntrim.output.bam), + bai = str(rules._gatk_splitntrim.output.bai) + output: + bam = temp(CFG["dirs"]["gatk_splitntrim"] + "bam_withRG/{seq_type}--{genome_build}/{sample_id}.withRG.bam"), + bai = temp(CFG["dirs"]["gatk_splitntrim"] + "bam_withRG/{seq_type}--{genome_build}/{sample_id}.withRG.bam.bai") + params: + sampleName = "{sample_id}", + platform = CFG["options"]["gatk_addRG"]["platform"], + unit = CFG["options"]["gatk_addRG"]["unit"], + stringency = CFG["options"]["gatk_addRG"]["stringency"] + conda: + CFG["conda_envs"]["picard"] + group: "split_bam" + log: + stdout = CFG["logs"]["gatk_splitntrim"] + "bam_withRG/{seq_type}--{genome_build}/{sample_id}.addRG.stdout.log" + threads: + CFG["threads"]["gatk_addRG"] + resources: + **CFG["resources"]["gatk_addRG"] + shell: + op.as_one_line(""" + picard AddOrReplaceReadGroups VALIDATION_STRINGENCY={params.stringency} I={input.bam} O={output.bam} RGLB={params.sampleName} RGPL={params.platform} RGPU={params.unit} RGSM={params.sampleName} > {log.stdout} 2>&1 && + samtools index {output.bam} + """) + +rule _gatk_base_recalibration: + input: + bam = str(rules._gatk_addRG.output.bam), + fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") + output: + table = CFG["dirs"]["base_recal_report"] + "{seq_type}--{genome_build}/{sample_id}.recalibration_report.grp" + log: + stdout = CFG["logs"]["base_recal_report"] + "{seq_type}--{genome_build}/{sample_id}.gatk_base_recal.stdout.log", + stderr = CFG["logs"]["base_recal_report"] + "{seq_type}--{genome_build}/{sample_id}.gatk_base_recal.stderr.log" + params: + dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), + gnomad = reference_files("genomes/{genome_build}/variation/af-only-gnomad.{genome_build}.vcf.gz"), + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), + opts = CFG["options"]["java_opts"], + gatk_opts = CFG["options"]["gatk_baserecalibrator"] + conda: + CFG["conda_envs"]["gatk_rnaseq"] + group: "split_bam" + threads: CFG["threads"]["gatk_base_recalibration"] + resources: + **CFG["resources"]["gatk_base_recalibration"] + shell: # -known-sites {params.gnomad} + op.as_one_line(""" + gatk --java-options "-Xmx{params.mem_mb}m {params.opts}" BaseRecalibrator {params.gatk_opts} + -R {input.fasta} -I {input.bam} -known-sites {params.dbsnp} -O {output.table} > {log.stdout} 2> {log.stderr} + """) + +rule _gatk_applybqsr: + input: + bam = str(rules._gatk_addRG.output.bam), + bai = str(rules._gatk_addRG.output.bai), + table = str(rules._gatk_base_recalibration.output.table), + fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") + output: + # bam = temp(CFG["dirs"]["gatk_applybqsr"] + "{seq_type}--{genome_build}/{sample_id}.recalibrated.bam"), + # bai = temp(CFG["dirs"]["gatk_applybqsr"] + "{seq_type}--{genome_build}/{sample_id}.recalibrated.bai") + bam = CFG["dirs"]["gatk_applybqsr"] + "{seq_type}--{genome_build}/{sample_id}.recalibrated.bam", + bai = CFG["dirs"]["gatk_applybqsr"] + "{seq_type}--{genome_build}/{sample_id}.recalibrated.bai" + log: + stdout = CFG["logs"]["gatk_applybqsr"] + "{seq_type}--{genome_build}/{sample_id}.gatk_base_recal.stdout.log", + stderr = CFG["logs"]["gatk_applybqsr"] + "{seq_type}--{genome_build}/{sample_id}.gatk_base_recal.stderr.log" + params: + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), + opts = CFG["options"]["java_opts"], + gatk_opts = CFG["options"]["gatk_applybqsr"] + conda: + CFG["conda_envs"]["gatk_rnaseq"] + group: "split_bam" + threads: CFG["threads"]["gatk_applybqsr"] + resources: + **CFG["resources"]["gatk_applybqsr"] + shell: + op.as_one_line(""" + gatk --java-options "-Xmx{params.mem_mb}m {params.opts}" ApplyBQSR {params.gatk_opts} + -R {input.fasta} -I {input.bam} -bqsr-recal-file {input.table} -O {output.bam} > {log.stdout} 2> {log.stderr} + """) + + +rule _gatk_variant_calling: + input: + bam = str(rules._gatk_applybqsr.output.bam), + bai = str(rules._gatk_applybqsr.output.bai), + fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") + output: + vcf = temp(CFG["dirs"]["gatk_variant_calling"] + "{seq_type}--{genome_build}/{sample_id}.{chrom}.vcf.gz"), + tbi = temp(CFG["dirs"]["gatk_variant_calling"] + "{seq_type}--{genome_build}/{sample_id}.{chrom}.vcf.gz.tbi") + log: + stdout = CFG["logs"]["gatk_variant_calling"] + "{seq_type}--{genome_build}/{sample_id}.{chrom}.gatk_base_recal.stdout.log", + stderr = CFG["logs"]["gatk_variant_calling"] + "{seq_type}--{genome_build}/{sample_id}.{chrom}.gatk_base_recal.stderr.log" + params: + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), + opts = CFG["options"]["java_opts"], + vcf = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), + min_conf_thres = CFG["options"]["gatk_variant_calling"]["min_conf_thres"], + gatk_opts = CFG["options"]["gatk_variant_calling"]["gatk_opts"] + conda: + CFG["conda_envs"]["gatk_rnaseq"] + group: "split_bam" + threads: CFG["threads"]["gatk_variant_calling"] + resources: + **CFG["resources"]["gatk_variant_calling"] + shell: + op.as_one_line(""" + gatk --java-options "-Xmx{params.mem_mb}m {params.opts}" HaplotypeCaller {params.gatk_opts} + -R {input.fasta} -I {input.bam} -dont-use-soft-clipped-bases -stand-call-conf {params.min_conf_thres} -L {wildcards.chrom} + -dbsnp {params.vcf} -O {output.vcf} > {log.stdout} 2> {log.stderr} + """) + + +def _gatk_rnaseq_get_chr_vcfs(wildcards): + CFG = config["lcr-modules"]["gatk_rnaseq"] + chrs = reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes.txt") + with open(chrs) as file: + chrs = file.read().rstrip("\n").split("\n") + vcfs = expand( + CFG["dirs"]["gatk_variant_calling"] + "{{seq_type}}--{{genome_build}}/{{sample_id}}.{chrom}.vcf.gz", + chrom = chrs + ) + return(vcfs) + + +def _gatk_rnaseq_get_chr_tbis(wildcards): + CFG = config["lcr-modules"]["gatk_rnaseq"] + chrs = reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes.txt") + with open(chrs) as file: + chrs = file.read().rstrip("\n").split("\n") + tbis = expand( + CFG["dirs"]["gatk_variant_calling"] + "{{seq_type}}--{{genome_build}}/{{sample_id}}.{chrom}.vcf.gz.tbi", + chrom = chrs + ) + return(tbis) + + +# Merge chromosome gatk_rnaseq VCFs from the same sample +rule _gatk_rnaseq_merge_vcfs: + input: + vcf = _gatk_rnaseq_get_chr_vcfs, + tbi = _gatk_rnaseq_get_chr_tbis + output: + vcf = CFG["dirs"]["merge_vcfs"] + "{seq_type}--{genome_build}/{sample_id}.output.vcf.gz", + tbi = CFG["dirs"]["merge_vcfs"] + "{seq_type}--{genome_build}/{sample_id}.output.vcf.gz.tbi" + log: + stderr = CFG["logs"]["merge_vcfs"] + "{seq_type}--{genome_build}/{sample_id}.merge_vcfs_merge_vcfs.stderr.log" + conda: + CFG["conda_envs"]["bcftools"] + threads: + CFG["threads"]["merge_vcfs"] + resources: + **CFG["resources"]["merge_vcfs"] + params: + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + shell: + op.as_one_line(""" + bcftools concat --threads {threads} -a -O z {input.vcf} 2> {log.stderr} + | + bcftools sort -m {params.mem_mb}M -O z -o {output.vcf} 2>> {log.stderr} + && + bcftools index -t --threads {threads} {output.vcf} 2>> {log.stderr} + """) + +# select SNPs only since INDELs are not reliable +rule _gatk_variant_selection: + input: + vcf = str(rules._gatk_rnaseq_merge_vcfs.output.vcf), + tbi = str(rules._gatk_rnaseq_merge_vcfs.output.tbi), + fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") + output: + vcf = temp(CFG["dirs"]["gatk_variant_filtration"] + "{seq_type}--{genome_build}/{sample_id}.SNP.vcf.gz"), + tbi = temp(CFG["dirs"]["gatk_variant_filtration"] + "{seq_type}--{genome_build}/{sample_id}.SNP.vcf.gz.tbi") + log: + stdout = CFG["logs"]["gatk_variant_filtration"] + "{seq_type}--{genome_build}/{sample_id}.variant_select.stdout.log", + stderr = CFG["logs"]["gatk_variant_filtration"] + "{seq_type}--{genome_build}/{sample_id}.variant_select.stderr.log" + conda: + CFG["conda_envs"]["gatk_rnaseq"] + threads: CFG["threads"]["gatk_variant_filtration"] + resources: + **CFG["resources"]["gatk_variant_filtration"] + shell: + op.as_one_line(""" + gatk --java-options "-Xmx{params.mem_mb}m {params.opts}" SelectVariants -R {input.fasta} -V {input.vcf} + --select-type-to-include SNP -restrict-alleles-to BIALLELIC + -O {output.vcf} > {log.stdout} 2> {log.stderr} + """) + + +rule _gatk_variant_filtration: + input: + vcf = str(rules._gatk_variant_selection.output.vcf), + tbi = str(rules._gatk_variant_selection.output.tbi), + fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") + output: + vcf = CFG["dirs"]["gatk_variant_filtration"] + "{seq_type}--{genome_build}/{sample_id}.filtered.vcf.gz", + tbi = CFG["dirs"]["gatk_variant_filtration"] + "{seq_type}--{genome_build}/{sample_id}.filtered.vcf.gz.tbi" + log: + stdout = CFG["logs"]["gatk_variant_filtration"] + "{seq_type}--{genome_build}/{sample_id}.variant_filt.stdout.log", + stderr = CFG["logs"]["gatk_variant_filtration"] + "{seq_type}--{genome_build}/{sample_id}.variant_filt.stderr.log" + params: + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), + opts = CFG["options"]["java_opts"], + window = CFG["options"]["gatk_variant_filtration"]["window"], + cluster_size = CFG["options"]["gatk_variant_filtration"]["cluster_size"], + filter_expression = CFG["options"]["gatk_variant_filtration"]["filter_expression"] + conda: + CFG["conda_envs"]["gatk_rnaseq"] + threads: CFG["threads"]["gatk_variant_filtration"] + resources: + **CFG["resources"]["gatk_variant_filtration"] + shell: + # flag to remove: FS - phred score with strand bias > 30; QD - Variant conf/qual by depth < 2; DP - read depth < 5 + op.as_one_line(""" + gatk --java-options "-Xmx{params.mem_mb}m {params.opts}" VariantFiltration -R {input.fasta} -V {input.vcf} + -window {params.window} -cluster-size {params.cluster_size} + {params.filter_expression} + -O {output.vcf} > {log.stdout} 2> {log.stderr} + """) + + +# minimum AD threshold (min 5 reads that support the alternate allele) +rule _gatk_filter_AD: + input: + vcf = str(rules._gatk_variant_filtration.output.vcf), + tbi = str(rules._gatk_variant_filtration.output.tbi) + output: + vcf = temp(CFG["dirs"]["gatk_variant_filtration"] + "{seq_type}--{genome_build}/{sample_id}.AD.filtered.vcf.gz"), + tbi = temp(CFG["dirs"]["gatk_variant_filtration"] + "{seq_type}--{genome_build}/{sample_id}.AD.filtered.vcf.gz.tbi") + log: + stderr = CFG["logs"]["gatk_variant_filtration"] + "{seq_type}--{genome_build}/{sample_id}.AD.variant_filt.stderr.log" + conda: + CFG["conda_envs"]["bcftools"] + threads: CFG["threads"]["gatk_variant_filtration"] + resources: + **CFG["resources"]["gatk_variant_filtration"] + shell: + op.as_one_line(""" + bcftools filter -i 'AD[0:1-] > 5' -Oz -o {output.vcf} {input.vcf} 2> {log.stderr} + && + tabix -p vcf {output.vcf} 2>> {log.stderr} + """) + +# Filters for PASS variants +rule _gatk_rnaseq_filter_passed: + input: + vcf = str(rules._gatk_filter_AD.output.vcf), + tbi = str(rules._gatk_filter_AD.output.tbi) + output: + vcf = CFG["dirs"]["passed"] + "{seq_type}--{genome_build}/temp/{sample_id}.output.passed.vcf.gz", + tbi = CFG["dirs"]["passed"] + "{seq_type}--{genome_build}/temp/{sample_id}.output.passed.vcf.gz.tbi" + params: + opts = CFG["options"]["gatk_rnaseq_filter_passed"]["params"] + log: + stderr = CFG["logs"]["passed"] + "{seq_type}--{genome_build}/{sample_id}.mutect2_filter_passed.stderr.log" + conda: + CFG["conda_envs"]["bcftools"] + threads: + CFG["threads"]["gatk_rnaseq_passed"] + resources: + **CFG["resources"]["gatk_rnaseq_passed"] + shell: + op.as_one_line(""" + bcftools view {params.opts} -Oz -o {output.vcf} {input.vcf} 2> {log.stderr} + && + tabix -p vcf {output.vcf} 2>> {log.stderr} + """) + + +# Remove AF tag in vcfs: +rule _gatk_rnaseq_removeAF: + input: + vcf = str(rules._gatk_rnaseq_filter_passed.output.vcf), + tbi = str(rules._gatk_rnaseq_filter_passed.output.tbi) + output: + vcf = temp(CFG["dirs"]["gnomad_filter"] + "{seq_type}--{genome_build}/{sample_id}.combined.removeAF.vcf.gz"), + tbi = temp(CFG["dirs"]["gnomad_filter"] + "{seq_type}--{genome_build}/{sample_id}.combined.removeAF.vcf.gz.tbi") + log: + stderr = CFG["logs"]["gnomad_filter"] + "{seq_type}--{genome_build}/{sample_id}.removeAF.stderr.log" + conda: + CFG["conda_envs"]["bcftools"] + threads: + CFG["threads"]["gnomad_filter"] + resources: + **CFG["resources"]["gnomad_filter"] + shell: + op.as_one_line(""" + bcftools annotate --threads {threads} -x INFO/AF {input.vcf} -Oz -o {output.vcf} 2> {log.stderr} + && + tabix -p vcf {output.vcf} 2>> {log.stderr} + """) + + +# Annotate and filter out common gnomad variants +rule _gatk_rnaseq_annotate_gnomad: + input: + vcf = str(rules._gatk_rnaseq_removeAF.output.vcf), + tbi = str(rules._gatk_rnaseq_removeAF.output.tbi), + gnomad = reference_files("genomes/{genome_build}/variation/af-only-gnomad.{genome_build}.vcf.gz") + output: + vcf = CFG["dirs"]["gnomad_filter"] + "{seq_type}--{genome_build}/{sample_id}.combined.gnomad.vcf.gz", + tbi = CFG["dirs"]["gnomad_filter"] + "{seq_type}--{genome_build}/{sample_id}.combined.gnomad.vcf.gz.tbi" + log: + stderr = CFG["logs"]["gnomad_filter"] + "{seq_type}--{genome_build}/{sample_id}.gnomad_filter.stderr.log" + conda: + CFG["conda_envs"]["bcftools"] + threads: + CFG["threads"]["gnomad_filter"] + resources: + **CFG["resources"]["gnomad_filter"] + shell: + op.as_one_line(""" + bcftools annotate --threads {threads} -a {input.gnomad} -c INFO/AF {input.vcf} | + awk 'BEGIN {{FS=OFS="\\t"}} {{ if ($1 !~ /^#/ && $8 !~ ";AF=") $8=$8";AF=0"; print $0; }}' | + bcftools view -i 'INFO/AF < 0.0001' -Oz -o {output.vcf} 2> {log.stderr} + && + tabix -p vcf {output.vcf} 2>> {log.stderr} + """) + + +# Symlinks the final output files into the module results directory (under '99-outputs/') +rule _gatk_rnaseq_output_vcf: + input: + vcf = str(rules._gatk_rnaseq_annotate_gnomad.output.vcf), + tbi = str(rules._gatk_rnaseq_annotate_gnomad.output.tbi) + output: + vcf = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{sample_id}.output.filt.vcf.gz", + tbi = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{sample_id}.output.filt.vcf.gz.tbi" + run: + op.relative_symlink(input.vcf, output.vcf, in_module = True) + op.relative_symlink(input.tbi, output.tbi, in_module = True) + + +# Generates the target sentinels for each run, which generate the symlinks +rule _gatk_rnaseq_all: + input: + expand( + [ + str(rules._gatk_rnaseq_output_vcf.output.vcf), + str(rules._gatk_rnaseq_output_vcf.output.tbi) + ], + zip, # Run expand() with zip(), not product() + seq_type=CFG["runs"]["tumour_seq_type"], + genome_build=CFG["runs"]["tumour_genome_build"], + sample_id=CFG["runs"]["tumour_sample_id"]) + + +##### CLEANUP ##### + + +# Perform some clean-up tasks, including storing the module-specific +# configuration on disk and deleting the `CFG` variable +op.cleanup_module(CFG) diff --git a/modules/gatk_rnaseq/1.0/schemas/base-1.0.yaml b/modules/gatk_rnaseq/1.0/schemas/base-1.0.yaml new file mode 120000 index 00000000..0a69d1ce --- /dev/null +++ b/modules/gatk_rnaseq/1.0/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/gatk_rnaseq/CHANGELOG.md b/modules/gatk_rnaseq/CHANGELOG.md new file mode 100644 index 00000000..db532cdb --- /dev/null +++ b/modules/gatk_rnaseq/CHANGELOG.md @@ -0,0 +1,18 @@ +# Changelog + +All notable changes to the `gatk_rnaseq` module will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [1.0] - 2021-04-23 + +This release was authored by Jasper Wong. + +The gatk_rnaseq module follows GATK's variant-calling for RNA-seq pipeline, which involves: +1) SplitNCigarReads - Splitting reads into exon segments (getting rid of N's but keeping grouping information) and hard-clipping overhangs (that cross into intron regions). By default, this also reassigns mapping quality for good alignments (to match DNA alignments) +2) BaseRecalibrator (and applyBQSR) - creates a table that detects and corrects for patterns of systematic errors in the base quality scores. Applies this recalibration step on the BAMs. +3) Variant-calling - I split up the variant calling into one per chrom (24 threads), and run HaplotypeCaller. Default is phred-scaled confidence threshold of 20 for calling variants. Filters out db-snp regions. +4) Variant-filtration - Several hard-filters applied here. Flag any variants that are FS > 30, QD < 2, DP < 5. +5) Select only passed variants. +