diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7096c7da..8e4525ad 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -16,6 +16,8 @@ - [ ] Input and output files are being symlinked into the `CFG["inputs"]` and `CFG["outputs"]` subdirectories, respectively. +- [ ] I grouped the input symlinking rule to the next job that uses the input files. + - [ ] I updated the final target rule (`*_all`) to include every output rule. - [ ] I explained important module design decisions in `CHANGELOG.md`. @@ -48,4 +50,11 @@ ## Checklist for Updated Module -To be completed. +Important! If you are updating the module version, ensure the previous version of the module is restored from master. +If you want to restore a deleted file or directory from the remote master, you can use `git checkout origin/master path/to/file`, +then a `git commit` will ensure that file is tracked on your branch again. +Example: +``` +mv modules/strelka/1.1 modules/strelka/1.2 +git checkout origin/master modules/strelka/1.1 +``` diff --git a/demo/Snakefile b/demo/Snakefile deleted file mode 100755 index 96d9827b..00000000 --- a/demo/Snakefile +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env snakemake - - -##### SETUP ##### - -import oncopipe as op - -SAMPLES = op.load_samples("data/samples.tsv") -CAPTURE = op.filter_samples(SAMPLES, seq_type = "capture") - - -##### REFERENCE_FILES WORKFLOW ##### - - -subworkflow reference_files: - workdir: - "reference/" - snakefile: - "../workflows/reference_files/2.4/reference_files.smk" - configfile: - "../workflows/reference_files/2.4/config/default.yaml" - - -##### CONFIGURATION FILES ##### - - -# Load module-specific configuration -configfile: "../modules/utils/2.1/config/default.yaml" -configfile: "../modules/picard_qc/1.0/config/default.yaml" -configfile: "../modules/salmon/1.1/config/default.yaml" -configfile: "../modules/bam2fastq/1.2/config/default.yaml" -configfile: "../modules/star/1.4/config/default.yaml" -configfile: "../modules/manta/2.3/config/default.yaml" -configfile: "../modules/gridss/1.1/config/default.yaml" -configfile: "../modules/vcf2maf/1.2/config/default.yaml" -configfile: "../modules/sequenza/1.4/config/default.yaml" -configfile: "../modules/strelka/1.1/config/default.yaml" -configfile: "../modules/bwa_mem/1.1/config/default.yaml" -configfile: "../modules/controlfreec/1.1/config/default.yaml" -configfile: "../modules/lofreq/1.0/config/default.yaml" -configfile: "../modules/starfish/2.0/config/default.yaml" -configfile: "../modules/sage/1.0/config/default.yaml" -configfile: "../modules/slms_3/1.0/config/default.yaml" -configfile: "../modules/ichorcna/1.0/config/default.yaml" -configfile: "../modules/gatk_rnaseq/1.0/config/default.yaml" - -# Load project-specific config, which includes the shared -# configuration and some module-specific config updates -configfile: "config.yaml" - - -##### CONFIGURATION UPDATES ##### - - -# Use all samples as a default sample list for each module -config["lcr-modules"]["_shared"]["samples"] = SAMPLES -config["lcr-modules"]["starfish"]["samples"] = CAPTURE - -##### MODULE SNAKEFILES ##### - - -# Load module-specific snakefiles - -include: "../modules/slms_3/1.0/slms_3.smk" -include: "../modules/utils/2.1/utils.smk" -include: "../modules/picard_qc/1.0/picard_qc.smk" -include: "../modules/salmon/1.1/salmon.smk" -include: "../modules/star/1.4/star.smk" -include: "../modules/manta/2.3/manta.smk" -include: "../modules/vcf2maf/1.2/vcf2maf.smk" -include: "../modules/sequenza/1.4/sequenza.smk" -include: "../modules/strelka/1.1/strelka.smk" -include: "../modules/bwa_mem/1.1/bwa_mem.smk" -include: "../modules/gridss/1.1/gridss.smk" -include: "../modules/bam2fastq/1.2/bam2fastq.smk" -include: "../modules/controlfreec/1.1/controlfreec.smk" -include: "../modules/lofreq/1.0/lofreq.smk" -include: "../modules/starfish/2.0/starfish.smk" -include: "../modules/sage/1.0/sage.smk" -include: "../modules/ichorcna/1.0/ichorcna.smk" -include: "../modules/gatk_rnaseq/1.0/gatk_rnaseq.smk" - -##### TARGETS ###### - -rule all: - input: - rules._picard_qc_all.input, - rules._salmon_all.input, - rules._bam2fastq_all.input, - rules._star_all.input, - rules._manta_all.input, - rules._sequenza_all.input, - rules._lofreq_all.input, - rules._strelka_all.input, - rules._bwa_mem_all.input, - rules._liftover_all.input, - rules._controlfreec_all.input, - rules._gridss_all.input, - rules._controlfreec_all.input, - rules._starfish_all.input, - rules._vcf2maf_all.input, - rules._sage_all.input, - rules._slms_3_all.input, - rules._ichorcna_all.input, - rules._gatk_rnaseq_all.input - diff --git a/demo/config.yaml b/demo/config.yaml deleted file mode 100755 index 74022ead..00000000 --- a/demo/config.yaml +++ /dev/null @@ -1,179 +0,0 @@ -lcr-modules: - - _shared: - lcr-modules: "../" - lcr-scripts: "../../lcr-scripts/" - root_output_dir: "results/" - scratch_directory: "scratch/" - unmatched_normal_ids: - capture--grch37: "TCRBOA7-N-WEX" - - slms_3: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - - - bam2fastq: - inputs: - sample_bam: "data/{sample_id}.bam" - temp_outputs: False # fastq outputs will be temporary - - star: - inputs: - sample_fastq_1: "results/bam2fastq-1.2/99-outputs/{seq_type}/{sample_id}.read1.fastq.gz" - sample_fastq_2: "results/bam2fastq-1.2/99-outputs/{seq_type}/{sample_id}.read2.fastq.gz" - scratch_subdirectories: [] - - manta: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - - mixcr: - inputs: - sample_fastq_1: "data/{sample_id}.read1.fastq.gz" - sample_fastq_2: "data/{sample_id}.read2.fastq.gz" - - vcf2maf: - dirs: - _parent: "results/sage-1.0_vcf2maf-1.2" - inputs: - vep_cache: "reference/vep_caches/" - sample_vcf_gz: "results/sage-1.0/99-outputs/combined/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.{base_name}.vcf.gz" - convert_coord: "{SCRIPTSDIR}/crossmap/1.0/convert_maf_coords.sh" - vcf_base_name: "sage.combined" - options: - vcf2maf: "--filter-vcf 0 --vcf-tumor-id {tumour_id} --vcf-normal-id {normal_id}" - species: "homo_sapiens" - conda_envs: - vcf2maf: "{MODSDIR}/envs/vcf2maf-1.6.18.yaml" - crossmap: "{SCRIPTSDIR}/crossmap/1.0/convert_maf_coords.yaml" - # here you can specify path to txt file with a list of custom ENST IDs that override canonical selection - # it will be parsed to --custom-enst flag of vcf2maf - # if no non-canonical transcript IDs to be included, leave switches empty - # This is just an example of how to include the list of custom IDs - switches: - custom_enst: - hg38: "" - grch37: "data/custom_enst.txt" - hs37d5: "" - resources: - vcf2maf: - mem_mb: 12000 - vcf: 1 - crossmap: - mem_mb: 12000 - - - salmon: - inputs: - sample_fastq_1: "data/{sample_id}.read1.fastq.gz" - sample_fastq_2: "data/{sample_id}.read2.fastq.gz" - transcriptome: - quant_to: "hg38" - - sequenza: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - scratch_subdirectories: [] - - lofreq: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - lofreq_filter: "{MODSDIR}/src/bash/lofreq_filter.sh" - switches: - # Intentionally running LoFreq without a BED file for simplicity - # And to avoid having to include a large BED file in the repo - regions_bed: - _default: "" - capture: "" - - gridss: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - references: - # See the current gridss module config file for details about where to obtain this file. - viral_fa: "/projects/rmorin/projects/DLBCL_DHITsig_genomes/reference/gridss/refgenomes/human_virus/human_virus.fa" - viral_bwa_prefix: "/projects/rmorin/projects/DLBCL_DHITsig_genomes/reference/gridss/refgenomes/human_virus/human_virus.fa" - pon_dir: "/projects/rmorin/reference/hmftools-references/gridss/pon" - - strelka: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - # if using manta output, use vcf file in the 99-outputs subdirectory and ensure manta version corresponds to the loaded module - candidate_small_indels: "results/manta-2.3/99-outputs/vcf/{seq_type}--{genome_build}/candidateSmallIndels/{tumour_id}--{normal_id}--{pair_status}.candidateSmallIndels.vcf" - - utils: - inputs: - bed: - grch37: "data/exome_bed/hg19/target_regions.nochr.bed" # make sure this corresponds with config["lcr-modules"]["picard_qc"]["inputs"]["intervals"] - # if testing on GSC, use this file: "/projects/dscott_prj/CCSRI_1500/exomes/ref/agilent/hg19/target_regions.nochr.bed" - mem_mb: - bam_sort: 48000 - threads: - bam_sort: 12 - - picard_qc: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - switches: - capture_intervals: - _default: "reference/exomes/grch37/interval/target_regions.nochr_intervals.txt" - # if 'capture_kit_id' is a column in samples.tsv and contain more than one kit_id, specify each kit using the values in the column. e.g. and add the corresponding bed file if needed - # S07604624: "reference/exomes/grch37/interval/S07604624_intervals.txt" - # : "reference/exomes/grch38/interval/_intervals.txt" - - bwa_mem: - inputs: - sample_fastq_1: "results/bam2fastq-1.2/99-outputs/{seq_type}/{sample_id}.read1.fastq.gz" - sample_fastq_2: "results/bam2fastq-1.2/99-outputs/{seq_type}/{sample_id}.read2.fastq.gz" - scratch_subdirectories: [] - - controlfreec: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - - liftover: - tool: "battenberg" - inputs: - sample_seg: "data/{tool}/hg38/{tumour_sample_id}--{normal_sample_id}_subclones.igv.seg" - - ichorcna: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - - scratch_subdirectories: [] # mpileup should be in scratch space - - starfish: - dirs: - _parent: "results/starfish-2.0_strelka-1.1_lofreq-1.0" - inputs: - names: ["strelka", "lofreq"] - paths: - [ - "results/strelka-1.1/99-outputs/vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.strelka.combined.vcf.gz", - "results/lofreq-1.0/99-outputs/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.lofreq.snvs.vcf.gz" - ] - - sage: - inputs: - # Available wildcards: {seq_type} {genome_build} {sample_id} - sample_bam: "data/{sample_id}.bam" - - # include here any additional flags to modify default parameters - options: - sage_run: "" - - - gatk_rnaseq: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" \ No newline at end of file diff --git a/demo/data/samples.tsv b/demo/data/samples.tsv index 1efd24dc..158f6972 100755 --- a/demo/data/samples.tsv +++ b/demo/data/samples.tsv @@ -2,3 +2,5 @@ sample_id seq_type patient_id tissue_status genome_build strand read_length TCRBOA7-N-WEX capture TCRBOA7 normal grch37 positive 100 TCRBOA7-T-WEX capture TCRBOA7 tumour grch37 positive 100 TCRBOA7-T-RNA mrna TCRBOA7 tumour grch37 positive 100 +TCRBOA7-N-WGS genome TCRBOA7 normal grch37 positive 100 +TCRBOA7-T-WGS genome TCRBOA7 tumour grch37 positive 100 diff --git a/demo/dry-run.sh b/demo/dry-run.sh index 326a267e..eba90895 100755 --- a/demo/dry-run.sh +++ b/demo/dry-run.sh @@ -1,6 +1,18 @@ #!/bin/bash +# Launches a snakefile of your choice in dry run mode (for debugging) +# Usage: ./dry_run.sh "" +# Example: ./dry_run.sh example.smk example_all +# snakefile.smk The snakefile you want to run +# target_rule: The name of one of the target rules specified in one of the included Snakefiles +# snakemake_flags: One or more flags for the snakemake to run, specified inside quotation marks + + # Default to all targets -TARGETS=${@:-all} +snakefile=$1 +TARGETS=${2:-all} +snakemake_flags=$3 + +snakemake --dryrun --cores 24 $snakemake_flags -s $snakefile --printshellcmds --reason --use-conda $TARGETS + -snakemake --dryrun --cores 24 --printshellcmds --reason --use-conda $TARGETS diff --git a/demo/run.sh b/demo/run.sh index aeda0d1e..8e711e96 100755 --- a/demo/run.sh +++ b/demo/run.sh @@ -1,7 +1,18 @@ #!/bin/bash + +# Launches a snakefile of your choice in dry run mode (for debugging) +# Usage: ./dry_run.sh "" +# Example: ./dry_run.sh example.smk example_all +# snakefile.smk The snakefile you want to run +# target_rule: The name of one of the target rules specified in one of the included Snakefiles +# snakemake_flags: One or more flags for the snakemake to run, specified inside quotation marks + + # Default to all targets -TARGETS=${@:-all} +snakefile=$1 +TARGETS=${2:-all} +snakemake_flags=$3 # Determine the number of available cores for parallelization NUM_CORES=$(grep -c '^processor' /proc/cpuinfo) @@ -17,4 +28,6 @@ if (( $CORES_AVAILABLE <= 0 )); then echo "Check out top/htop to see what other jobs are currently running." exit 1 fi -nice -n 10 snakemake --cores "${CORES_AVAILABLE}" --keep-going --latency-wait 120 --use-conda "$TARGETS" +nice -n 10 snakemake --cores "${CORES_AVAILABLE}" $snakemake_flags -s $snakefile --keep-going --latency-wait 120 --use-conda $TARGETS + + diff --git a/docs/source/for_developers.rst b/docs/source/for_developers.rst index 71a17ffe..dfe1a86f 100644 --- a/docs/source/for_developers.rst +++ b/docs/source/for_developers.rst @@ -21,7 +21,7 @@ Getting Started # conda create -n lcr-modules "python>=3.6" # conda activate lcr-modules - conda install cookiecutter git + conda install -c conda-forge cookiecutter 4. Clone the `lcr-modules repository`_ and the `lcr-scripts repository`_. diff --git a/images/module_levels.png b/images/module_levels.png index f1cbbe41..15a362aa 100644 Binary files a/images/module_levels.png and b/images/module_levels.png differ diff --git a/modules/bam2fastq/1.2/bam2fastq.smk b/modules/bam2fastq/1.2/bam2fastq.smk index 43e0a244..161cd387 100644 --- a/modules/bam2fastq/1.2/bam2fastq.smk +++ b/modules/bam2fastq/1.2/bam2fastq.smk @@ -15,6 +15,27 @@ # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["bam2fastq"]` CFG = op.setup_module( @@ -57,7 +78,7 @@ rule _bam2fastq_input_bam: output: bam = CFG["dirs"]["inputs"] + "{seq_type}--{genome_build}/{sample_id}.bam" run: - op.relative_symlink(input, output.bam) + op.absolute_symlink(input, output.bam) # Conditional rules depending on whether or not fastq outputs will be temporary @@ -126,8 +147,8 @@ rule _bam2fastq_output: fastq_1 = CFG["dirs"]["outputs"] + "{seq_type}/{sample_id}.read1.fastq.gz", fastq_2 = CFG["dirs"]["outputs"] + "{seq_type}/{sample_id}.read2.fastq.gz" run: - op.relative_symlink(input.fastq_1, output.fastq_1) - op.relative_symlink(input.fastq_2, output.fastq_2) + op.relative_symlink(input.fastq_1, output.fastq_1, in_module = True) + op.relative_symlink(input.fastq_2, output.fastq_2, in_module = True) rule _bam2fastq_all: diff --git a/modules/bam2fastq/1.2/bam2fastq_grouped.smk b/modules/bam2fastq/1.2/bam2fastq_grouped.smk index df5f5657..741ff1a4 100644 --- a/modules/bam2fastq/1.2/bam2fastq_grouped.smk +++ b/modules/bam2fastq/1.2/bam2fastq_grouped.smk @@ -15,6 +15,27 @@ # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["bam2fastq"]` CFG = op.setup_module( diff --git a/modules/battenberg/1.0/battenberg.smk b/modules/battenberg/1.0/battenberg.smk index b2da0809..c2a48d82 100644 --- a/modules/battenberg/1.0/battenberg.smk +++ b/modules/battenberg/1.0/battenberg.smk @@ -15,6 +15,26 @@ import oncopipe as op import glob +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["battenberg"]` CFG = op.setup_module( @@ -47,8 +67,8 @@ rule _battenberg_input_bam: bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai" run: - op.relative_symlink(input.bam, output.bam) - op.relative_symlink(input.bam + ".bai", output.bai) + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bam + ".bai", output.bai) # Installs the Battenberg R dependencies and associated software (impute2, alleleCounter) # Currently I think this rule has to be run twice for it to work properly because the conda environment is created here. @@ -158,9 +178,9 @@ rule _battenberg_output_seg: plots = glob.glob(params.batt_dir + "/*.png") for png in plots: bn = os.path.basename(png) - op.relative_symlink(png, params.png_dir + "/" + bn) - op.relative_symlink(input.seg, output.seg) - op.relative_symlink(input.sub, output.sub) + op.relative_symlink(png, params.png_dir + "/" + bn, in_module = True) + op.relative_symlink(input.seg, output.seg, in_module = True) + op.relative_symlink(input.sub, output.sub, in_module = True) # Generates the target sentinels for each run, which generate the symlinks rule _battenberg_all: diff --git a/modules/battenberg/1.0/config/default.yaml b/modules/battenberg/1.0/config/default.yaml index f69b5d00..647d82bb 100644 --- a/modules/battenberg/1.0/config/default.yaml +++ b/modules/battenberg/1.0/config/default.yaml @@ -6,7 +6,7 @@ lcr-modules: sample_bam: "__UPDATE__" battenberg_script: "{MODSDIR}/src/R/battenberg_wgs_hg38.R" calc_sex_status: "{MODSDIR}/src/bash/calc_sex_status.sh" - cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.0/cnv2igv.py" + cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.4/cnv2igv.py" #TODO: this should be tested with v1.2 of cnv2igv.py scratch_subdirectories: [] diff --git a/modules/battenberg/1.1/config/default.yaml b/modules/battenberg/1.1/config/default.yaml index 384415ef..5ae3a317 100644 --- a/modules/battenberg/1.1/config/default.yaml +++ b/modules/battenberg/1.1/config/default.yaml @@ -5,7 +5,7 @@ lcr-modules: # Available wildcards: {seq_type} {genome_build} {sample_id} sample_bam: "__UPDATE__" battenberg_script: "{MODSDIR}/src/battenberg_wgs_hg38.R" - cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.3/cnv2igv.py" + cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.4/cnv2igv.py" src_dir: "{MODSDIR}/src/" scratch_subdirectories: [] diff --git a/modules/battenberg/1.2/battenberg.smk b/modules/battenberg/1.2/battenberg.smk index a89b4b13..370c8033 100644 --- a/modules/battenberg/1.2/battenberg.smk +++ b/modules/battenberg/1.2/battenberg.smk @@ -44,7 +44,7 @@ CFG = op.setup_module( ) #set variable for prepending to PATH based on config -SCRIPT_PATH = CFG['inputs']['src_dir'] +BATTENBERG_SCRIPT_PATH = CFG['inputs']['src_dir'] #this is used in place of the shell.prefix() because that was not working consistently. This is not ideal. #this preserves the variable when using lambda functions @@ -54,7 +54,7 @@ _battenberg_CFG = CFG localrules: _battenberg_all -VERSION_MAP = { +BATTENBERG_VERSION_MAP = { "hg19": "grch37", "grch37": "grch37", "hs37d5": "grch37", @@ -77,10 +77,10 @@ rule _battenberg_get_reference: genomesloci = directory(CFG["dirs"]["inputs"] + "reference/{genome_build}/battenberg_1000genomesloci2012_v3") params: url = "https://www.bcgsc.ca/downloads/morinlab/reference", - alt_build = lambda w: VERSION_MAP[w.genome_build], + alt_build = lambda w: BATTENBERG_VERSION_MAP[w.genome_build], folder = CFG["dirs"]["inputs"] + "reference/{genome_build}", build = "{genome_build}", - PATH = CFG['inputs']['src_dir'] + battenberg_path = CFG['inputs']['src_dir'] resources: **CFG["resources"]["reference"] threads: @@ -98,7 +98,7 @@ rule _battenberg_get_reference: && wget -O {output.impute_info} {params.url}/impute_info_{params.alt_build}.txt && - python {params.PATH}/reference_correction.py {params.build} + python {params.battenberg_path}/reference_correction.py {params.build} $(dirname $(readlink -f {output.impute_info})) && wget -qO- {params.url}/battenberg_{params.alt_build}_replic_correction.tar.gz | tar -xvz > {output.battenberg_wgs_replic_correction} -C {params.folder} @@ -155,7 +155,7 @@ rule _infer_patient_sex: threads: 8 shell: op.as_one_line(""" - PATH={SCRIPT_PATH}:$PATH; + PATH={BATTENBERG_SCRIPT_PATH}:$PATH; echo "running {rule} for {wildcards.normal_id} on $(hostname) at $(date)" > {log.stderr} ; calc_sex_status.sh {input.normal_bam} {input.fasta} {wildcards.normal_id} > {output.sex_result} 2>> {log.stderr} && echo "DONE running {rule} for {wildcards.normal_id} on $(hostname) at $(date)" >> {log.stderr} @@ -171,7 +171,7 @@ rule _run_battenberg: installed = CFG["dirs"]["inputs"] + "battenberg_dependencies_installed.success", sex_result = CFG["dirs"]["infer_sex"] + "{seq_type}--{genome_build}/{normal_id}.sex", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), - impute_info = CFG["dirs"]["inputs"] + "reference/{genome_build}/impute_info.txt" + impute_info = str(rules._battenberg_get_reference.output.impute_info) output: refit=CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_refit_suggestion.txt", @@ -205,7 +205,7 @@ rule _run_battenberg: sex=$(cut -f 4 {input.sex_result}| tail -n 1); echo "setting sex as $sex"; Rscript {params.script} -t {wildcards.tumour_id} - -n {wildcards.normal_id} --tb {input.tumour_bam} --nb {input.normal_bam} -f {input.fasta} --reference {params.ref} + -n {wildcards.normal_id} --tb $(readlink -f {input.tumour_bam}) --nb $(readlink -f {input.normal_bam}) -f {input.fasta} --reference $(readlink -f {params.ref}) -o {params.out_dir} --chr_prefixed_genome $chr_prefixed --sex $sex --cpu {threads} >> {log.stdout} 2>> {log.stderr} && echo "DONE {rule} for {wildcards.tumour_id}--{wildcards.normal_id} on $(hostname) at $(date)" >> {log.stdout}; """) diff --git a/modules/battenberg/1.2/src/battenberg_wgs_hg38.R b/modules/battenberg/1.2/src/battenberg_wgs_hg38.R index fb9f3686..5f3a2707 100755 --- a/modules/battenberg/1.2/src/battenberg_wgs_hg38.R +++ b/modules/battenberg/1.2/src/battenberg_wgs_hg38.R @@ -26,7 +26,7 @@ opt_parser = OptionParser(option_list=option_list) opt = parse_args(opt_parser) original_dir = getwd() -REFERENCE_BASE = paste0(normalizePath(original_dir,"\\"), "/",opt$reference) +REFERENCE_BASE = opt$reference TUMOURNAME = opt$tumourname NORMALNAME = opt$normalname @@ -83,8 +83,9 @@ print(PROBLEMLOCI); # Change to work directory and load the chromosome information setwd(RUN_DIR) -NORMALBAM = paste0(normalizePath(original_dir,"\\"), "/",opt$nb) -TUMOURBAM = paste0(normalizePath(original_dir,"\\"), "/",opt$tb) +NORMALBAM = opt$nb +TUMOURBAM = opt$tb + #this should be the full path to the files after changing directories diff --git a/modules/battenberg/1.2/src/reference_correction.py b/modules/battenberg/1.2/src/reference_correction.py index 4b2b881c..be482fe7 100644 --- a/modules/battenberg/1.2/src/reference_correction.py +++ b/modules/battenberg/1.2/src/reference_correction.py @@ -27,12 +27,10 @@ import os import sys -cwd = os.getcwd() +cwd = sys.argv[2] fileIN = open( cwd - + "/results/battenberg-1.2/00-inputs/reference/" - + sys.argv[1] + "/impute_info.txt", "r", ) @@ -42,15 +40,11 @@ newdata = filedata.replace( "", cwd - + "/results/battenberg-1.2/00-inputs/reference/" - + sys.argv[1] + "/battenberg_impute_v3", ) fileOut = open( cwd - + "/results/battenberg-1.2/00-inputs/reference/" - + sys.argv[1] + "/impute_info.txt", "w", ) diff --git a/modules/bwa_mem/1.1/bwa_mem.smk b/modules/bwa_mem/1.1/bwa_mem.smk index beba1e7f..5d17a32a 100644 --- a/modules/bwa_mem/1.1/bwa_mem.smk +++ b/modules/bwa_mem/1.1/bwa_mem.smk @@ -16,6 +16,26 @@ import os # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["bwa_mem"]` CFG = op.setup_module( @@ -48,8 +68,8 @@ rule _bwa_mem_input_fastq: fastq_1 = CFG["dirs"]["inputs"] + "fastq/{seq_type}--{genome_build}/{sample_id}.R1.fastq.gz", fastq_2 = CFG["dirs"]["inputs"] + "fastq/{seq_type}--{genome_build}/{sample_id}.R2.fastq.gz", run: - op.relative_symlink(input.fastq_1, output.fastq_1) - op.relative_symlink(input.fastq_2, output.fastq_2) + op.absolute_symlink(input.fastq_1, output.fastq_1) + op.absolute_symlink(input.fastq_2, output.fastq_2) rule _bwa_mem_run: @@ -118,7 +138,7 @@ rule _bwa_mem_symlink_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) + op.absolute_symlink(input.bam, output.bam) rule _bwa_mem_symlink_sorted_bam: @@ -130,7 +150,7 @@ rule _bwa_mem_symlink_sorted_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) + op.absolute_symlink(input.bam, output.bam) os.remove(input.bwa_mem_bam) shell("touch {input.bwa_mem_bam}.deleted") @@ -146,8 +166,8 @@ rule _bwa_mem_output_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) - op.relative_symlink(input.bai, output.bam + ".bai") + op.relative_symlink(input.bam, output.bam, in_module=True) + op.relative_symlink(input.bai, output.bam + ".bai", in_module=True) os.remove(input.sorted_bam) shell("touch {input.sorted_bam}.deleted") diff --git a/modules/bwa_mem/1.1/bwa_mem_grouped.smk b/modules/bwa_mem/1.1/bwa_mem_grouped.smk index 22bc7229..05afba60 100644 --- a/modules/bwa_mem/1.1/bwa_mem_grouped.smk +++ b/modules/bwa_mem/1.1/bwa_mem_grouped.smk @@ -16,6 +16,26 @@ import os # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["bwa_mem"]` CFG = op.setup_module( @@ -50,8 +70,8 @@ rule _bwa_mem_input_fastq: group: CFG["group"]['bwa-mem'] run: - op.relative_symlink(input.fastq_1, output.fastq_1) - op.relative_symlink(input.fastq_2, output.fastq_2) + op.absolute_symlink(input.fastq_1, output.fastq_1) + op.absolute_symlink(input.fastq_2, output.fastq_2) rule _bwa_mem_run: @@ -121,7 +141,7 @@ rule _bwa_mem_symlink_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) + op.absolute_symlink(input.bam, output.bam) rule _bwa_mem_symlink_sorted_bam: @@ -133,7 +153,7 @@ rule _bwa_mem_symlink_sorted_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) + op.absolute_symlink(input.bam, output.bam) os.remove(input.bwa_mem_bam) shell("touch {input.bwa_mem_bam}.deleted") @@ -149,8 +169,8 @@ rule _bwa_mem_output_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) - op.relative_symlink(input.bai, output.bam + ".bai") + op.relative_symlink(input.bam, output.bam, in_module=True) + op.relative_symlink(input.bai, output.bam + ".bai", in_module=True) os.remove(input.sorted_bam) shell("touch {input.sorted_bam}.deleted") diff --git a/modules/controlfreec/1.2/config/default.yaml b/modules/controlfreec/1.2/config/default.yaml index bf32925e..79fbc09d 100755 --- a/modules/controlfreec/1.2/config/default.yaml +++ b/modules/controlfreec/1.2/config/default.yaml @@ -28,18 +28,19 @@ lcr-modules: # 3: make a separate fragment of the unknown region and attach to left/right, choosing the longer one, BUT known region should make at least half size of the unknown region # 4: make a separate fragment of the unknown region and do not assign any copy number to this region at all coefficientOfVariation: 0.062 # coefficient used to evaluate window size - the lower, the more windows - contaminationAdjustment: TRUE # if "contamination" value is not provided, it will automaticaly evaluate - degree: '3\&4' # degree of polynomial - 3&4 for WGS (GC-based normalization); 1 for WES (control-read-count-based normalization) + contaminationAdjustment: TRUE # if "contamination" value is not provided, it will automaticaly evaluate. For bugs where contamination detection is stalled, just set contaminationAdjustment to FALSE. + degree: '3\&4' # degree of polynomial - 3&4 for WGS (GC-based normalization); 1 for WES (control-read-count-based normalization). You can comment out degree to let control-freec choose. forceGCcontentNormalization: 1 #0 for WGS; 1 for WES # 0 forces control-base normalization, 1 forces GC intercept: 1 # 0 for control-based (paired) ; 1 for GC-content (unpaired) minCNAlength: 8 # minimum number of consecutive windows to call a CNA #default 1 for WGS; 3 for WES - minMappabilityPerWindow: 0.9 # minimum fraction of mappable positions for a window to be considered + minMappabilityPerWindow: 0.3 # minimum fraction of mappable positions for a window to be considered # set this lower if you want to also use a hard-masked mappability file minimalSubclonePresence: 20 # detects subclones present in x% of cell population - 20 for WGS; 30 for WES (100 means "do not look for subclones") noisyData: TRUE #set TRUE for exomes/FFPE libs to avoid false positives due to non-uniform capture + readCountThreshold: 10 # threshold on the minimal number of reads per window (used for exome-seq or targeted sequencing) (recommended 50 for WES) ploidy: 2 #will select the ploidy that explains the most CNAs (a range can be added and control-freec will assign ploidy based on best fit, ex. 2,3,4) printNA: FALSE telocentromeric: 50000 # size of pre-telomeric and pre-centromeric regions to exclude - uniqueMatch: TRUE # uses mappability profile to correct read counts + uniqueMatch: FALSE # uses mappability profile to correct read counts #optional options: (uncomment these options in config_WGS.txt to implement them) #if implemented, contamination will overrule contaminationAdjustment @@ -54,21 +55,36 @@ lcr-modules: minQualityPerPosition: 20 # for BAF: minimum base quality shiftInQuality: 0 # basis for Phred quality + #GEM options: (for generating hard-masked mappability files) + hard_masked: True # set True if using a hard-masked mappability file + kmer: 100 # kmer size + mismatch: 2 # maximum number of mismatches allowed + maxBigIndel: 5 # The GEM mapper implements a special algorithm that, in addition to ordinary matches, is sometimes able to find a single long indel - this is the max size + maxEditDistance: 0 # maximum number of edit operations allowed while verifying candidate matches by dynamic programming (can be a float 0-1, which represents differences of size n% of length, or a non-negative integer, which is a fixed number of edits) + strata: 0 # a stratum is a set of matches all having the same string distance from the query, GEM mapper will try to find n amount of matches to explore + software: - FREEC_sig: "{MODSDIR}/etc/scripts/assess_significance.R" - FREEC_graph: "{MODSDIR}/etc/scripts/makeGraph.R" - FREEC_graph_chr: "{MODSDIR}/etc/scripts/makeGraph_Chromosome.R" - freec2bed: "{MODSDIR}/etc/scripts/freec2bed.pl" + FREEC_sig: "{MODSDIR}/src/assess_significance.R" + FREEC_graph: "{MODSDIR}/src/makeGraph.R" + FREEC_graph_chr: "{MODSDIR}/src/makeGraph_Chromosome.R" + freec2bed: "{MODSDIR}/src/freec2bed.pl" + freec2circos: "{MODSDIR}/src/freec2circos.pl" + cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.4/cnv2igv.py" threads: + gem: 24 controlfreec_run: 24 calc_sig: 1 plot: 1 freec2bed: 1 + freec2circos: 1 + cnv2igv: 1 resources: + gem: + mem_mb: 16000 mpileup: mem_mb: 8000 cat: @@ -83,6 +99,10 @@ lcr-modules: mem_mb: 1000 freec2bed: mem_mb: 1000 + freec2circos: + mem_mb: 1000 + cnv2igv: + mem_mb: 1000 pairing_config: diff --git a/modules/controlfreec/1.2/config/freec/config_WGS.txt b/modules/controlfreec/1.2/config/freec/config_WGS.txt index 44f46c95..1837430a 100644 --- a/modules/controlfreec/1.2/config/freec/config_WGS.txt +++ b/modules/controlfreec/1.2/config/freec/config_WGS.txt @@ -26,6 +26,7 @@ minimalSubclonePresence = minimumSubclonePresenceValue noisyData = booNoise printNA = naBoo ploidy = ploidyInput +readCountThreshold = rcCountThresold #step = stepValue telocentromeric = teloValue uniqueMatch = uniqBoo @@ -48,7 +49,6 @@ mateOrientation = FR [BAF] -fastaFile = fastaPath shiftInQuality = phredQuality SNPfile = DBsnpFile minimalCoveragePerPosition = minCovPerPos diff --git a/modules/controlfreec/1.2/controlfreec.smk b/modules/controlfreec/1.2/controlfreec.smk index af1da74b..bb7e84ce 100755 --- a/modules/controlfreec/1.2/controlfreec.smk +++ b/modules/controlfreec/1.2/controlfreec.smk @@ -15,6 +15,26 @@ # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["controlfreec"]` CFG = op.setup_module( @@ -35,45 +55,136 @@ localrules: ##### RULES ##### +#### Rules for mappability reference +# to generate and use hard-masked mappability (i.e. recommended for FFPE genomes) if CFG["options"]["hard_masked"] == True +# to use the default genome's mappability file (downloaded from their website), set it CFG["options"]["hard_masked"] == False +if CFG["options"]["hard_masked"] == True: + CFG["runs"]["masked"] = "_masked" +else: + CFG["runs"]["masked"] = "" + +wildcard_constraints: + masked = ".{0}|_masked", + genome_build = ".+(?\S+).+/$1/;print;' > {output.reference} " + +def get_genome_fasta(wildcards): + CFG = config["lcr-modules"]["controlfreec"] + if "grch" in str({wildcards.genome_build}): + return CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/genome_header.fa" + else: + return reference_files("genomes/{genome_build}{masked}/genome_fasta/genome.fa") + +if CFG["options"]["hard_masked"] == True: + rule _generate_gem_index: + input: + software = CFG["dirs"]["inputs"] + "references/GEM/.done", + reference = get_genome_fasta + output: + index = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all_index.gem" + params: + gemDir = CFG["dirs"]["inputs"] + "references/GEM/GEM-binaries-Linux-x86_64-core_i3-20130406-045632/bin", + idxpref = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all_index" + threads: CFG["threads"]["gem"] + resources: **CFG["resources"]["gem"] + log: CFG["logs"]["inputs"] + "gem/{genome_build}{masked}/gem_index.stderr.log" + shell: + "PATH=$PATH:{params.gemDir}; {params.gemDir}/gem-indexer -T {threads} -c dna -i {input.reference} -o {params.idxpref} > {log} 2>&1 " + +if CFG["options"]["hard_masked"] == True: + rule _generate_mappability: + input: + software = CFG["dirs"]["inputs"] + "references/GEM/.done", + index = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all_index.gem" + output: + mappability = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all.gem.mappability" + params: + gemDir = CFG["dirs"]["inputs"] + "references/GEM/GEM-binaries-Linux-x86_64-core_i3-20130406-045632/bin", + pref = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all.gem", + kmer = CFG["options"]["kmer"], + mismatch = CFG["options"]["mismatch"], + maxEditDistance = CFG["options"]["maxEditDistance"], + maxBigIndel = CFG["options"]["maxBigIndel"], + strata = CFG["options"]["strata"] + threads: CFG["threads"]["gem"] + resources: **CFG["resources"]["gem"] + log: CFG["logs"]["inputs"] + "gem/{genome_build}{masked}/gem_map.stderr.log" + shell: + "PATH=$PATH:{params.gemDir}; {params.gemDir}/gem-mappability -T {threads} -I {input.index} -l {params.kmer} -m {params.mismatch} -t disable --mismatch-alphabet ACGNT -e {params.maxEditDistance} --max-big-indel-length {params.maxBigIndel} -s {params.strata} -o {params.pref} > {log} 2>&1 " + +if CFG["options"]["hard_masked"] == True: + rule _symlink_map: + input: + mappability = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all.gem.mappability" + output: + mappability = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/out100m2_{genome_build}.gem" + resources: **CFG["resources"]["gem"] + shell: + "ln -srf {input.mappability} {output.mappability} " + +#### Rule for setting chromosome names (chr-prefix or not) # no chr for grch37 and grch38 # chr for hg19 and hg38 -# Symlink chromosomes used (i.e. chr1-22,X,Y) -checkpoint _controlfreec_input_chrs: - input: - chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes_withY.txt") - output: - chrs = CFG["dirs"]["inputs"] + "references/{genome_build}/main_chromosomes_withY.txt" - run: - op.relative_symlink(input.chrs, output.chrs) +# chromosomes used (i.e. chr1-22,X,Y) def _controlfreec_get_chr_fastas(wildcards): CFG = config["lcr-modules"]["controlfreec"] - chrs = checkpoints._controlfreec_input_chrs.get(**wildcards).output.chrs + chrs = reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes_withY.txt") with open(chrs) as file: chromosome = file.read().rstrip("\n").split("\n") fastas = expand( @@ -86,9 +197,11 @@ def _controlfreec_get_chr_fastas(wildcards): #generates file with chromomsome lengths from genome.fa.fai rule _controlfreec_generate_chrLen: input: - fai = reference_files("genomes/{genome_build}/genome_fasta/genome.fa.fai") + fai = reference_files("genomes/{genome_build}{masked}/genome_fasta/genome.fa.fai"), + main = reference_files("genomes/{genome_build}{masked}/genome_fasta/main_chromosomes_withY.txt") output: - chrLen = CFG["dirs"]["inputs"] + "references/{genome_build}/freec/{genome_build}.len" + chrLen = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.len" + resources: **CFG["resources"]["gem"] shell: op.as_one_line(""" grep -P '^chr[0-9,X,Y]+\t|^[0-9,X,Y]' {input.fai} | awk '{{print $1"\t"$2}}' > {output.chrLen} @@ -102,6 +215,7 @@ rule _controlfreec_generate_chrFasta: fasta = CFG["dirs"]["inputs"] + "references/{genome_build}/freec/chr/{chromosome}.fa" conda: CFG["conda_envs"]["controlfreec"] + resources: **CFG["resources"]["gem"] shell: "samtools faidx {input.fasta} {wildcards.chromosome} > {output.fasta} " @@ -118,6 +232,7 @@ rule _controlfreec_dbsnp_to_bed: vcf = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz") output: bed = CFG["dirs"]["inputs"] + "references/{genome_build}/freec/dbsnp.common_all-151.bed" + resources: **CFG["resources"]["gem"] shell: op.as_one_line(""" gunzip -c {input.vcf} | awk {{'printf ("%s\\t%s\\t%s\\n", $1,$2-1,$2)'}} | zgrep -v -h "^#" > {output.bed} """) @@ -129,16 +244,18 @@ rule _controlfreec_input_bam: bai = CFG["inputs"]["sample_bai"] output: bam = CFG["dirs"]["inputs"] + "{seq_type}--{genome_build}/{sample_id}.bam", - bai = CFG["dirs"]["inputs"] + "{seq_type}--{genome_build}/{sample_id}.bai" + bai = CFG["dirs"]["inputs"] + "{seq_type}--{genome_build}/{sample_id}.bai", + crai = CFG["dirs"]["inputs"] + "{seq_type}--{genome_build}/{sample_id}.crai" run: - op.relative_symlink(input.bam, output.bam) - op.relative_symlink(input.bai, output.bai) + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) + op.absolute_symlink(input.bai, output.crai) #### set-up mpileups for BAF calling #### def _controlfreec_get_chr_mpileups(wildcards): CFG = config["lcr-modules"]["controlfreec"] - chrs = checkpoints._controlfreec_input_chrs.get(**wildcards).output.chrs + chrs = reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes_withY.txt") with open(chrs) as file: chrs = file.read().rstrip("\n").split("\n") mpileups = expand( @@ -165,17 +282,16 @@ rule _controlfreec_mpileup_per_chrom: shell: "samtools mpileup -l {input.bed} -r {wildcards.chrom} -Q 20 -f {input.fastaFile} {input.bam} | gzip -c > {output.pileup} 2> {log.stderr}" - rule _controlfreec_concatenate_pileups: input: - _controlfreec_get_chr_mpileups + mpileup = _controlfreec_get_chr_mpileups output: mpileup = temp(CFG["dirs"]["mpileup"] + "{seq_type}--{genome_build}/{sample_id}.bam_minipileup.pileup.gz") resources: **CFG["resources"]["cat"] group: "controlfreec" shell: - "cat {input} > {output.mpileup} " + "cat {input.mpileup} > {output.mpileup} " #### Run control-FREEC #### @@ -185,19 +301,18 @@ rule _controlfreec_config: input: tumour_bam = CFG["dirs"]["mpileup"] + "{seq_type}--{genome_build}/{tumour_id}.bam_minipileup.pileup.gz", normal_bam = CFG["dirs"]["mpileup"] + "{seq_type}--{genome_build}/{normal_id}.bam_minipileup.pileup.gz", - fastaFile = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), - reference = CFG["dirs"]["inputs"] + "references/{genome_build}/freec/out100m2_{genome_build}.gem", - chrLen = CFG["dirs"]["inputs"] + "references/{genome_build}/freec/{genome_build}.len", + reference = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/out100m2_{genome_build}.gem", + chrLen = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.len", done = CFG["dirs"]["inputs"] + "references/{genome_build}/freec/chr/.all_done" output: - CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/config_WGS.txt" + CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/config_WGS.txt" conda: CFG["conda_envs"]["controlfreec"] params: config = CFG["options"]["configFile"], dbSNP = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), shiftInQuality = CFG["options"]["shiftInQuality"], - outdir = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/", + outdir = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/", window = CFG["options"]["window"], ploidy = CFG["options"]["ploidy"], breakPointValue = CFG["options"]["breakPointThreshold"], @@ -217,6 +332,7 @@ rule _controlfreec_config: minimumSubclonePresence = CFG["options"]["minimalSubclonePresence"], naBoo = CFG["options"]["printNA"], noisyData = CFG["options"]["noisyData"], + readCountThreshold = CFG["options"]["readCountThreshold"], step = CFG["options"]["step"], telocentromeric = CFG["options"]["telocentromeric"], threads = CFG["threads"]["controlfreec_run"], @@ -231,7 +347,6 @@ rule _controlfreec_config: "sed \"s|BAMFILE|{input.tumour_bam}|g\" {params.config} | " "sed \"s|CONTROLFILE|{input.normal_bam}|g\" | " "sed \"s|OUTDIR|{params.outdir}|g\" | " - "sed \"s|fastaPath|{input.fastaFile}|g\" | " "sed \"s|DBsnpFile|{params.dbSNP}|g\" | " "sed \"s|phredQuality|{params.shiftInQuality}|g\" | " "sed \"s|windowSize|{params.window}|g\" | " @@ -257,6 +372,7 @@ rule _controlfreec_config: "sed \"s|minQualPerPos|{params.minimalQualityPerPosition}|g\" | " "sed \"s|booNoise|{params.noisyData}|g\" | " "sed \"s|stepValue|{params.step}|g\" | " + "sed \"s|rcCountThresold|{params.readCountThreshold}|g\" | " "sed \"s|teloValue|{params.telocentromeric}|g\" | " "sed \"s|uniqBoo|{params.uniqBoo}|g\" | " "sed \"s|naBoo|{params.naBoo}|g\" | " @@ -266,81 +382,116 @@ rule _controlfreec_config: rule _controlfreec_run: input: - config = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/config_WGS.txt", + config = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/config_WGS.txt", tumour_bam = CFG["dirs"]["mpileup"] + "{seq_type}--{genome_build}/{tumour_id}.bam_minipileup.pileup.gz", normal_bam = CFG["dirs"]["mpileup"] + "{seq_type}--{genome_build}/{normal_id}.bam_minipileup.pileup.gz", output: - info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt", - ratio = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", - CNV = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs", - BAF = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt" + info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt", + ratio = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", + CNV = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs", + BAF = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt" conda: CFG["conda_envs"]["controlfreec"] threads: CFG["threads"]["controlfreec_run"] resources: **CFG["resources"]["controlfreec_run"] log: - stdout = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/run.stdout.log", - stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/run.stderr.log" + stdout = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/run.stdout.log", + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/run.stderr.log" shell: "freec -conf {input.config} > {log.stdout} 2> {log.stderr} " rule _controlfreec_calc_sig: input: - CNVs = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs", - ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", + CNVs = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs", + ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", output: - txt = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs.p.value.txt" + txt = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs.p.value.txt" params: calc_sig = CFG["software"]["FREEC_sig"] threads: CFG["threads"]["calc_sig"] resources: **CFG["resources"]["calc_sig"] conda: CFG["conda_envs"]["controlfreec"] log: - stdout = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/calc_sig.stdout.log", - stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/calc_sig.stderr.log" + stdout = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/calc_sig.stdout.log", + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/calc_sig.stderr.log" shell: "cat {params.calc_sig} | R --slave --args {input.CNVs} {input.ratios} > {log.stdout} 2> {log.stderr}" rule _controlfreec_plot: input: - ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", - BAF = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt", - info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt" + ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", + BAF = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt", + info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt" output: - plot = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt.png", - log2plot = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt.log2.png", - bafplot = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt.png" + plot = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt.png", + log2plot = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt.log2.png", + bafplot = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt.png" params: plot = CFG["software"]["FREEC_graph"] threads: CFG["threads"]["plot"] resources: **CFG["resources"]["plot"] conda: CFG["conda_envs"]["controlfreec"] log: - stdout = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/plot.stdout.log", - stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/plot.stderr.log" + stdout = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/plot.stdout.log", + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/plot.stderr.log" shell: "cat {params.plot} | R --slave --args `grep \"Output_Ploidy\" {input.info} | cut -f 2` {input.ratios} {input.BAF} > {log.stdout} 2> {log.stderr} " rule _controlfreec_freec2bed: input: - ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", - info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt" + ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", + info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt" output: - bed = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bed" + bed = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bed" params: freec2bed = CFG["software"]["freec2bed"] threads: CFG["threads"]["freec2bed"] resources: **CFG["resources"]["freec2bed"] conda: CFG["conda_envs"]["controlfreec"] log: - stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/freec2bed.stderr.log" + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/freec2bed.stderr.log" shell: "ploidy=$(grep Output_Ploidy {input.info} | cut -f 2); " "perl {params.freec2bed} -f {input.ratios} -p $ploidy > {output.bed} 2> {log.stderr}" +rule _controlfreec_freec2circos: + input: + ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", + info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt" + output: + circos = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.circos.bed" + params: + freec2circos = CFG["software"]["freec2circos"] + threads: CFG["threads"]["freec2circos"] + resources: **CFG["resources"]["freec2circos"] + conda: CFG["conda_envs"]["controlfreec"] + log: + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/freec2circos.stderr.log" + shell: + "ploidy=$(grep Output_Ploidy {input.info} | cut -f 2); " + "perl {params.freec2circos} -f {input.ratios} -p $ploidy > {output.circos} 2> {log.stderr}" + + +rule _controlfreec_cnv2igv: + input: + cnv = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs.p.value.txt" + output: + seg = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.CNVs.seg" + params: + tumour_id = "{tumour_id}", + cnv2igv = CFG["software"]["cnv2igv"] + threads: CFG["threads"]["cnv2igv"] + resources: **CFG["resources"]["cnv2igv"] + conda: CFG["conda_envs"]["controlfreec"] + log: + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/cnv2igv.stderr.log" + shell: + "python3 {params.cnv2igv} --mode controlfreec --sample {params.tumour_id} {input.cnv} > {output.seg} 2> {log.stderr} " + + # Symlinks the final output files into the module results directory (under '99-outputs/') rule _controlfreec_output: input: @@ -350,23 +501,29 @@ rule _controlfreec_output: bed = str(rules._controlfreec_freec2bed.output.bed), BAF = str(rules._controlfreec_run.output.BAF), BAFgraph = str(rules._controlfreec_plot.output.bafplot), - ratio = str(rules._controlfreec_run.output.ratio) + ratio = str(rules._controlfreec_run.output.ratio), + circos = str(rules._controlfreec_freec2circos.output.circos), + igv = str(rules._controlfreec_cnv2igv.output.seg) output: - plot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/plots/{tumour_id}--{normal_id}--{pair_status}.ratio.png", - log2plot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/log2plots/{tumour_id}--{normal_id}--{pair_status}.ratio.log2.png", - CNV = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/CNV/{tumour_id}--{normal_id}--{pair_status}.CNVs.p.value.txt", - bed = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/bed/{tumour_id}--{normal_id}--{pair_status}.CNVs.bed", - BAF = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/BAF/{tumour_id}--{normal_id}--{pair_status}.BAF.txt", - BAFgraph = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/BAFplot/{tumour_id}--{normal_id}--{pair_status}.BAF.png", - ratio = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/ratio/{tumour_id}--{normal_id}--{pair_status}.ratio.txt" + plot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/plots/{tumour_id}--{normal_id}--{pair_status}.ratio.png", + log2plot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/log2plots/{tumour_id}--{normal_id}--{pair_status}.ratio.log2.png", + CNV = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/CNV/{tumour_id}--{normal_id}--{pair_status}.CNVs.p.value.txt", + bed = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/bed/{tumour_id}--{normal_id}--{pair_status}.CNVs.bed", + BAF = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/BAF/{tumour_id}--{normal_id}--{pair_status}.BAF.txt", + BAFgraph = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/BAFplot/{tumour_id}--{normal_id}--{pair_status}.BAF.png", + ratio = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/ratio/{tumour_id}--{normal_id}--{pair_status}.ratio.txt", + circos = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/circos/{tumour_id}--{normal_id}--{pair_status}.circos.bed", + igv = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/igv/{tumour_id}--{normal_id}--{pair_status}.igv.seg" run: - op.relative_symlink(input.plot, output.plot) - op.relative_symlink(input.log2plot, output.log2plot) - op.relative_symlink(input.CNV, output.CNV) - op.relative_symlink(input.bed, output.bed) - op.relative_symlink(input.BAF, output.BAF) - op.relative_symlink(input.BAFgraph, output.BAFgraph) - op.relative_symlink(input.ratio, output.ratio) + op.relative_symlink(input.plot, output.plot, in_module = True) + op.relative_symlink(input.log2plot, output.log2plot, in_module = True) + op.relative_symlink(input.CNV, output.CNV, in_module = True) + op.relative_symlink(input.bed, output.bed, in_module = True) + op.relative_symlink(input.BAF, output.BAF, in_module = True) + op.relative_symlink(input.BAFgraph, output.BAFgraph, in_module = True) + op.relative_symlink(input.ratio, output.ratio, in_module = True) + op.relative_symlink(input.circos, output.circos, in_module = True) + op.relative_symlink(input.igv, output.igv, in_module = True) # Generates the target sentinels for each run, which generate the symlinks @@ -380,14 +537,17 @@ rule _controlfreec_all: str(rules._controlfreec_output.output.bed), str(rules._controlfreec_output.output.BAF), str(rules._controlfreec_output.output.BAFgraph), - str(rules._controlfreec_output.output.ratio) + str(rules._controlfreec_output.output.ratio), + str(rules._controlfreec_output.output.circos), + str(rules._controlfreec_output.output.igv) ], zip, # Run expand() with zip(), not product() seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"], pair_status=CFG["runs"]["pair_status"], tumour_id=CFG["runs"]["tumour_sample_id"], - normal_id=CFG["runs"]["normal_sample_id"]) + normal_id=CFG["runs"]["normal_sample_id"], + masked=CFG["runs"]["masked"]) diff --git a/modules/controlfreec/1.2/etc/scripts/_makeGraph_Chromosome.R b/modules/controlfreec/1.2/src/_makeGraph_Chromosome.R similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/_makeGraph_Chromosome.R rename to modules/controlfreec/1.2/src/_makeGraph_Chromosome.R diff --git a/modules/controlfreec/1.2/etc/scripts/assess_significance.R b/modules/controlfreec/1.2/src/assess_significance.R similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/assess_significance.R rename to modules/controlfreec/1.2/src/assess_significance.R diff --git a/modules/controlfreec/1.2/etc/scripts/freec2bed.pl b/modules/controlfreec/1.2/src/freec2bed.pl similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/freec2bed.pl rename to modules/controlfreec/1.2/src/freec2bed.pl diff --git a/modules/controlfreec/1.2/etc/scripts/freec2circos.pl b/modules/controlfreec/1.2/src/freec2circos.pl similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/freec2circos.pl rename to modules/controlfreec/1.2/src/freec2circos.pl diff --git a/modules/controlfreec/1.2/etc/scripts/get_fasta_lengths.pl b/modules/controlfreec/1.2/src/get_fasta_lengths.pl similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/get_fasta_lengths.pl rename to modules/controlfreec/1.2/src/get_fasta_lengths.pl diff --git a/modules/controlfreec/1.2/etc/scripts/makeGraph.R b/modules/controlfreec/1.2/src/makeGraph.R similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/makeGraph.R rename to modules/controlfreec/1.2/src/makeGraph.R diff --git a/modules/controlfreec/1.2/etc/scripts/makeGraph_Chromosome.R b/modules/controlfreec/1.2/src/makeGraph_Chromosome.R similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/makeGraph_Chromosome.R rename to modules/controlfreec/1.2/src/makeGraph_Chromosome.R diff --git a/modules/controlfreec/1.2/etc/scripts/vcf2snpFreec.pl b/modules/controlfreec/1.2/src/vcf2snpFreec.pl similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/vcf2snpFreec.pl rename to modules/controlfreec/1.2/src/vcf2snpFreec.pl diff --git a/modules/controlfreec/CHANGELOG.md b/modules/controlfreec/CHANGELOG.md index e1b646c2..d20964b8 100755 --- a/modules/controlfreec/CHANGELOG.md +++ b/modules/controlfreec/CHANGELOG.md @@ -58,4 +58,9 @@ Notably, in paired mode, with BAF mode on, FREEC normalizes with GC-content, and This implementation has been tested on unmatched samples too using a high coverage, normal FFPE sample, and it has shown to display clean profiles in these cases too. -Note: this version is not meant for capture/exome data. \ No newline at end of file +Note: this version is not meant for capture/exome data. + +## [1.2] patch 2021-02-25 +Added GEM mappability features - can now use/generate a hard-masked mappability file (useful for FFPE genomes) with the setting "hard_masked" = True. If this is set, GEM will be installed and ran on your reference genome of choice. + +Also added freec2circos function. \ No newline at end of file diff --git a/modules/gatk_rnaseq/1.0/config/default.yaml b/modules/gatk_rnaseq/1.0/config/default.yaml index 72b4d793..bc7561a5 100644 --- a/modules/gatk_rnaseq/1.0/config/default.yaml +++ b/modules/gatk_rnaseq/1.0/config/default.yaml @@ -67,11 +67,10 @@ lcr-modules: mem_mb: 12000 bam: 1 gatk_variant_calling: - mem_mb: 48000 + mem_mb: 12000 bam: 1 gatk_variant_filtration: mem_mb: 12000 - bam: 1 merge_vcfs: mem_mb: 10000 gatk_rnaseq_passed: diff --git a/modules/gatk_rnaseq/1.0/gatk_rnaseq.smk b/modules/gatk_rnaseq/1.0/gatk_rnaseq.smk index 493a5504..f6f88023 100644 --- a/modules/gatk_rnaseq/1.0/gatk_rnaseq.smk +++ b/modules/gatk_rnaseq/1.0/gatk_rnaseq.smk @@ -60,10 +60,12 @@ rule _gatk_rnaseq_input_bam: bai = CFG["inputs"]["sample_bai"] output: bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", - bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bai" + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai", + crai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.crai", run: op.absolute_symlink(input.bam, output.bam) op.absolute_symlink(input.bai, output.bai) + op.absolute_symlink(input.bai, output.crai) rule _gatk_splitntrim: @@ -71,7 +73,8 @@ rule _gatk_splitntrim: bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") output: - bam = temp(CFG["dirs"]["gatk_splitntrim"] + "bam/{seq_type}--{genome_build}/{sample_id}.split_reassign_mq.bam") + bam = temp(CFG["dirs"]["gatk_splitntrim"] + "bam/{seq_type}--{genome_build}/{sample_id}.split_reassign_mq.bam"), + bai = temp(CFG["dirs"]["gatk_splitntrim"] + "bam/{seq_type}--{genome_build}/{sample_id}.split_reassign_mq.bai") log: stdout = CFG["logs"]["gatk_splitntrim"] + "{seq_type}--{genome_build}/{sample_id}.gatk_splitntrim.stdout.log", stderr = CFG["logs"]["gatk_splitntrim"] + "{seq_type}--{genome_build}/{sample_id}.gatk_splitntrim.stderr.log" @@ -81,6 +84,8 @@ rule _gatk_splitntrim: gatk_opts = CFG["options"]["gatk_splitntrim"] conda: CFG["conda_envs"]["gatk_rnaseq"] + group: "split_bam" + priority: 50 threads: CFG["threads"]["gatk_splitntrim"] resources: @@ -94,9 +99,11 @@ rule _gatk_splitntrim: rule _gatk_addRG: input: - bam = str(rules._gatk_splitntrim.output) + bam = str(rules._gatk_splitntrim.output.bam), + bai = str(rules._gatk_splitntrim.output.bai) output: - bam = temp(CFG["dirs"]["gatk_splitntrim"] + "bam_withRG/{seq_type}--{genome_build}/{sample_id}.withRG.bam") + bam = temp(CFG["dirs"]["gatk_splitntrim"] + "bam_withRG/{seq_type}--{genome_build}/{sample_id}.withRG.bam"), + bai = temp(CFG["dirs"]["gatk_splitntrim"] + "bam_withRG/{seq_type}--{genome_build}/{sample_id}.withRG.bam.bai") params: sampleName = "{sample_id}", platform = CFG["options"]["gatk_addRG"]["platform"], @@ -104,6 +111,8 @@ rule _gatk_addRG: stringency = CFG["options"]["gatk_addRG"]["stringency"] conda: CFG["conda_envs"]["picard"] + group: "split_bam" + priority: 40 log: stdout = CFG["logs"]["gatk_splitntrim"] + "bam_withRG/{seq_type}--{genome_build}/{sample_id}.addRG.stdout.log" threads: @@ -118,7 +127,7 @@ rule _gatk_addRG: rule _gatk_base_recalibration: input: - bam = str(rules._gatk_addRG.output), + bam = str(rules._gatk_addRG.output.bam), fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") output: table = CFG["dirs"]["base_recal_report"] + "{seq_type}--{genome_build}/{sample_id}.recalibration_report.grp" @@ -133,6 +142,8 @@ rule _gatk_base_recalibration: gatk_opts = CFG["options"]["gatk_baserecalibrator"] conda: CFG["conda_envs"]["gatk_rnaseq"] + group: "split_bam" + priority: 30 threads: CFG["threads"]["gatk_base_recalibration"] resources: **CFG["resources"]["gatk_base_recalibration"] @@ -145,6 +156,7 @@ rule _gatk_base_recalibration: rule _gatk_applybqsr: input: bam = str(rules._gatk_addRG.output.bam), + bai = str(rules._gatk_addRG.output.bai), table = str(rules._gatk_base_recalibration.output.table), fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") output: @@ -159,6 +171,8 @@ rule _gatk_applybqsr: gatk_opts = CFG["options"]["gatk_applybqsr"] conda: CFG["conda_envs"]["gatk_rnaseq"] + group: "split_bam" + priority: 20 threads: CFG["threads"]["gatk_applybqsr"] resources: **CFG["resources"]["gatk_applybqsr"] @@ -188,6 +202,7 @@ rule _gatk_variant_calling: gatk_opts = CFG["options"]["gatk_variant_calling"]["gatk_opts"] conda: CFG["conda_envs"]["gatk_rnaseq"] + group: "split_bam" threads: CFG["threads"]["gatk_variant_calling"] resources: **CFG["resources"]["gatk_variant_calling"] diff --git a/modules/gridss/1.1/gridss.smk b/modules/gridss/1.1/gridss.smk index 51483f74..b4935ce1 100644 --- a/modules/gridss/1.1/gridss.smk +++ b/modules/gridss/1.1/gridss.smk @@ -15,6 +15,26 @@ # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["gridss"]` @@ -24,13 +44,13 @@ CFG = op.setup_module( subdirectories = ["inputs", "preprocess", "gridss", "viral_annotation", "gripss", "outputs"], ) -VERSION_MAP = { +GRIDSS_VERSION_MAP = { "grch37": "hg19", "hs37d5": "hg19", "hg38": "hg38" } -possible_genome_builds = VERSION_MAP.keys() +possible_genome_builds = GRIDSS_VERSION_MAP.keys() for genome_build in CFG["runs"]["tumour_genome_build"]: assert genome_build in possible_genome_builds, ( "Samples table includes genome builds not yet compatible with this module. " @@ -85,7 +105,7 @@ rule _gridss_get_pon: pon_breakend = CFG["dirs"]["inputs"] + "references/{genome_build}/pon/gridss_pon_single_breakend.bed", known_pairs = CFG["dirs"]["inputs"] + "references/{genome_build}/pon/KnownFusionPairs.bedpe" params: - alt_build = lambda w: VERSION_MAP[w.genome_build], + alt_build = lambda w: GRIDSS_VERSION_MAP[w.genome_build], url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/gridss/pon" shell: op.as_one_line(""" @@ -162,8 +182,8 @@ rule _gridss_input_bam: sample_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", sample_bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai" run: - op.relative_symlink(input.sample_bam, output.sample_bam) - op.relative_symlink(input.sample_bai, output.sample_bai) + op.absolute_symlink(input.sample_bam, output.sample_bam) + op.absolute_symlink(input.sample_bai, output.sample_bai) # Preprocess unmatched normal bams rule _gridss_preprocess_unmatched_normal: @@ -206,12 +226,12 @@ rule _gridss_symlink_preprocessed_normal: input: workdir = str(rules._gridss_preprocess_unmatched_normal.output.workdir) output: - workdir = temp(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{sample_id}.bam.gridss.working") + workdir = temp(directory(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{sample_id}.bam.gridss.working")) priority: 0 wildcard_constraints: sample_id = "|".join(unmatched_normal_ids) run: - op.relative_symlink(input.workdir, output.workdir) + op.absolute_symlink(input.workdir, output.workdir) # Preprocess all other bams as part of the group job rule _gridss_preprocess: @@ -488,9 +508,9 @@ rule _gridss_output_viral_vcf: tbi = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_viral_annotation_filtered.vcf.gz.tbi", bedpe = CFG["dirs"]["outputs"] + "bedpe/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_viral_annotation_filtered.bedpe" run: - op.relative_symlink(input.vcf, output.vcf) - op.relative_symlink(input.tbi, output.tbi) - op.relative_symlink(input.bedpe, output.bedpe) + op.relative_symlink(input.vcf, output.vcf, in_module=True) + op.relative_symlink(input.tbi, output.tbi, in_module=True) + op.relative_symlink(input.bedpe, output.bedpe, in_module=True) rule _gridss_output_somatic_vcf: input: @@ -506,11 +526,11 @@ rule _gridss_output_somatic_vcf: filtered_tbi = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic_filtered.vcf.gz.tbi", bedpe = CFG["dirs"]["outputs"] + "bedpe/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic_filtered.bedpe" run: - op.relative_symlink(input.somatic, output.somatic) - op.relative_symlink(input.somatic_tbi, output.somatic_tbi) - op.relative_symlink(input.filtered, output.filtered) - op.relative_symlink(input.filtered_tbi, output.filtered_tbi) - op.relative_symlink(input.bedpe, output.bedpe) + op.relative_symlink(input.somatic, output.somatic, in_module=True) + op.relative_symlink(input.somatic_tbi, output.somatic_tbi, in_module=True) + op.relative_symlink(input.filtered, output.filtered, in_module=True) + op.relative_symlink(input.filtered_tbi, output.filtered_tbi, in_module=True) + op.relative_symlink(input.bedpe, output.bedpe, in_module=True) def _gridss_predict_output(wildcards): """Request symlinks for all VCF files. diff --git a/modules/gridss/2.0/config/default.yaml b/modules/gridss/2.0/config/default.yaml new file mode 100644 index 00000000..d1cf1624 --- /dev/null +++ b/modules/gridss/2.0/config/default.yaml @@ -0,0 +1,62 @@ +lcr-modules: + + gridss: + + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + sample_bam: "__UPDATE__" + sample_bai: "__UPDATE__" + + scratch_subdirectories: [] # Recommended: ["gridss", "preprocess"] + + options: + gridss: + --picardoptions VALIDATION_STRINGENCY=SILENT + filter_unpaired: + gripss: + # Hard filters remove variants from output VCF + # Soft filters add flags to output VCF + # These flags don't work with the current version of GRIPSS + # A fix is being prepared by the developers + -hard_max_normal_absolute_support 3 + -hard_max_normal_relative_support 0.06 + -soft_max_normal_relative_support 0.03 + + conda_envs: + wget: "{MODSDIR}/envs/wget-1.20.1.yaml" + gridss: "{MODSDIR}/envs/gridss-2.12.0.yaml" + gripss: "{MODSDIR}/envs/hmftools-gripss-1.11.yaml" + bcftools: "{MODSDIR}/envs/bcftools-1.10.2.yaml" + svtools: "{MODSDIR}/envs/svtools-0.5.1.yaml" + + threads: + preprocess: 8 + gridss: 24 + repeatmasker: 24 + filter_gridss: 1 + gripss: 1 # Not multi-threaded + split: 1 + + resources: + preprocess: + mem_mb: 37500 + preprocess: 1 + gridss: + mem_mb: 37500 # Recommended per GRIDSS manual + gridss: 1 + repeatmasker: + mem_mb: 100000 + gripss: + mem_mb: 20000 # May need to be increased for FFPE tumours + split: + mem_mb: 2000 + + pairing_config: + genome: + run_paired_tumours: True + run_unpaired_tumours_with: "unmatched_normal" + run_paired_tumours_as_unpaired: False + capture: + run_paired_tumours: True + run_unpaired_tumours_with: "unmatched_normal" + run_paired_tumours_as_unpaired: False diff --git a/modules/gridss/2.0/envs/bcftools-1.10.2.yaml b/modules/gridss/2.0/envs/bcftools-1.10.2.yaml new file mode 120000 index 00000000..72959e7b --- /dev/null +++ b/modules/gridss/2.0/envs/bcftools-1.10.2.yaml @@ -0,0 +1 @@ +../../../../envs/bcftools/bcftools-1.10.2.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/gridss-2.12.0.yaml b/modules/gridss/2.0/envs/gridss-2.12.0.yaml new file mode 120000 index 00000000..d827cc39 --- /dev/null +++ b/modules/gridss/2.0/envs/gridss-2.12.0.yaml @@ -0,0 +1 @@ +../../../../envs/gridss/gridss-2.12.0.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/gridss-dependencies-2.9.4.yaml b/modules/gridss/2.0/envs/gridss-dependencies-2.9.4.yaml new file mode 120000 index 00000000..b7fb269d --- /dev/null +++ b/modules/gridss/2.0/envs/gridss-dependencies-2.9.4.yaml @@ -0,0 +1 @@ +../../../../envs/gridss/gridss-dependencies-2.9.4.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/hmftools-gripss-1.11.yaml b/modules/gridss/2.0/envs/hmftools-gripss-1.11.yaml new file mode 120000 index 00000000..6a1656b5 --- /dev/null +++ b/modules/gridss/2.0/envs/hmftools-gripss-1.11.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-gripss-1.11.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/hmftools-gripss-1.4.0.yaml b/modules/gridss/2.0/envs/hmftools-gripss-1.4.0.yaml new file mode 120000 index 00000000..ca91e8c3 --- /dev/null +++ b/modules/gridss/2.0/envs/hmftools-gripss-1.4.0.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-gripss-1.4.0.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/hmftools-gripss-1.8.yaml b/modules/gridss/2.0/envs/hmftools-gripss-1.8.yaml new file mode 120000 index 00000000..b0c2af4a --- /dev/null +++ b/modules/gridss/2.0/envs/hmftools-gripss-1.8.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-gripss-1.8.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/svtools-0.5.1.yaml b/modules/gridss/2.0/envs/svtools-0.5.1.yaml new file mode 120000 index 00000000..6dc2ec0c --- /dev/null +++ b/modules/gridss/2.0/envs/svtools-0.5.1.yaml @@ -0,0 +1 @@ +../../../../envs/svtools/svtools-0.5.1.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/wget-1.20.1.yaml b/modules/gridss/2.0/envs/wget-1.20.1.yaml new file mode 120000 index 00000000..86501e72 --- /dev/null +++ b/modules/gridss/2.0/envs/wget-1.20.1.yaml @@ -0,0 +1 @@ +../../../../envs/wget/wget-1.20.1.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/gridss.smk b/modules/gridss/2.0/gridss.smk new file mode 100644 index 00000000..99cbdd27 --- /dev/null +++ b/modules/gridss/2.0/gridss.smk @@ -0,0 +1,508 @@ +#!/usr/bin/env snakemake + + +##### ATTRIBUTION ##### + + +# Original Author: Laura Hilton +# Module Author: Laura Hilton +# Contributors: N/A + + +##### SETUP ##### + + +# Import package with useful functions for developing analysis modules +import oncopipe as op + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + + +# Setup module and store module-specific configuration in `CFG` +# `CFG` is a shortcut to `config["lcr-modules"]["gridss"]` +CFG = op.setup_module( + name = "gridss", + version = "2.0", + subdirectories = ["inputs", "preprocess", "gridss", "repeatmasker", "gripss", "outputs"], +) + +VERSION_MAP_GRIDSS = { + "grch37": "hg19", + "hs37d5": "hg19", + "hg38": "hg38" +} + +possible_genome_builds = VERSION_MAP_GRIDSS.keys() +for genome_build in CFG["runs"]["tumour_genome_build"]: + assert genome_build in possible_genome_builds, ( + "Samples table includes genome builds not yet compatible with this module. " + "This module is currently only compatible with {possible_genome_builds}. " + ) + +sample_ids = list(CFG['samples']['sample_id']) +unmatched_normal_ids = list(config["lcr-modules"]["_shared"]["unmatched_normal_ids"].values()) +all_other_ids = list(set(sample_ids) - set(unmatched_normal_ids)) + +# Define rules to be run locally when using a compute cluster +localrules: + _gridss_input_bam, + _gridss_input_references, + _gridss_setup_references, + _gridss_get_pon, + _gridss_symlink_preprocessed_normal, + _gridss_filter_gripss, + _gridss_gripss_to_bedpe, + _gridss_output_somatic_vcf, + _gridss_all + + + +##### RULES ##### + +# Symlink genome fasta with bwa and .fai indices to the same directory +rule _gridss_input_references: + input: + genome_fa = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), + genome_bwa_prefix = reference_files("genomes/{genome_build}/bwa_index/bwa-0.7.17/genome.fa"), + output: + genome_fa = CFG["dirs"]["inputs"] + "references/{genome_build}/genome_fa/genome.fa", + shell: + op.as_one_line(""" + ln -sf {input.genome_fa} {output.genome_fa} && + ln -sf {input.genome_fa}.fai {output.genome_fa}.fai && + ln -sf {input.genome_bwa_prefix}.* `dirname {output.genome_fa}` + """) + +# Download the panel of normals +rule _gridss_get_pon: + output: + pon_breakpoint = CFG["dirs"]["inputs"] + "references/{genome_build}/pon/gridss_pon_breakpoint.bedpe", + pon_breakend = CFG["dirs"]["inputs"] + "references/{genome_build}/pon/gridss_pon_single_breakend.bed", + known_pairs = CFG["dirs"]["inputs"] + "references/{genome_build}/pon/KnownFusionPairs.bedpe" + params: + alt_build = lambda w: VERSION_MAP_GRIDSS[w.genome_build], + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/gridss/pon" + shell: + op.as_one_line(""" + wget -O {output.pon_breakpoint} {params.url}/gridss_pon_breakpoint.{params.alt_build}.bedpe; + wget -O {output.pon_breakend} {params.url}/gridss_pon_single_breakend.{params.alt_build}.bed; + wget -O {output.known_pairs} {params.url}/KnownFusionPairs.{params.alt_build}.bedpe + """) + + +# Generage genome.fa.img file +rule _gridss_setup_references: + input: + fasta = str(rules._gridss_input_references.output.genome_fa), + output: + genome_img = CFG["dirs"]["inputs"] + "references/{genome_build}/genome_fa/genome.fa.img" + params: + steps = "setupreference" + conda: + CFG["conda_envs"]["gridss"] + resources: + mem_mb = 4000 + threads: 8 + shell: + op.as_one_line(""" + gridss + --reference {input.fasta} + --threads {threads} + --jvmheap 3G + --steps {params.steps} + --workingdir `dirname {output.genome_img}` + """) + + +# Symlink the input files into the module results directory (under '00-inputs/') +rule _gridss_input_bam: + input: + sample_bam = CFG["inputs"]["sample_bam"], + sample_bai = CFG["inputs"]["sample_bai"] + output: + sample_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + sample_bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai" + run: + op.absolute_symlink(input.sample_bam, output.sample_bam) + op.absolute_symlink(input.sample_bai, output.sample_bai) + +# Preprocess unmatched normal bams +rule _gridss_preprocess_unmatched_normal: + input: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + fasta = str(rules._gridss_input_references.output.genome_fa), + fasta_img = str(rules._gridss_setup_references.output.genome_img) + output: + workdir = directory(CFG["dirs"]["preprocess"] + "{seq_type}--{genome_build}/{sample_id}.bam.gridss.working") + log: CFG["logs"]["preprocess"] + "{seq_type}--{genome_build}/{sample_id}/preprocess.log" + params: + opts = CFG["options"]["gridss"], + steps = "preprocess", + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + conda: + CFG["conda_envs"]["gridss"] + threads: + CFG["threads"]["gridss"] + resources: + **CFG["resources"]["gridss"] + priority: 1 + wildcard_constraints: + sample_id="|".join(unmatched_normal_ids) + shell: + op.as_one_line(""" + gridss + --reference {input.fasta} + --workingdir $(dirname {output.workdir}) + --threads {threads} + --jvmheap {params.mem_mb}m + --steps {params.steps} + {params.opts} + {input.bam} + 2>&1 | tee -a {log} + """) + +# Symlink preprocessed sv.bam directories + +rule _gridss_symlink_preprocessed_normal: + input: + workdir = str(rules._gridss_preprocess_unmatched_normal.output.workdir) + output: + workdir = temp(directory(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{sample_id}.bam.gridss.working")) + priority: 0 + wildcard_constraints: + sample_id = "|".join(unmatched_normal_ids) + run: + op.absolute_symlink(input.workdir, output.workdir) + +# Preprocess all other bams as part of the group job +rule _gridss_preprocess: + input: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + fasta = str(rules._gridss_input_references.output.genome_fa), + fasta_img = str(rules._gridss_setup_references.output.genome_img) + output: + workdir = temp(directory(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{sample_id}.bam.gridss.working")) + log: CFG["logs"]["preprocess"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{sample_id}/preprocess.log" + params: + opts = CFG["options"]["gridss"], + steps = "preprocess", + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + conda: + CFG["conda_envs"]["gridss"] + threads: + CFG["threads"]["preprocess"] + resources: + **CFG["resources"]["preprocess"] + group: "enormous_bam" + wildcard_constraints: + sample_id = "|".join(all_other_ids) + shell: + op.as_one_line(""" + gridss + --reference {input.fasta} + --workingdir $(dirname {output.workdir}) + --threads {threads} + --jvmheap {params.mem_mb}m + --steps {params.steps} + {params.opts} + {input.bam} + 2>&1 | tee -a {log} + """) + +def get_input_per_patient(wildcards): + CFG = config['lcr-modules']['gridss'] + PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id) + if wildcards.pair_status in ["matched", "unmatched"]: + SAMPLES = PATIENT['normal_sample_id'].unique().tolist() + PATIENT['tumour_sample_id'].tolist() + bams = expand( + [ + str(rules._gridss_input_bam.output.sample_bam) + ], + zip, + sample_id = SAMPLES, + allow_missing = True + ) + preproc = expand( + [ + str(rules._gridss_preprocess.output.workdir) + ], + zip, + sample_id = SAMPLES, + allow_missing = True + ) + elif wildcards.pair_status == "no_normal": + bams = expand( + [ + str(rules._gridss_input_bam.output.sample_bam) + ], + zip, + sample_id = PATIENT["tumour_sample_id"], + allow_missing = True + ) + preproc = expand( + [ + str(rules._gridss_preprocess.output.workdir) + ], + zip, + sample_id = PATIENT["tumour_sample_id"], + allow_missing = True + ) + return {'bams': bams, 'preproc': preproc} + +def get_input_sample_ids(wildcards): + CFG = config['lcr-modules']['gridss'] + PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id) + if wildcards.pair_status in ["matched", "unmatched"]: + ids = ",".join([",".join(PATIENT['normal_sample_id'].unique().tolist()), ",".join(PATIENT['tumour_sample_id'].tolist())]) + elif wildcards.pair_status == "no_normal": + ids = ",".join(PATIENT['tumour_sample_id']) + return ids + +# Run GRIDSS in paired mode +rule _gridss_run: + input: + unpack(get_input_per_patient), + fasta = str(rules._gridss_input_references.output.genome_fa), + fasta_img = str(rules._gridss_setup_references.output.genome_img), + blacklist = reference_files("genomes/{genome_build}/encode/encode-blacklist.{genome_build}.bed") + output: + vcf = temp(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss_raw.vcf.gz"), + assembly = temp(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/assembly.bam"), + assembly_dir = temp(directory(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/assembly.bam.gridss.working")), + vcf_dir = temp(directory(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss_raw.vcf.gz.gridss.working")) + log: CFG["logs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss.log" + params: + ids = lambda wildcards: get_input_sample_ids(wildcards), + opts = CFG["options"]["gridss"], + steps = "assemble,call", + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + conda: + CFG["conda_envs"]["gridss"] + threads: + CFG["threads"]["gridss"] + resources: + **CFG["resources"]["gridss"] + group: "enormous_bam" + shell: + op.as_one_line(""" + gridss + --reference {input.fasta} + --output {output.vcf} + --workingdir `dirname {output.vcf}` + --assembly {output.assembly} + --blacklist {input.blacklist} + --threads {threads} + --jvmheap {params.mem_mb}m + --labels "{params.ids}" + --steps {params.steps} + {params.opts} + {input.bams} + 2>&1 | tee -a {log} + """) + +# Annotate GRIDSS VCF with Repeatmasker +rule _gridss_annotate_repeatmasker: + input: + vcf = str(rules._gridss_run.output.vcf) + output: + vcf = temp(CFG["dirs"]["repeatmasker"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss_repeatmasker.vcf.gz"), + tbi = temp(CFG["dirs"]["repeatmasker"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss_repeatmasker.vcf.gz.tbi") + log: CFG["logs"]["repeatmasker"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss_repeatmasker.log" + params: + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + conda: + CFG["conda_envs"]["gridss"] + threads: + CFG["threads"]["repeatmasker"] + resources: + **CFG["resources"]["repeatmasker"] + shell: + op.as_one_line(""" + gridss_annotate_vcf_repeatmasker + -o {output.vcf} + -t {threads} + -w $(dirname {output.vcf}) + {input.vcf} + > {log} 2>&1 + """) + +def get_split_ids(wildcards): + CFG = config['lcr-modules']['gridss'] + if wildcards.normal_id == "None": + return wildcards.tumour_id + else: + return wildcards.normal_id + "," + wildcards.tumour_id + +rule _gridss_split_vcf: + input: + vcf = str(rules._gridss_run.output.vcf) + output: + vcf = temp(CFG['dirs']['repeatmasker'] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.gridss_split.vcf.gz"), + tbi = temp(CFG['dirs']['repeatmasker'] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.gridss_split.vcf.gz.tbi") + log: CFG["logs"]["repeatmasker"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.gridss_split_vcf.log" + params: + ids = lambda wildcards: get_split_ids(wildcards), + conda: + CFG["conda_envs"]["bcftools"] + threads: CFG['threads']['split'] + resources: + **CFG['resources']['split'] + shell: + op.as_one_line(""" + bcftools view -s {params.ids} -Oz -o {output.vcf} {input.vcf} 2> {log} && + tabix -p vcf {output.vcf} + """) + +def get_split_vcf(wildcards): + CFG = config['lcr-modules']['gridss'] + TUMOUR = op.filter_samples(CFG['runs'], tumour_sample_id = wildcards.tumour_id) + vcf = expand( + str(rules._gridss_split_vcf.output.vcf), + patient_id = TUMOUR['tumour_patient_id'], + allow_missing = True + ) + return {'vcf': vcf} + +def get_gripss_sample_id_cli(wildcards): + CFG = config['lcr-modules']['gridss'] + TUMOUR = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id) + if wildcards.pair_status in ["matched", "unmatched"]: + return "-tumor " + str("".join(TUMOUR['tumour_sample_id'])) + " -reference " + str("".join(TUMOUR['normal_sample_id'])) + elif wildcards.pair_status == "no_normal": + return "-tumor " + str("".join(TUMOUR['tumour_sample_id'])) + +# Perform somatic filtering against the panel of normals +rule _gridss_run_gripss: + input: + unpack(get_split_vcf), + fasta = str(rules._gridss_input_references.output.genome_fa), + pon_breakend = str(rules._gridss_get_pon.output.pon_breakend), + pon_breakpoint = str(rules._gridss_get_pon.output.pon_breakpoint), + known_pairs = str(rules._gridss_get_pon.output.known_pairs) + output: + vcf = CFG["dirs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic.vcf.gz", + tbi = CFG["dirs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic.vcf.gz.tbi" + log: log = CFG["logs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gripss.log" + resources: + **CFG["resources"]["gripss"] + params: + cli = lambda wildcards: get_gripss_sample_id_cli(wildcards), + opts = CFG["options"]["gripss"], + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + conda: + CFG["conda_envs"]["gripss"] + threads: + CFG["threads"]["gripss"] + shell: + op.as_one_line(""" + gripss -Xms4G -Xmx{params.mem_mb}m + -ref_genome {input.fasta} + -breakend_pon {input.pon_breakend} + -breakpoint_pon {input.pon_breakpoint} + -breakpoint_hotspot {input.known_pairs} + -input_vcf {input.vcf} + -output_vcf {output.vcf} + {params.cli} + {params.opts} + 2>&1 | tee -a {log} + """) + +rule _gridss_filter_gripss: + input: + vcf = str(rules._gridss_run_gripss.output.vcf) + output: + vcf = CFG["dirs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz", + tbi = CFG["dirs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz.tbi" + conda: + CFG["conda_envs"]["bcftools"] + shell: + op.as_one_line(""" + zcat {input.vcf} | + awk '$7 == "PASS" || $1 ~ /^#/ ' | + bcftools view -Oz -o {output.vcf} && + tabix -p vcf {output.vcf} + """) + +rule _gridss_gripss_to_bedpe: + input: + vcf = str(rules._gridss_filter_gripss.output.vcf) + output: + bedpe = CFG["dirs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.bedpe" + conda: + CFG["conda_envs"]["svtools"] + shell: + op.as_one_line(""" + zcat {input.vcf} | + awk '$1 ~ /^#/ || $5 ~ /:/' | + svtools vcftobedpe | grep -v "##" > {output.bedpe} + """) + + +# Symlink the final output files into the module results directory (under '99-outputs/') +rule _gridss_output_somatic_vcf: + input: + filtered = str(rules._gridss_filter_gripss.output.vcf), + filtered_tbi = str(rules._gridss_filter_gripss.output.tbi), + somatic = str(rules._gridss_run_gripss.output.vcf), + somatic_tbi = str(rules._gridss_run_gripss.output.tbi), + bedpe = str(rules._gridss_gripss_to_bedpe.output.bedpe) + output: + somatic = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic.vcf.gz", + somatic_tbi = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic.vcf.gz.tbi", + filtered = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic_filtered.vcf.gz", + filtered_tbi = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic_filtered.vcf.gz.tbi", + bedpe = CFG["dirs"]["outputs"] + "bedpe/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic_filtered.bedpe" + run: + op.relative_symlink(input.somatic, output.somatic, in_module=True) + op.relative_symlink(input.somatic_tbi, output.somatic_tbi, in_module=True) + op.relative_symlink(input.filtered, output.filtered, in_module=True) + op.relative_symlink(input.filtered_tbi, output.filtered_tbi, in_module=True) + op.relative_symlink(input.bedpe, output.bedpe, in_module=True) + + + + +# Generates the target sentinels for each run, which generate the symlinks +rule _gridss_all: + input: + expand( + [ + str(rules._gridss_output_somatic_vcf.output.filtered), + str(rules._gridss_output_somatic_vcf.output.filtered_tbi), + str(rules._gridss_output_somatic_vcf.output.somatic), + str(rules._gridss_output_somatic_vcf.output.somatic_tbi), + str(rules._gridss_output_somatic_vcf.output.bedpe) + ], + zip, # Run expand() with zip(), not product() + seq_type=CFG["runs"]["tumour_seq_type"], + genome_build=CFG["runs"]["tumour_genome_build"], + tumour_id=CFG["runs"]["tumour_sample_id"], + normal_id=CFG["runs"]["normal_sample_id"], + pair_status=CFG["runs"]["pair_status"] + ) + + +##### CLEANUP ##### + + +# Perform some clean-up tasks, including storing the module-specific +# configuration on disk and deleting the `CFG` variable +op.cleanup_module(CFG) diff --git a/modules/gridss/2.0/schemas/base-1.0.yaml b/modules/gridss/2.0/schemas/base-1.0.yaml new file mode 120000 index 00000000..0a69d1ce --- /dev/null +++ b/modules/gridss/2.0/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/gridss/CHANGELOG.md b/modules/gridss/CHANGELOG.md index f8be466c..b260775b 100644 --- a/modules/gridss/CHANGELOG.md +++ b/modules/gridss/CHANGELOG.md @@ -5,6 +5,11 @@ All notable changes to the `gridss` module will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [2.0] - 2021-12-29 +This release was authored by Laura Hilton. +- Implementing joint calling per patient for multi-timepoint samples. +- Module updates enable CRAM support. + ## [1.1] - 2020-10-09 This release was authored by Laura Hilton. See the [GRIDSS man page](https://github.com/PapenfussLab/gridss) for extensive documentation. - Add automatic reference file downloading from files hosted at the BCGSC [downloads page](https://bcgsc.ca/downloads/morinlab/hmftools-references/gridss/). diff --git a/modules/hmftools/1.0/hmftools.smk b/modules/hmftools/1.0/hmftools.smk index f8163d26..352ba893 100644 --- a/modules/hmftools/1.0/hmftools.smk +++ b/modules/hmftools/1.0/hmftools.smk @@ -15,6 +15,26 @@ # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["hmftools"]` CFG = op.setup_module( @@ -40,23 +60,19 @@ localrules: _hmftools_all -VERSION_MAP = { +HMFTOOLS_VERSION_MAP = { "grch37": "hg19", "hs37d5": "hg19", "hg38": "hg38" } -possible_genome_builds = VERSION_MAP.keys() +possible_genome_builds = HMFTOOLS_VERSION_MAP.keys() for genome_build in CFG["runs"]["tumour_genome_build"]: assert genome_build in possible_genome_builds, ( "Samples table includes genome builds not yet compatible with this module. " "This module is currently only compatible with {possible_genome_builds}. " ) -wildcard_constraints: - genome_build = "|".join(VERSION_MAP.keys()), - pair_status = "matched|unmatched" - ##### RULES ##### @@ -69,9 +85,12 @@ rule _hmftools_input_bam: output: bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bai", + wildcard_constraints: + genome_build = "|".join(HMFTOOLS_VERSION_MAP.keys()), + pair_status = "matched|unmatched" run: - op.relative_symlink(input.bam, output.bam) - op.relative_symlink(input.bai, output.bai) + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) rule _hmftools_input_strelka: input: @@ -79,7 +98,7 @@ rule _hmftools_input_strelka: output: strelka_vcf = CFG["dirs"]["inputs"] + "strelka_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic.combined.vcf.gz" run: - op.relative_symlink(input.strelka_vcf, output.strelka_vcf) + op.absolute_symlink(input.strelka_vcf, output.strelka_vcf) rule _hmftools_input_gridss: input: @@ -93,10 +112,10 @@ rule _hmftools_input_gridss: gridss_filtered_vcf = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz", gridss_filtered_tbi = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz.tbi" run: - op.relative_symlink(input.gridss_somatic_vcf, output.gridss_somatic_vcf) - op.relative_symlink(input.gridss_somatic_tbi, output.gridss_somatic_tbi) - op.relative_symlink(input.gridss_filtered_vcf, output.gridss_filtered_vcf) - op.relative_symlink(input.gridss_filtered_tbi, output.gridss_filtered_tbi) + op.absolute_symlink(input.gridss_somatic_vcf, output.gridss_somatic_vcf) + op.absolute_symlink(input.gridss_somatic_tbi, output.gridss_somatic_tbi) + op.absolute_symlink(input.gridss_filtered_vcf, output.gridss_filtered_vcf) + op.absolute_symlink(input.gridss_filtered_tbi, output.gridss_filtered_tbi) # Rules to download and setup reference files @@ -121,7 +140,7 @@ rule _hmftools_get_cobalt_gc: gc = CFG["dirs"]["inputs"] + "references/{genome_build}/cobalt/GC_profile.1000bp.cnp" params: url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/cobalt", - alt_build = lambda w: VERSION_MAP[w.genome_build] + alt_build = lambda w: HMFTOOLS_VERSION_MAP[w.genome_build] conda: CFG["conda_envs"]["wget"] shell: @@ -133,7 +152,7 @@ rule _hmftools_get_amber_snps: snpcheck = CFG["dirs"]["inputs"] + "references/{genome_build}/amber/GermlineHetPon.snpcheck.vcf.gz" params: url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/amber", - alt_build = lambda w: VERSION_MAP[w.genome_build] + alt_build = lambda w: HMFTOOLS_VERSION_MAP[w.genome_build] conda: CFG["conda_envs"]["wget"] shell: @@ -146,7 +165,7 @@ rule _hmftools_get_purple_drivers: gene_panel = CFG["dirs"]["inputs"] + "references/{genome_build}/purple/DriverGenePanel.tsv" params: url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/purple", - alt_build = lambda w: VERSION_MAP[w.genome_build] + alt_build = lambda w: HMFTOOLS_VERSION_MAP[w.genome_build] conda: CFG["conda_envs"]["wget"] shell: @@ -493,7 +512,7 @@ rule _hmftools_linx: resources: **CFG["resources"]["linx"] params: - alt_build = lambda w: VERSION_MAP[w.genome_build], + alt_build = lambda w: HMFTOOLS_VERSION_MAP[w.genome_build], ensembl_build = lambda w: { "grch37": "HG37", "hs37d5": "HG37", @@ -628,7 +647,7 @@ rule _hmftools_purple_output: output: files = CFG["dirs"]["outputs"] + "purple_output/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.purple.{out_file}" run: - op.relative_symlink(input.files, output.files) + op.relative_symlink(input.files, output.files, in_module=True) rule _hmftools_purple_plots: input: @@ -636,7 +655,7 @@ rule _hmftools_purple_plots: output: plots = CFG["dirs"]["outputs"] + "purple_plots/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.{plot_name}.png" run: - op.relative_symlink(input.plots, output.plots) + op.relative_symlink(input.plots, output.plots, in_module=True) rule _hmftools_linx_plots: diff --git a/modules/hmftools/1.1/config/default.yaml b/modules/hmftools/1.1/config/default.yaml new file mode 100644 index 00000000..5df6e668 --- /dev/null +++ b/modules/hmftools/1.1/config/default.yaml @@ -0,0 +1,101 @@ +lcr-modules: + + hmftools: + + # TODO: Update the list of available wildcards, if applicable + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + sample_bam: "__UPDATE__" + sample_bai: "__UPDATE__" + # Available wildcards: {seq_type} {genome_build} {tumour_id} + # Note: SLMS-3 outputs are recommended. + # The pipeline will take any VCF where the samples are labeled 'TUMOR' and 'NORMAL', + # and where the VCF is annotated with "AD" and "DP" fields. + # Must be in bgzip with `.vcf.gz` extension. + slms3_vcf: "__UPDATE__" + # Available wildcards: {seq_type} {genome_build} {sample_id} + # Note: These are output by the gripss somatic filtering step of the gridss module + gridss_somatic: "__UPDATE__" # Output of GRIPSS + gridss_somatic_tbi: "__UPDATE" + gridss_somatic_filtered: "__UPDATE__" # Filtered output of GRIPSS + gridss_somatic_filtered_tbi: "__UPDATE__" + + scratch_subdirectories: [] + + switches: + ensembl_url: + '37': "mysql://ensembldb.ensembl.org:3337/homo_sapiens_core_89_37" + '38': "mysql://ensembldb.ensembl.org:3306/homo_sapiens_core_98_38" + + options: + use_masked_ref: False + amber: + -validation_stringency SILENT + cobalt: + -validation_stringency SILENT + purple: "" + linx: "" + linx_viz: + -fusion_legend_height_per_row 70 + -segment_relative_size 0.5 + -outer_radius 0.85 + -min_line_size 4 + -max_line_size 18 + -min_label_size 45 + -max_label_size 50 + -glyph_size 25 + -exon_rank_radius 0.04 + -max_gene_characters 15 + linx_viz_annotate: + -fusion_legend_height_per_row 70 + -segment_relative_size 0.5 + -outer_radius 0.85 + -min_line_size 4 + -max_line_size 18 + -min_label_size 45 + -max_label_size 50 + -glyph_size 25 + -exon_rank_radius 0.04 + -max_gene_characters 15 + + conda_envs: + samtools: "{MODSDIR}/envs/samtools-1.9.yaml" + wget: "{MODSDIR}/envs/wget-1.20.1.yaml" + bcftools: "{MODSDIR}/envs/bcftools-1.10.2.yaml" + amber: "{MODSDIR}/envs/hmftools-amber-3.5.yaml" + cobalt: "{MODSDIR}/envs/hmftools-cobalt-1.11.yaml" + purple: "{MODSDIR}/envs/hmftools-purple-2.54.yaml" + linx: "{MODSDIR}/envs/hmftools-linx-1.15.yaml" + linx_annotate: "{MODSDIR}/envs/hmftools-linx-1.15.yaml" + snpeff: "{MODSDIR}/envs/snpeff-4.3.1t.yaml" + + threads: + vcf_sample_names: 1 + snpeff: 4 + amber: 16 + cobalt: 16 + purple: 8 + linx: 2 + linx_viz: 8 + + resources: + vcf_sample_names: + mem_mb: 1000 + snpeff: + mem_mb: 5000 + amber: + mem_mb: 36000 + cobalt: + mem_mb: 20000 + purple: + mem_mb: 20000 + linx: + mem_mb: 10000 + linx_viz: + mem_mb: 20000 + + pairing_config: + genome: + run_paired_tumours: True + run_unpaired_tumours_with: "unmatched_normal" + run_paired_tumours_as_unpaired: False diff --git a/modules/hmftools/1.1/envs/bcftools-1.10.2.yaml b/modules/hmftools/1.1/envs/bcftools-1.10.2.yaml new file mode 120000 index 00000000..72959e7b --- /dev/null +++ b/modules/hmftools/1.1/envs/bcftools-1.10.2.yaml @@ -0,0 +1 @@ +../../../../envs/bcftools/bcftools-1.10.2.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-amber-3.4.yaml b/modules/hmftools/1.1/envs/hmftools-amber-3.4.yaml new file mode 120000 index 00000000..fac6fa12 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-amber-3.4.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-amber-3.4.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-amber-3.5.yaml b/modules/hmftools/1.1/envs/hmftools-amber-3.5.yaml new file mode 120000 index 00000000..71dfe9fb --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-amber-3.5.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-amber-3.5.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-cobalt-1.11.yaml b/modules/hmftools/1.1/envs/hmftools-cobalt-1.11.yaml new file mode 120000 index 00000000..d671910e --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-cobalt-1.11.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-cobalt-1.11.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-cobalt-1.8.yaml b/modules/hmftools/1.1/envs/hmftools-cobalt-1.8.yaml new file mode 120000 index 00000000..eb143618 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-cobalt-1.8.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-cobalt-1.8.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-cobalt-1.9.yaml b/modules/hmftools/1.1/envs/hmftools-cobalt-1.9.yaml new file mode 120000 index 00000000..8c4af7ac --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-cobalt-1.9.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-cobalt-1.9.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-linx-1.10.yaml b/modules/hmftools/1.1/envs/hmftools-linx-1.10.yaml new file mode 120000 index 00000000..6383f883 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-linx-1.10.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-linx-1.10.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-linx-1.11.yaml b/modules/hmftools/1.1/envs/hmftools-linx-1.11.yaml new file mode 120000 index 00000000..09c92d66 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-linx-1.11.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-linx-1.11.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-linx-1.15.yaml b/modules/hmftools/1.1/envs/hmftools-linx-1.15.yaml new file mode 120000 index 00000000..f789beed --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-linx-1.15.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-linx-1.15.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-purple-2.44.yaml b/modules/hmftools/1.1/envs/hmftools-purple-2.44.yaml new file mode 120000 index 00000000..43d663aa --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-purple-2.44.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-purple-2.44.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-purple-2.45.yaml b/modules/hmftools/1.1/envs/hmftools-purple-2.45.yaml new file mode 120000 index 00000000..dd0a26c5 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-purple-2.45.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-purple-2.45.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-purple-2.48.yaml b/modules/hmftools/1.1/envs/hmftools-purple-2.48.yaml new file mode 120000 index 00000000..bb543853 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-purple-2.48.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-purple-2.48.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-purple-2.54.yaml b/modules/hmftools/1.1/envs/hmftools-purple-2.54.yaml new file mode 120000 index 00000000..66e18c15 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-purple-2.54.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-purple-2.54.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/samtools-1.9.yaml b/modules/hmftools/1.1/envs/samtools-1.9.yaml new file mode 120000 index 00000000..ab29288b --- /dev/null +++ b/modules/hmftools/1.1/envs/samtools-1.9.yaml @@ -0,0 +1 @@ +../../../../envs/samtools/samtools-1.9.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/snpeff-4.3.1t.yaml b/modules/hmftools/1.1/envs/snpeff-4.3.1t.yaml new file mode 120000 index 00000000..c452e525 --- /dev/null +++ b/modules/hmftools/1.1/envs/snpeff-4.3.1t.yaml @@ -0,0 +1 @@ +../../../../envs/snpeff/snpeff-4.3.1t.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/wget-1.20.1.yaml b/modules/hmftools/1.1/envs/wget-1.20.1.yaml new file mode 120000 index 00000000..86501e72 --- /dev/null +++ b/modules/hmftools/1.1/envs/wget-1.20.1.yaml @@ -0,0 +1 @@ +../../../../envs/wget/wget-1.20.1.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/hmftools.smk b/modules/hmftools/1.1/hmftools.smk new file mode 100644 index 00000000..db45ff9e --- /dev/null +++ b/modules/hmftools/1.1/hmftools.smk @@ -0,0 +1,629 @@ +#!/usr/bin/env snakemake + + +##### ATTRIBUTION ##### + + +# Original Author: Laura Hilton +# Module Author: Laura Hilton +# Contributors: N/A + + +##### SETUP ##### + + +# Import package with useful functions for developing analysis modules +import oncopipe as op + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + +# Setup module and store module-specific configuration in `CFG` +# `CFG` is a shortcut to `config["lcr-modules"]["hmftools"]` +CFG = op.setup_module( + name = "hmftools", + version = "1.1", + subdirectories = ["inputs", "prepare_slms3", "amber", "cobalt", "purple", "linx", "outputs"], +) + +# Define rules to be run locally when using a compute cluster +localrules: + _hmftools_input_bam, + _hmftools_input_slms3, + _hmftools_slms3_sample_names, + _hmftools_input_gridss, + _hmftools_input_references, + _hmftools_get_cobalt_gc, + _hmftools_get_cobalt_bed, + _hmftools_get_amber_snps, + _hmftools_get_purple_drivers, + _hmftools_get_linx_db, + _hmftools_get_ensembl_cache, + _hmftools_purple_output, + _hmftools_purple_plots, + _hmftools_all + + +VERSION_MAP_HMFTOOLS = { + "grch37": "37", + "hs37d5": "37", + "hg38": "38" +} + +possible_genome_builds = VERSION_MAP_HMFTOOLS.keys() +for genome_build in CFG["runs"]["tumour_genome_build"]: + assert genome_build in possible_genome_builds, ( + "Samples table includes genome builds not yet compatible with this module. " + "This module is currently only compatible with {possible_genome_builds}. " + ) + + +masked_string = "" +if CFG["options"]["use_masked_ref"]: + masked_string = "_masked" + + +##### RULES ##### + + +# Symlinks the input files into the module results directory (under '00-inputs/') +rule _hmftools_input_bam: + input: + bam = CFG["inputs"]["sample_bam"], + bai = CFG["inputs"]["sample_bai"], + output: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bai", + group: "input_and_vcf" + wildcard_constraints: + genome_build = "|".join(VERSION_MAP_HMFTOOLS.keys()), + pair_status = "matched|unmatched" + run: + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) + +rule _hmftools_input_slms3: + input: + vcf = CFG["inputs"]["slms3_vcf"], + output: + vcf = CFG["dirs"]["inputs"] + "slms3_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/slms3.vcf.gz" + group: "input_and_vcf" + run: + op.relative_symlink(input.vcf, output.vcf) + +rule _hmftools_input_gridss: + input: + gridss_somatic_vcf = CFG["inputs"]["gridss_somatic"], + gridss_somatic_tbi = CFG["inputs"]["gridss_somatic_tbi"], + gridss_filtered_vcf = CFG["inputs"]["gridss_somatic_filtered"], + gridss_filtered_tbi = CFG["inputs"]["gridss_somatic_filtered_tbi"] + output: + gridss_somatic_vcf = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic.vcf.gz", + gridss_somatic_tbi = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic.vcf.gz.tbi", + gridss_filtered_vcf = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz", + gridss_filtered_tbi = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz.tbi" + group: "input_and_vcf" + run: + op.absolute_symlink(input.gridss_somatic_vcf, output.gridss_somatic_vcf) + op.absolute_symlink(input.gridss_somatic_tbi, output.gridss_somatic_tbi) + op.absolute_symlink(input.gridss_filtered_vcf, output.gridss_filtered_vcf) + op.absolute_symlink(input.gridss_filtered_tbi, output.gridss_filtered_tbi) + +# Rules to download and setup reference files + +rule _hmftools_input_references: + input: + genome_fa = reference_files("genomes/{genome_build}" + masked_string + "/genome_fasta/genome.fa"), + genome_fai = reference_files("genomes/{genome_build}" + masked_string + "/genome_fasta/genome.fa.fai"), + genome_dict = reference_files("genomes/{genome_build}" + masked_string + "/genome_fasta/genome.dict") + output: + genome_fa = CFG["dirs"]["inputs"] + "references/{genome_build}" + masked_string + "/genome_fa/genome.fa", + genome_fai = CFG["dirs"]["inputs"] + "references/{genome_build}" + masked_string + "/genome_fa/genome.fa.fai", + genome_dict = CFG["dirs"]["inputs"] + "references/{genome_build}" + masked_string + "/genome_fa/genome.dict" + shell: + op.as_one_line(""" + ln -s {input.genome_fa} {output.genome_fa} && + ln -s {input.genome_fai} {output.genome_fai} && + ln -s {input.genome_dict} {output.genome_dict} + """) + +rule _hmftools_get_cobalt_gc: + output: + gc = CFG["dirs"]["inputs"] + "references/{genome_build}/cobalt/GC_profile.1000bp.cnp" + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/cobalt", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -O {output.gc} {params.url}/GC_profile.1000bp.{params.alt_build}.cnp' + +rule _hmftools_get_cobalt_bed: + output: + bed = CFG["dirs"]["inputs"] + "references/{genome_build}/cobalt/DiploidRegions.bed" + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/cobalt", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -O {output.bed} {params.url}/DiploidRegions.{params.alt_build}.bed' + +rule _hmftools_get_amber_snps: + output: + vcf = CFG["dirs"]["inputs"] + "references/{genome_build}/amber/GermlineHetPon.vcf.gz", + snpcheck = CFG["dirs"]["inputs"] + "references/{genome_build}/amber/Amber.snpcheck.vcf.gz" + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/amber", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -O {output.vcf} {params.url}/GermlineHetPon.{params.alt_build}.vcf.gz; ' + 'wget -O {output.snpcheck} {params.url}/Amber.snpcheck.{params.alt_build}.vcf' + +rule _hmftools_get_purple_drivers: + output: + hotspots = CFG["dirs"]["inputs"] + "references/{genome_build}/purple/KnownHotspots.vcf.gz", + gene_panel = CFG["dirs"]["inputs"] + "references/{genome_build}/purple/DriverGenePanel.tsv" + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/purple", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -O {output.hotspots} {params.url}/KnownHotspots.somatic.{params.alt_build}.vcf.gz && ' + 'wget -O {output.hotspots}.tbi {params.url}/KnownHotspots.somatic.{params.alt_build}.vcf.gz.tbi && ' + 'wget -O {output.gene_panel} {params.url}/DriverGenePanel.{params.alt_build}.tsv' + +rule _hmftools_get_linx_db: + output: + directory(CFG["dirs"]["inputs"] + "references/{genome_build}/linx_db") + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/linx/Linx", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -r -np -nd -P {output} -A .bed,.csv {params.url}/{params.alt_build} && ' + 'wget -O {output}/viral_host_ref.csv {params.url}/viral_host_ref.csv' + +rule _hmftools_get_ensembl_cache: + output: + cache = directory(CFG["dirs"]["inputs"] + "references/{genome_build}/ensembl_cache/"), + complete = touch(CFG["dirs"]["inputs"] + "references/{genome_build}/ensembl_cache/cache.complete") + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/ensembl_data_cache", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -O {output.cache}/{params.alt_build}.zip {params.url}/{params.alt_build}.zip && ' + 'unzip -d {output.cache} {output.cache}/{params.alt_build}.zip' + +# Prepare SLMS-3 VCF files for use with PURPLE +# SnpEff annotation enables driver discovery logic + +rule _hmftools_slms3_sample_names: + input: + vcf = rules._hmftools_input_slms3.output.vcf + output: + vcf = temp(CFG["dirs"]["prepare_slms3"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/tmp.slms3.vcf") + log: CFG["dirs"]["prepare_slms3"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/vcf_sample_names.log" + conda: + CFG["conda_envs"]["bcftools"] + threads: CFG["threads"]["vcf_sample_names"] + resources: + **CFG["resources"]["vcf_sample_names"] + group: "input_and_vcf" + shell: + op.as_one_line(""" + bcftools view -Ov {input.vcf} | + sed 's/TUMOR/{wildcards.tumour_id}/g' | + sed 's/NORMAL/{wildcards.normal_id}/g' + > {output.vcf} + """) + +rule _hmftools_snpeff_vcf: + input: + vcf = str(rules._hmftools_slms3_sample_names.output.vcf) + output: + sample_key = temp(CFG["dirs"]["prepare_slms3"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/sample_key.txt"), + vcf = temp(CFG["dirs"]["prepare_slms3"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/slms3.snpeff.vcf.gz") + resources: + **CFG["resources"]["snpeff"] + params: + snpeff_build = lambda w: { + "grch37": "GRCh37.75", + "hs37d5": "GRCh37.75", + "hg38": "hg38" + }[w.genome_build], + config = "$(readlink -e $(which snpEff)).config", + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + log: + CFG["logs"]["prepare_slms3"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/snpeff_slms3.log" + conda: + CFG["conda_envs"]["snpeff"] + threads: + CFG["threads"]["snpeff"] + shell: + op.as_one_line(""" + printf "{wildcards.normal_id}\t{wildcards.tumour_id}\n" > {output.sample_key} && + snpEff -Xmx{params.mem_mb}m + -c {params.config} -noStats + -cancer -cancerSamples {output.sample_key} + {params.snpeff_build} {input.vcf} | + bcftools view -Oz -o {output.vcf} - && + bcftools index -t {output.vcf} + """) + + +# Run AMBER to calculate BAFs +rule _hmftools_amber_matched: + input: + tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", + normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", + snps = str(rules._hmftools_get_amber_snps.output.vcf), + fasta = str(rules._hmftools_input_references.output.genome_fa) + output: + vcf = CFG["dirs"]["amber"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.amber.baf.vcf.gz" + resources: + **CFG["resources"]["amber"] + params: + options = CFG["options"]["amber"], + jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8) + log: CFG["logs"]["amber"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/amber.log" + wildcard_constraints: + pair_status = "matched" + conda: + CFG["conda_envs"]["amber"] + threads: + CFG["threads"]["amber"] + shell: + op.as_one_line(""" + AMBER -Xmx{params.jvmheap}m + -reference {wildcards.normal_id} -reference_bam {input.normal_bam} + -tumor {wildcards.tumour_id} -tumor_bam {input.tumour_bam} + -output_dir `dirname {output.vcf}` + -threads {threads} + -loci {input.snps} + -ref_genome {input.fasta} + {params.options} + 2>&1 | tee -a {log} + """) + +rule _hmftools_amber_unmatched: + input: + tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", + normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", + snps = str(rules._hmftools_get_amber_snps.output.vcf), + fasta = str(rules._hmftools_input_references.output.genome_fa) + output: + vcf = CFG["dirs"]["amber"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.amber.baf.vcf.gz" + resources: + **CFG["resources"]["amber"] + params: + options = CFG["options"]["amber"], + jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8) + log: CFG["logs"]["amber"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/amber.log" + wildcard_constraints: + pair_status = "unmatched" + conda: + CFG["conda_envs"]["amber"] + threads: + CFG["threads"]["amber"] + shell: + op.as_one_line(""" + AMBER -Xmx{params.jvmheap}m + -tumor_only + -tumor {wildcards.tumour_id} -tumor_bam {input.tumour_bam} + -output_dir `dirname {output.vcf}` + -threads {threads} + -loci {input.snps} + -ref_genome {input.fasta} + {params.options} + 2>&1 | tee -a {log} + """) + +# Run COBALT to estimate depth across the genome +rule _hmftools_cobalt: + input: + tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", + normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", + gc_profile = str(rules._hmftools_get_cobalt_gc.output.gc), + fasta = str(rules._hmftools_input_references.output.genome_fa) + output: + tumour_ratio = CFG["dirs"]["cobalt"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.cobalt.ratio.pcf", + normal_ratio = CFG["dirs"]["cobalt"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{normal_id}.cobalt.ratio.pcf", + tumour_tsv = temp(CFG["dirs"]["cobalt"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.cobalt.ratio.tsv"), + log: ratio = CFG["logs"]["cobalt"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/cobalt.log" + resources: + **CFG["resources"]["cobalt"] + params: + options = CFG["options"]["cobalt"], + jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8) + wildcard_constraints: + pair_status = "matched|unmatched" + conda: + CFG["conda_envs"]["cobalt"] + threads: + CFG["threads"]["cobalt"] + shell: + op.as_one_line(""" + COBALT -Xmx{params.jvmheap}m + -reference {wildcards.normal_id} -reference_bam {input.normal_bam} + -tumor {wildcards.tumour_id} -tumor_bam {input.tumour_bam} + -ref_genome {input.fasta} + -output_dir `dirname {output.tumour_ratio}` + -threads {threads} + -gc_profile {input.gc_profile} + {params.options} + 2>&1 | tee -a {log} + """) + + +# Run PURPLE for final CNV calling + +# Define variables for output file names +purple_out = [ + "purity.tsv", + "purity.range.tsv", + "cnv.gene.tsv", + "sv.vcf.gz", +] +purple_plots = [ + "circos", + "input", + "map", + "purity.range", + "segment" + ] + +rule _hmftools_purple_matched: + input: + amber = CFG["dirs"]["amber"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.amber.baf.vcf.gz", + cobalt_tumour = str(rules._hmftools_cobalt.output.tumour_ratio), + cobalt_normal = str(rules._hmftools_cobalt.output.normal_ratio), + cobalt_tumour_tsv = str(rules._hmftools_cobalt.output.tumour_tsv), + slms3_vcf = str(rules._hmftools_snpeff_vcf.output.vcf), + gridss_somatic_vcf = str(rules._hmftools_input_gridss.output.gridss_somatic_vcf), + gridss_filtered_vcf = str(rules._hmftools_input_gridss.output.gridss_filtered_vcf), + reference_fa = str(rules._hmftools_input_references.output.genome_fa), + gene_panel = str(rules._hmftools_get_purple_drivers.output.gene_panel), + hotspots = str(rules._hmftools_get_purple_drivers.output.hotspots), + gc_profile = str(rules._hmftools_get_cobalt_gc.output.gc) + output: + files = expand(CFG["dirs"]["purple"] + "{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}/{{tumour_id}}.purple.{out_file}", + out_file = purple_out), + plots = expand(CFG["dirs"]["purple"] + "{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}/plot/{{tumour_id}}.{plot_name}.png", + plot_name = purple_plots) + log: CFG["logs"]["purple"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/purple.log" + resources: + **CFG["resources"]["purple"] + params: + outdir = CFG["dirs"]["purple"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}", + options = CFG["options"]["purple"], + circos = "`which circos`", + jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.9) + wildcard_constraints: + pair_status = "matched|unmatched", + out_file = "|".join(purple_out), + plot_name = "|".join(purple_plots) + conda: + CFG["conda_envs"]["purple"] + threads: + CFG["threads"]["purple"] + shell: + op.as_one_line(""" + PURPLE -Xmx{params.jvmheap}m -driver_catalog + -reference {wildcards.normal_id} + -tumor {wildcards.tumour_id} + -output_dir {params.outdir} + -amber `dirname {input.amber}` + -cobalt `dirname {input.cobalt_tumour}` + -gc_profile {input.gc_profile} + -ref_genome {input.reference_fa} + -somatic_hotspots {input.hotspots} + -driver_gene_panel {input.gene_panel} + -somatic_vcf {input.slms3_vcf} + -structural_vcf {input.gridss_filtered_vcf} + -sv_recovery_vcf {input.gridss_somatic_vcf} + -circos {params.circos} + {params.options} + -threads {threads} + 2>&1 | tee -a {log} + """) + + + + +# Run LINX to cluster and visualize CNV and SV data +rule _hmftools_linx: + input: + purple_vcf = CFG["dirs"]["purple"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.purple.sv.vcf.gz", + ensembl_cache = str(rules._hmftools_get_ensembl_cache.output.cache), + linx_db = str(rules._hmftools_get_linx_db.output) + output: + clusters = CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.linx.vis_sv_data.tsv", + svs = CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.linx.svs.tsv" + log: CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/linx.log" + resources: + **CFG["resources"]["linx"] + params: + ref_genome_version = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build], + jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8), + options = CFG["options"]["linx"], + cache_subdir = lambda w: config["lcr-modules"]["hmftools"]["dirs"]["inputs"] + "references/" + w.genome_build + "/ensembl_cache/" + VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["linx"] + threads: + CFG["threads"]["linx"] + shell: + op.as_one_line(""" + linx -Xmx{params.jvmheap}m + -sample {wildcards.tumour_id} + -ref_genome_version {params.ref_genome_version} + -sv_vcf {input.purple_vcf} + -purple_dir `dirname {input.purple_vcf}` + -output_dir `dirname {output.clusters}` + -gene_transcripts_dir {params.cache_subdir} + -fragile_site_file {input.linx_db}/fragile_sites_hmf.{params.ref_genome_version}.csv + -line_element_file {input.linx_db}/line_elements.{params.ref_genome_version}.csv + -viral_hosts_file {input.linx_db}/viral_host_ref.csv + -known_fusion_file {input.linx_db}/known_fusion_data.{params.ref_genome_version}.csv + -check_fusions + -check_drivers + -write_vis_data + {params.options} + 2>&1 | tee -a {log} + """) + +rule _hmftools_linx_viz: + input: + clusters = rules._hmftools_linx.output.clusters, + svs = rules._hmftools_linx.output.svs, + ensembl_cache = str(rules._hmftools_get_ensembl_cache.output.cache) + output: + plots = directory(CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/plot"), + data = directory(CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/data") + log: CFG["logs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/linx_viz.log" + resources: + **CFG["resources"]["linx_viz"] + params: + linx_jar = "$(ls $(dirname $(readlink -e $(which linx)))/*.jar)", + circos = "$(which circos)", + jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8), + options = CFG["options"]["linx_viz"], + cache_subdir = lambda w: config["lcr-modules"]["hmftools"]["dirs"]["inputs"] + "references/" + w.genome_build + "/ensembl_cache/" + VERSION_MAP_HMFTOOLS[w.genome_build], + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["linx"] + threads: + CFG["threads"]["linx_viz"] + + shell: + op.as_one_line(""" + to_plot=$(dirname {input.svs})/to_plot.tsv; + tail -n +2 {input.svs} | awk '{{FS=OFS="\\t"}} $4 != "" {{print $3}}' | sort | uniq > $to_plot; + if [[ $(cat $to_plot | wc -l) -lt 50 ]]; then + cat $to_plot | while read cluster; do + java -Xmx{params.jvmheap}m -cp {params.linx_jar} com.hartwig.hmftools.linx.visualiser.SvVisualiser + -sample {wildcards.tumour_id} + -ref_genome_version V{params.alt_build} + -gene_transcripts_dir {params.cache_subdir} + -plot_out {output.plots} + -data_out {output.data} + -vis_file_dir $(dirname {input.clusters}) + -circos {params.circos} + -threads {threads} + -clusterId $cluster + -plot_cluster_genes + 2>&1 | tee -a {log}; + done; + else + echo "Too many clusters to plot for {wildcards.tumour_id}--{wildcards.normal_id}--{wildcards.pair_status}. See chromosome outputs and consider manually selecting clusters to plot. " 2>&1 | tee -a {log}; + fi; + for chrom in $(tail -n +2 {input.clusters} | cut -f8 | sort | uniq); do + java -Xmx{params.jvmheap}m -cp {params.linx_jar} com.hartwig.hmftools.linx.visualiser.SvVisualiser + -sample {wildcards.tumour_id} + -ref_genome_version V{params.alt_build} + -gene_transcripts_dir {params.cache_subdir} + -plot_out {output.plots} + -data_out {output.data} + -vis_file_dir $(dirname {input.clusters}) + -circos {params.circos} + -threads {threads} + -chromosome ${{chrom}} + 2>&1 | tee -a {log}; + done + """) + + + + +# Symlinks the final output files into the module results directory (under '99-outputs/') + +rule _hmftools_purple_output: + input: + files = CFG["dirs"]["purple"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.purple.{out_file}" + output: + files = CFG["dirs"]["outputs"] + "purple_output/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.purple.{out_file}" + wildcard_constraints: + out_file = "|".join(purple_out) + run: + op.relative_symlink(input.files, output.files, in_module=True) + +rule _hmftools_purple_plots: + input: + plots = CFG["dirs"]["purple"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/plot/{tumour_id}.{plot_name}.png" + output: + plots = CFG["dirs"]["outputs"] + "purple_plots/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.{plot_name}.png" + wildcard_constraints: + plot_name = "|".join(purple_plots) + run: + op.relative_symlink(input.plots, output.plots, in_module=True) + + +rule _hmftools_linx_plots: + input: + plots = CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/plot", + output: + plots = CFG["dirs"]["outputs"] + "linx_plots/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.symlinked" + shell: + op.as_one_line(""" + workdir=$PWD && + cd `dirname {output.plots}` && + find $workdir/{input.plots} -type f -name "*.png" -exec cp -s {{}} . \; && + touch $workdir/{output.plots} && + cd $workdir + """) + +rule _hmftools_dispatch: + input: + files = expand(CFG["dirs"]["outputs"] + "purple_output/{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}.purple.{out_file}", + out_file = purple_out), + plots = expand(CFG["dirs"]["outputs"] + "purple_plots/{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}.{plot_name}.png", + plot_name = purple_plots), + linx = rules._hmftools_linx_plots.output.plots + output: + dispatched = touch(CFG["dirs"]["outputs"] + "dispatched/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.dispatched") + + +# Generates the target sentinels for each run, which generate the symlinks +rule _hmftools_all: + input: + expand( + [ + str(rules._hmftools_dispatch.output.dispatched), + ], + zip, # Run expand() with zip(), not product() + seq_type=CFG["runs"]["tumour_seq_type"], + genome_build=CFG["runs"]["tumour_genome_build"], + tumour_id=CFG["runs"]["tumour_sample_id"], + normal_id=CFG["runs"]["normal_sample_id"], + pair_status=CFG["runs"]["pair_status"]) + + +##### CLEANUP ##### + + +# Perform some clean-up tasks, including storing the module-specific +# configuration on disk and deleting the `CFG` variable +op.cleanup_module(CFG) diff --git a/modules/hmftools/1.1/schemas/base-1.0.yaml b/modules/hmftools/1.1/schemas/base-1.0.yaml new file mode 120000 index 00000000..0a69d1ce --- /dev/null +++ b/modules/hmftools/1.1/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/hmftools/CHANGELOG.md b/modules/hmftools/CHANGELOG.md index 7239dbca..1c038d33 100644 --- a/modules/hmftools/CHANGELOG.md +++ b/modules/hmftools/CHANGELOG.md @@ -5,6 +5,10 @@ All notable changes to the `hmftools` module will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.1] - 2021-12-29 + +- Updates to the GRIDSS-PURPLE-LINX pipeline, incorporating new and better ways of handling unmatched tumours. + ## [1.0] - 2020-07-29 This release was authored by Laura Hilton. diff --git a/modules/ichorcna/1.0/ichorcna.smk b/modules/ichorcna/1.0/ichorcna.smk index 74c54f7d..85750346 100644 --- a/modules/ichorcna/1.0/ichorcna.smk +++ b/modules/ichorcna/1.0/ichorcna.smk @@ -70,7 +70,7 @@ rule _install_ichorcna: CFG["conda_envs"]["ichorcna"] shell: op.as_one_line(""" - git clone git@github.com:broadinstitute/ichorCNA.git {params.outdir} && + git clone git://github.com/broadinstitute/ichorCNA.git {params.outdir} && touch {output.complete}""") # This defines the script/extdata directory used by ichorCNA in the subsequent rules: @@ -216,14 +216,13 @@ rule _ichorcna_read_counter: "readCounter {input.bam} -c {params.chrs} -w {params.binSize} -q {params.qual} > {output} 2> {log}" -# This function will return a comma-separated list of chromosomes to include in readCounter +# This function will return a comma-separated list of chromosomes to include in runIchorCNA def get_chromosomes_R(wildcards): chromosomesR=[] stringStart="c('" for i in range(1,23): chromosomesR.append(str(i)) chromosomesR.append("X") - chromosomesR.append("Y") if "38" in str(wildcards.genome_build): chromosomesR = ["chr" + x for x in chromosomesR] chromosomesR= "','".join(chromosomesR) diff --git a/modules/ichorcna/1.1/config/default.yaml b/modules/ichorcna/1.1/config/default.yaml new file mode 100644 index 00000000..4844ba01 --- /dev/null +++ b/modules/ichorcna/1.1/config/default.yaml @@ -0,0 +1,115 @@ +lcr-modules: + + ichorcna: + + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + sample_bam: "__UPDATE__" + sample_bai: "__UPDATE__" + + + scratch_subdirectories: [] + + options: + deeptools: + qual: 20 # only includes reads with mapping quality greater than 20 + binSize: 1000000 # set window size to compute coverage + # available binSizes are: 1000000, 500000, 50000, 10000 + flagExclude: 1028 + opt: " --ignoreDuplicates --extendReads " + run: + ichorCNA_libdir: "" + ichorCNA_rscript: "{MODSDIR}/src/runIchorCNA.R" + # use panel matching same bin size (optional) + ichorCNA_normalPanel: + "1000000": "inst/extdata/HD_ULP_PoN_{genome_build}_1Mb_median_normAutosome_median.rds" + "500000": "inst/extdata/HD_ULP_PoN_{genome_build}_500kb_median_normAutosome_median.rds" + # must use gc wig file corresponding to same binSize (required) + ichorCNA_gcWig: + "1000000": "inst/extdata/gc_{genome_build}_1000kb.wig" + "500000": "inst/extdata/gc_{genome_build}_500kb.wig" + "50000": "inst/extdata/gc_{genome_build}_50kb.wig" + "10000": "inst/extdata/gc_{genome_build}_10kb.wig" + # must use map wig file corresponding to same binSize (required) + ichorCNA_mapWig: + "1000000": "inst/extdata/map_{genome_build}_1000kb.wig" + "500000": "inst/extdata/map_{genome_build}_500kb.wig" + "50000": "inst/extdata/map_{genome_build}_50kb.wig" + "10000": "inst/extdata/map_{genome_build}_10kb.wig" + # use bed file if sample has targeted regions, eg. exome data (optional) + ichorCNA_exons: NULL + ichorCNA_centromere: + grch37: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + hg19: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + hs37d5: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + grch38: "inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt" + hg38: "inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt" + ichorCNA_minMapScore: 0.75 + ichorCNA_fracReadsInChrYForMale: 0.002 # Threshold for fraction of reads in chrY to assign as male + ichorCNA_genomeStyle: # can set this to UCSC or NCBI + grch37: "NCBI" + hg19: "NCBI" + hs37d5: "NCBI" + grch38: "UCSC" + hg38: "UCSC" + # chrs used for training ichorCNA parameters, e.g. tumor fraction. + ichorCNA_chrTrain: + grch37: "c(1:22)" + hg19: "c(1:22)" + hs37d5: "c(1:22)" + grch38: "paste0('chr', c(1:22))" + hg38: "paste0('chr', c(1:22))" + # non-tumor fraction parameter restart values; higher values should be included for cfDNA + ichorCNA_normal: "c(0.5,0.6,0.7,0.8,0.9,0.95)" + # ploidy parameter restart values + ichorCNA_ploidy: "c(2,3,4)" + ichorCNA_estimateNormal: TRUE + ichorCNA_estimatePloidy: TRUE + ichorCNA_estimateClonality: TRUE + # states to use for subclonal CN + ichorCNA_scStates: "c(1,3)" + # set maximum copy number to use + ichorCNA_maxCN: 5 + # TRUE/FALSE to include homozygous deletion state # FALSE for low coverage libraries (ex. 0.1x) ; can turn on for higher coverage data (ex. >10x) + ichorCNA_includeHOMD: FALSE + # Exclude solutions if total length of subclonal CNAs > this fraction of the genome + ichorCNA_maxFracGenomeSubclone: 0.5 + # Exclude solutions if total length of subclonal CNAs > this fraction of total CNA length + ichorCNA_maxFracCNASubclone: 0.7 + # control segmentation - higher (e.g. 0.9999999) leads to higher specificity and fewer segments + # lower (e.g. 0.99) leads to higher sensitivity and more segments + ichorCNA_txnE: 0.9399999 + # control segmentation - higher (e.g. 10000000) leads to higher specificity and fewer segments + # lower (e.g. 100) leads to higher sensitivity and more segments + ichorCNA_txnStrength: 10000 + ichorCNA_plotFileType: "pdf" + ichorCNA_plotYlim: "c(-2,2)" + + + conda_envs: + ichorcna: "{MODSDIR}/envs/ichorcna.env.yaml" + deeptools: "{MODSDIR}/envs/deeptools.env.yaml" + bedops_tools: "{MODSDIR}/envs/bedops_tools.env.yaml" + ucsc-bigwigtowig: "{MODSDIR}/envs/ucsc-bigwigtowig.env.yaml" + + threads: + deeptools: 20 + ucsc: 4 + run: 4 + + resources: + deeptools: + mem_mb: 40000 + bam: 1 + ucsc: + mem_mb: 6000 + bam: 1 + run: + mem_mb: 6000 + bam: 1 + + pairing_config: + genome: + run_paired_tumours: False + run_unpaired_tumours_with: "no_normal" + run_paired_tumours_as_unpaired: True diff --git a/modules/ichorcna/1.1/envs/bedops_tools.env.yaml b/modules/ichorcna/1.1/envs/bedops_tools.env.yaml new file mode 100644 index 00000000..2052a2e0 --- /dev/null +++ b/modules/ichorcna/1.1/envs/bedops_tools.env.yaml @@ -0,0 +1,29 @@ +name: null +channels: + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - bedops=2.4.39 + - bedtools=2.30.0 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.10.26 + - curl=7.80.0 + - krb5=1.19.2 + - libcurl=7.80.0 + - libedit=3.1.20210910 + - libev=4.33 + - libgcc=7.2.0 + - libgcc-ng=9.3.0 + - libgomp=9.3.0 + - libnghttp2=1.46.0 + - libssh2=1.9.0 + - libstdcxx-ng=9.3.0 + - ncurses=6.3 + - openssl=1.1.1l + - samtools=1.7 + - xz=5.2.5 + - zlib=1.2.11 +prefix: /projects/rmorin/projects/tumour_contam/envs/bedops_tools diff --git a/modules/ichorcna/1.1/envs/deeptools.env.yaml b/modules/ichorcna/1.1/envs/deeptools.env.yaml new file mode 100644 index 00000000..fab88374 --- /dev/null +++ b/modules/ichorcna/1.1/envs/deeptools.env.yaml @@ -0,0 +1,76 @@ +name: null +channels: + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - blas=1.0 + - brotli=1.0.9 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.10.26 + - certifi=2021.10.8 + - curl=7.80.0 + - cycler=0.11.0 + - deeptools=3.5.1 + - deeptoolsintervals=0.1.9 + - fonttools=4.25.0 + - freetype=2.11.0 + - giflib=5.2.1 + - intel-openmp=2021.4.0 + - jpeg=9d + - kiwisolver=1.3.1 + - krb5=1.19.2 + - lcms2=2.12 + - ld_impl_linux-64=2.35.1 + - libcurl=7.80.0 + - libdeflate=1.0 + - libedit=3.1.20210910 + - libev=4.33 + - libffi=3.3 + - libgcc-ng=9.3.0 + - libgfortran-ng=7.5.0 + - libgfortran4=7.5.0 + - libgomp=9.3.0 + - libnghttp2=1.46.0 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.2.0 + - libwebp=1.2.0 + - libwebp-base=1.2.0 + - lz4-c=1.9.3 + - matplotlib-base=3.5.0 + - mkl=2021.4.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.1 + - mkl_random=1.2.2 + - munkres=1.0.7 + - ncurses=6.3 + - numpy=1.21.2 + - numpy-base=1.21.2 + - olefile=0.46 + - openssl=1.1.1l + - packaging=21.3 + - pillow=8.4.0 + - pip=21.2.2 + - plotly=4.14.3 + - py2bit=0.3.0 + - pybigwig=0.3.17 + - pyparsing=3.0.4 + - pysam=0.15.3 + - python=3.7.11 + - python-dateutil=2.8.2 + - readline=8.1 + - retrying=1.3.3 + - scipy=1.7.1 + - setuptools=58.0.4 + - six=1.16.0 + - sqlite=3.37.0 + - tk=8.6.11 + - wheel=0.37.0 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.4.9 +prefix: /projects/rmorin/projects/tumour_contam/envs/deeptools diff --git a/modules/ichorcna/1.1/envs/ichorcna.env.yaml b/modules/ichorcna/1.1/envs/ichorcna.env.yaml new file mode 100644 index 00000000..208fd501 --- /dev/null +++ b/modules/ichorcna/1.1/envs/ichorcna.env.yaml @@ -0,0 +1,108 @@ +name: null +channels: + - conda-forge + - dranew + - bioconda + - defaults + - r +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bioconductor-biocgenerics=0.36.0 + - bioconductor-genomeinfodb=1.26.0 + - bioconductor-genomeinfodbdata=1.2.4 + - bioconductor-genomicranges=1.42.0 + - bioconductor-hmmcopy=1.32.0 + - bioconductor-iranges=2.24.0 + - bioconductor-s4vectors=0.28.0 + - bioconductor-xvector=0.30.0 + - bioconductor-zlibbioc=1.36.0 + - bwidget=1.9.14 + - bzip2=1.0.8 + - ca-certificates=2020.12.5 + - cairo=1.16.0 + - curl=7.71.1 + - fontconfig=2.13.1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - graphite2=1.3.13 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.0 + - hmmcopy_utils=0.0.1 + - icu=68.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.17.2 + - ld_impl_linux-64=2.35.1 + - libblas=3.8.0 + - libcblas=3.8.0 + - libcurl=7.71.1 + - libedit=3.1.20191231 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.66.7 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.8.0 + - libopenblas=0.3.10 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.2.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.13 + - libxml2=2.9.10 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1j + - pango=1.42.4 + - pcre=8.44 + - pcre2=10.36 + - pixman=0.40.0 + - pthread-stubs=0.4 + - r-base=4.0.3 + - r-bitops=1.0_6 + - r-data.table=1.14.0 + - r-getopt=1.20.3 + - r-ichorcna=0.2.0 + - r-optparse=1.6.6 + - r-plyr=1.8.6 + - r-rcpp=1.0.6 + - r-rcurl=1.98_1.2 + - readline=8.0 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.0 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.4.9 +prefix: /projects/rmorin/projects/tumour_contam/envs/ichorcna diff --git a/modules/ichorcna/1.1/envs/ucsc-bigwigtowig.env.yaml b/modules/ichorcna/1.1/envs/ucsc-bigwigtowig.env.yaml new file mode 100644 index 00000000..4d035cb8 --- /dev/null +++ b/modules/ichorcna/1.1/envs/ucsc-bigwigtowig.env.yaml @@ -0,0 +1,19 @@ +name: null +channels: + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - ca-certificates=2021.10.26 + - libgcc=7.2.0 + - libgcc-ng=9.3.0 + - libgomp=9.3.0 + - libpng=1.6.37 + - libstdcxx-ng=9.3.0 + - libuuid=1.0.3 + - mysql-connector-c=6.1.6 + - openssl=1.0.2u + - ucsc-bigwigtowig=366 + - zlib=1.2.11 +prefix: /projects/rmorin/projects/tumour_contam/envs/ucsc-bigwigtowig diff --git a/modules/ichorcna/1.1/ichorcna.smk b/modules/ichorcna/1.1/ichorcna.smk new file mode 100644 index 00000000..8391dee5 --- /dev/null +++ b/modules/ichorcna/1.1/ichorcna.smk @@ -0,0 +1,431 @@ +#!/usr/bin/env snakemake + + +# ---------------------------------------------------------------------------- # +##### ATTRIBUTION ##### +# ---------------------------------------------------------------------------- # + +# Original snakemake author: Jasper Wong +# Module author: Jasper Wong +# Additional contributors: N/A + + +# ---------------------------------------------------------------------------- # +##### SETUP ##### +# ---------------------------------------------------------------------------- # + +### Modules ### + +import pandas as pd +import numpy as np +import oncopipe as op +import glob +import os + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + +### Directories ### +# Setup module and store module-specific configuration in `CFG`. +CFG = op.setup_module( + name = "ichorcna", + version = "1.0", + subdirectories = ["inputs", "readDepth", "seg", "outputs"] +) + +localrules: + _ichorcna_input_bam, + _ichorcna_output, + _ichorcna_all + +# ---------------------------------------------------------------------------- # +##### RULES ##### +# ---------------------------------------------------------------------------- # + +### Set-up dependencies and packages ### +# Download github and all external files for ichorCNA: (needed since their extdata is not complete for all genome builds) +rule _install_ichorcna: + output: + complete = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success" + params: + outdir = CFG["dirs"]["inputs"] + "ichorCNA/" + conda: + CFG["conda_envs"]["ichorcna"] + shell: + op.as_one_line(""" + git clone git://github.com/broadinstitute/ichorCNA.git {params.outdir} && + touch {output.complete}""") + +# This defines the script/extdata directory used by ichorCNA in the subsequent rules: +ichorDir = CFG["dirs"]["inputs"] + "ichorCNA/inst/extdata/" + +# Symlinks the extdata appropriately +rule _setup_ichorcna_extdata: + input: + complete = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success" + params: + hg19_1Mb_rds = ichorDir + "HD_ULP_PoN_1Mb_median_normAutosome_mapScoreFiltered_median.rds", + hg19_500kb_rds = ichorDir + "HD_ULP_PoN_500kb_median_normAutosome_mapScoreFiltered_median.rds", + hg38_1Mb_rds = ichorDir + "HD_ULP_PoN_hg38_1Mb_median_normAutosome_median.rds", + hg38_500kb_rds = ichorDir + "HD_ULP_PoN_hg38_500kb_median_normAutosome_median.rds", + hg19_1000kb_gc = ichorDir + "gc_hg19_1000kb.wig", + hg19_500kb_gc = ichorDir + "gc_hg19_500kb.wig", + hg19_50kb_gc = ichorDir + "gc_hg19_50kb.wig", + hg19_10kb_gc = ichorDir + "gc_hg19_10kb.wig", + hg38_1000kb_gc = ichorDir + "gc_hg38_1000kb.wig", + hg38_500kb_gc = ichorDir + "gc_hg38_500kb.wig", + hg38_50kb_gc = ichorDir + "gc_hg38_50kb.wig", + hg38_10kb_gc = ichorDir + "gc_hg38_10kb.wig", + hg19_1000kb_map = ichorDir + "map_hg19_1000kb.wig", + hg19_500kb_map = ichorDir + "map_hg19_500kb.wig", + hg19_50kb_map = ichorDir + "map_hg19_50kb.wig", + hg19_10kb_map = ichorDir + "map_hg19_10kb.wig", + hg38_1000kb_map = ichorDir + "map_hg38_1000kb.wig", + hg38_500kb_map = ichorDir + "map_hg38_500kb.wig", + hg38_50kb_map = ichorDir + "map_hg38_50kb.wig", + hg38_10kb_map = ichorDir + "map_hg38_10kb.wig", + output: + hg19_1Mb_rds = ichorDir + "HD_ULP_PoN_hg19_1Mb_median_normAutosome_median.rds", + hg19_500kb_rds = ichorDir + "HD_ULP_PoN_hg19_500kb_median_normAutosome_median.rds", + grch37_1Mb_rds = ichorDir + "HD_ULP_PoN_grch37_1Mb_median_normAutosome_median.rds", + grch37_500kb_rds = ichorDir + "HD_ULP_PoN_grch37_500kb_normAutosome_median.rds", + hs37d5_1Mb_rds = ichorDir + "HD_ULP_PoN_hs37d5_1Mb_median_normAutosome_median.rds", + hs37d5_500kb_rds = ichorDir + "HD_ULP_PoN_hs37d5_500kb_normAutosome_median.rds", + grch38_1Mb_rds = ichorDir + "HD_ULP_PoN_grch38_1Mb_median_normAutosome_median.rds", + grch38_500kb_rds = ichorDir + "HD_ULP_PoN_grch38_500kb_median_normAutosome_median.rds", + grch37_1000kb_gc = ichorDir + "gc_grch37_1000kb.wig", + grch37_500kb_gc = ichorDir + "gc_grch37_500kb.wig", + grch37_50kb_gc = ichorDir + "gc_grch37_50kb.wig", + grch37_10kb_gc = ichorDir + "gc_grch37_10kb.wig", + hs37d5_1000kb_gc = ichorDir + "gc_hs37d5_1000kb.wig", + hs37d5_500kb_gc = ichorDir + "gc_hs37d5_500kb.wig", + hs37d5_50kb_gc = ichorDir + "gc_hs37d5_50kb.wig", + hs37d5_10kb_gc = ichorDir + "gc_hs37d5_10kb.wig", + grch38_1000kb_gc = ichorDir + "gc_grch38_1000kb.wig", + grch38_500kb_gc = ichorDir + "gc_grch38_500kb.wig", + grch38_50kb_gc = ichorDir + "gc_grch38_50kb.wig", + grch38_10kb_gc = ichorDir + "gc_grch38_10kb.wig", + grch37_1000kb_map = ichorDir + "map_grch37_1000kb.wig", + grch37_500kb_map = ichorDir + "map_grch37_500kb.wig", + grch37_50kb_map = ichorDir + "map_grch37_50kb.wig", + grch37_10kb_map = ichorDir + "map_grch37_10kb.wig", + hs37d5_1000kb_map = ichorDir + "map_hs37d5_1000kb.wig", + hs37d5_500kb_map = ichorDir + "map_hs37d5_500kb.wig", + hs37d5_50kb_map = ichorDir + "map_hs37d5_50kb.wig", + hs37d5_10kb_map = ichorDir + "map_hs37d5_10kb.wig", + grch38_1000kb_map = ichorDir + "map_grch38_1000kb.wig", + grch38_500kb_map = ichorDir + "map_grch38_500kb.wig", + grch38_50kb_map = ichorDir + "map_grch38_50kb.wig", + grch38_10kb_map = ichorDir + "map_grch38_10kb.wig", + complete = touch(ichorDir + "symlink.done") + run: + op.relative_symlink(params.hg19_1Mb_rds, output.hg19_1Mb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.hg19_500kb_rds) + op.relative_symlink(params.hg19_1Mb_rds, output.grch37_1Mb_rds) + op.relative_symlink(params.hg19_1Mb_rds, output.hs37d5_1Mb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.grch37_500kb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.hs37d5_500kb_rds) + op.relative_symlink(params.hg38_1Mb_rds, output.grch38_1Mb_rds) + op.relative_symlink(params.hg38_500kb_rds, output.grch38_500kb_rds) + op.relative_symlink(params.hg19_1000kb_gc, output.grch37_1000kb_gc) + op.relative_symlink(params.hg19_500kb_gc, output.grch37_500kb_gc) + op.relative_symlink(params.hg19_50kb_gc, output.grch37_50kb_gc) + op.relative_symlink(params.hg19_10kb_gc, output.grch37_10kb_gc) + op.relative_symlink(params.hg19_1000kb_gc, output.hs37d5_1000kb_gc) + op.relative_symlink(params.hg19_500kb_gc, output.hs37d5_500kb_gc) + op.relative_symlink(params.hg19_50kb_gc, output.hs37d5_50kb_gc) + op.relative_symlink(params.hg19_10kb_gc, output.hs37d5_10kb_gc) + op.relative_symlink(params.hg38_1000kb_gc, output.grch38_1000kb_gc) + op.relative_symlink(params.hg38_500kb_gc, output.grch38_500kb_gc) + op.relative_symlink(params.hg38_50kb_gc, output.grch38_50kb_gc) + op.relative_symlink(params.hg38_10kb_gc, output.grch38_10kb_gc) + op.relative_symlink(params.hg19_1000kb_map, output.grch37_1000kb_map) + op.relative_symlink(params.hg19_500kb_map, output.grch37_500kb_map) + op.relative_symlink(params.hg19_50kb_map, output.grch37_50kb_map) + op.relative_symlink(params.hg19_10kb_map, output.grch37_10kb_map) + op.relative_symlink(params.hg19_1000kb_map, output.hs37d5_1000kb_map) + op.relative_symlink(params.hg19_500kb_map, output.hs37d5_500kb_map) + op.relative_symlink(params.hg19_50kb_map, output.hs37d5_50kb_map) + op.relative_symlink(params.hg19_10kb_map, output.hs37d5_10kb_map) + op.relative_symlink(params.hg38_1000kb_map, output.grch38_1000kb_map) + op.relative_symlink(params.hg38_500kb_map, output.grch38_500kb_map) + op.relative_symlink(params.hg38_50kb_map, output.grch38_50kb_map) + op.relative_symlink(params.hg38_10kb_map, output.grch38_10kb_map) + +### Run ichorCNA ### +# Symlinks the input files into the module results directory (under '00-inputs/') +rule _ichorcna_input_bam: + input: + bam = CFG["inputs"]["sample_bam"], + bai = CFG["inputs"]["sample_bai"] + output: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai", + crai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.crai" + run: + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) + op.absolute_symlink(input.bai, output.crai) + + +# set-up for CRAM files (readCounter does not work with CRAM) +# deeptools to get .bw from .bam and .cram +rule _ichorcna_bamCoverage: + input: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai", + crai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.crai", + ichorcna_package = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success", + symlink_complete = ichorDir + "symlink.done" + output: + bw = CFG["dirs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/bw/{sample_id}.bin{binSize}.bw" + params: + binSize = CFG["options"]["deeptools"]["binSize"], + qual = CFG["options"]["deeptools"]["qual"], + excludeFlag = CFG["options"]["deeptools"]["flagExclude"], + opt = CFG["options"]["deeptools"]["opt"], + dirOut = CFG["dirs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/bw/" + conda: CFG["conda_envs"]["deeptools"] + threads: CFG["threads"]["deeptools"] + resources: + **CFG["resources"]["deeptools"] + log: + CFG["logs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/bw/{sample_id}.bin{binSize}.log" + shell: + """ + mkdir -p {params.dirOut}; + bamCoverage -b {input.bam} --binSize {params.binSize} --minMappingQuality {params.qual} --samFlagExclude {params.excludeFlag} {params.opt} -o {output.bw} -p {threads} + """ + + +# Converts bigWig to Wig +rule _ichorcna_bigwigToWig: + input: + bw = CFG["dirs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/bw/{sample_id}.bin{binSize}.bw" + output: + wig_int = temp(CFG["dirs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/wig/{sample_id}.bin{binSize}{chrom}.wig"), + conda: CFG["conda_envs"]["ucsc-bigwigtowig"] + threads: CFG["threads"]["ucsc"] + resources: + **CFG["resources"]["ucsc"] + wildcard_constraints: + chrom = ".+(?