diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7096c7da5..8e4525ad7 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -16,6 +16,8 @@ - [ ] Input and output files are being symlinked into the `CFG["inputs"]` and `CFG["outputs"]` subdirectories, respectively. +- [ ] I grouped the input symlinking rule to the next job that uses the input files. + - [ ] I updated the final target rule (`*_all`) to include every output rule. - [ ] I explained important module design decisions in `CHANGELOG.md`. @@ -48,4 +50,11 @@ ## Checklist for Updated Module -To be completed. +Important! If you are updating the module version, ensure the previous version of the module is restored from master. +If you want to restore a deleted file or directory from the remote master, you can use `git checkout origin/master path/to/file`, +then a `git commit` will ensure that file is tracked on your branch again. +Example: +``` +mv modules/strelka/1.1 modules/strelka/1.2 +git checkout origin/master modules/strelka/1.1 +``` diff --git a/demo/capture_Snakefile.smk b/demo/capture_Snakefile.smk new file mode 100755 index 000000000..cdc33a39c --- /dev/null +++ b/demo/capture_Snakefile.smk @@ -0,0 +1,79 @@ +#!/usr/bin/env snakemake + +''' +This Snakefile is made to run all the modules compatible with Capture workflow. +Compatibility of a workflow can be checked by referring to the pairing_config parameter present in a default.yaml file of that module. 
+''' +##### SETUP ##### + +import oncopipe as op + +# filter sample table to use only capture seq_type +SAMPLES = op.load_samples("data/samples.tsv") +CAPTURE = op.filter_samples(SAMPLES, seq_type = "capture") + + +##### REFERENCE_FILES WORKFLOW ##### + + +subworkflow reference_files: + workdir: + "reference/" + snakefile: + "../workflows/reference_files/2.4/reference_files.smk" + configfile: + "../workflows/reference_files/2.4/config/default.yaml" + + +##### CONFIGURATION FILES ##### + + +# Load module-specific configuration +configfile: "../modules/slms_3/1.0/config/default.yaml" +configfile: "../modules/picard_qc/1.0/config/default.yaml" +configfile: "../modules/bam2fastq/1.2/config/default.yaml" +configfile: "../modules/sequenza/1.4/config/default.yaml" +configfile: "../modules/bwa_mem/1.1/config/default.yaml" +configfile: "../modules/utils/2.1/config/default.yaml" +configfile: "../modules/liftover/1.2/config/default.yaml" +configfile: "../modules/battenberg/1.2/config/default.yaml" +configfile: "../modules/pathseq/1.0/config/default.yaml" + +# Load project-specific config, which includes the shared +# configuration and some module-specific config updates +configfile: "capture_config.yaml" + + +##### CONFIGURATION UPDATES ##### + + +# Use all samples as a default sample list for each module +config["lcr-modules"]["_shared"]["samples"] = CAPTURE + +##### MODULE SNAKEFILES ##### + + +# Load module-specific snakefiles +include: "../modules/slms_3/1.0/slms_3.smk" +include: "../modules/picard_qc/1.0/picard_qc.smk" +include: "../modules/bam2fastq/1.2/bam2fastq.smk" +include: "../modules/sequenza/1.4/sequenza.smk" +include: "../modules/bwa_mem/1.1/bwa_mem.smk" +include: "../modules/utils/2.1/utils.smk" +include: "../modules/liftover/1.2/liftover.smk" +include: "../modules/battenberg/1.2/battenberg.smk" +include: "../modules/pathseq/1.0/pathseq.smk" + + +##### TARGETS ###### + +rule all: + input: + rules._slms_3_all.input, + rules._picard_qc_all.input, + 
rules._bam2fastq_all.input, + rules._sequenza_all.input, + rules._bwa_mem_all.input, + rules._liftover_all.input, + rules._battenberg_all.input, + rules._pathseq_all.input diff --git a/demo/capture_config.yaml b/demo/capture_config.yaml new file mode 100755 index 000000000..b1db681ac --- /dev/null +++ b/demo/capture_config.yaml @@ -0,0 +1,73 @@ +lcr-modules: + _shared: + lcr-modules: "../" + lcr-scripts: "../../lcr-scripts/" + root_output_dir: "results/" + scratch_directory: "scratch/" + unmatched_normal_ids: + capture--grch37: "TCRBOA7-N-WEX" + + slms_3: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + + picard_qc: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + switches: + capture_intervals: + _default: "reference/exomes/grch37/interval/target_regions.nochr_intervals.txt" + # if 'capture_kit_id' is a column in samples.tsv and contain more than one kit_id, specify each kit using the values in the column. e.g. 
and add the corresponding bed file if needed + # S07604624: "reference/exomes/grch37/interval/S07604624_intervals.txt" + # : "reference/exomes/grch38/interval/_intervals.txt" + + bam2fastq: + inputs: + sample_bam: "data/{sample_id}.bam" + temp_outputs: True # fastq outputs will be temporary + + sequenza: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + scratch_subdirectories: [] + + bwa_mem: + inputs: + sample_fastq_1: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read1.fastq.gz" + sample_fastq_2: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read2.fastq.gz" + scratch_subdirectories: [] + + + liftover: + tool: "sequenza" + dirs: + _parent: "results/sequenza-1.4_liftover-1.2" + inputs: + sample_seg: "results/sequenza-1.4/99-outputs/filtered_seg/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--{pair_status}.igv.seg" + + utils: + inputs: + bed: + grch37: "data/exome_bed/hg19/target_regions.nochr.bed" # make sure this corresponds with config["lcr-modules"]["picard_qc"]["inputs"]["intervals"] + # if testing on GSC, use this file: "/projects/dscott_prj/CCSRI_1500/exomes/ref/agilent/hg19/target_regions.nochr.bed" + mem_mb: + bam_sort: 48000 + threads: + bam_sort: 12 + + battenberg: + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + sample_bam: "data/{sample_id}.bam" + + pathseq: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + + options: + min_read_length: 49 + ebv_cutoff: [0.00004, 0.00008] diff --git a/demo/config.yaml b/demo/config.yaml deleted file mode 100755 index 011540c24..000000000 --- a/demo/config.yaml +++ /dev/null @@ -1,162 +0,0 @@ -lcr-modules: - - _shared: - lcr-modules: "../" - lcr-scripts: "../../lcr-scripts/" - root_output_dir: "results/" - scratch_directory: "scratch/" - unmatched_normal_ids: - capture--grch37: "TCRBOA7-N-WEX" - - slms_3: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: 
"data/{sample_id}.bam.bai" - - - bam2fastq: - inputs: - sample_bam: "data/{sample_id}.bam" - temp_outputs: False # fastq outputs will be temporary - - star: - inputs: - sample_fastq_1: "results/bam2fastq-1.2/99-outputs/{seq_type}/{sample_id}.read1.fastq.gz" - sample_fastq_2: "results/bam2fastq-1.2/99-outputs/{seq_type}/{sample_id}.read2.fastq.gz" - scratch_subdirectories: [] - - manta: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - - mixcr: - inputs: - sample_fastq_1: "data/{sample_id}.read1.fastq.gz" - sample_fastq_2: "data/{sample_id}.read2.fastq.gz" - - vcf2maf: - dirs: - _parent: "results/sage-1.0_vcf2maf-1.2" - inputs: - vep_cache: "reference/vep_caches/" - sample_vcf_gz: "results/sage-1.0/99-outputs/combined/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.{base_name}.vcf.gz" - convert_coord: "{SCRIPTSDIR}/crossmap/1.0/convert_maf_coords.sh" - vcf_base_name: "sage.combined" - options: - vcf2maf: "--filter-vcf 0 --vcf-tumor-id {tumour_id} --vcf-normal-id {normal_id}" - species: "homo_sapiens" - conda_envs: - vcf2maf: "{MODSDIR}/envs/vcf2maf-1.6.18.yaml" - crossmap: "{SCRIPTSDIR}/crossmap/1.0/convert_maf_coords.yaml" - # here you can specify path to txt file with a list of custom ENST IDs that override canonical selection - # it will be parsed to --custom-enst flag of vcf2maf - # if no non-canonical transcript IDs to be included, leave switches empty - # This is just an example of how to include the list of custom IDs - switches: - custom_enst: - hg38: "" - grch37: "data/custom_enst.txt" - hs37d5: "" - resources: - vcf2maf: - mem_mb: 12000 - vcf: 1 - crossmap: - mem_mb: 12000 - - - salmon: - inputs: - sample_fastq_1: "data/{sample_id}.read1.fastq.gz" - sample_fastq_2: "data/{sample_id}.read2.fastq.gz" - transcriptome: - quant_to: "hg38" - - sequenza: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - scratch_subdirectories: [] - - lofreq: - inputs: - sample_bam: 
"data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - lofreq_filter: "{MODSDIR}/src/bash/lofreq_filter.sh" - switches: - # Intentionally running LoFreq without a BED file for simplicity - # And to avoid having to include a large BED file in the repo - regions_bed: - _default: "" - capture: "" - - gridss: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - references: - # See the current gridss module config file for details about where to obtain this file. - viral_fa: "/projects/rmorin/projects/DLBCL_DHITsig_genomes/reference/gridss/refgenomes/human_virus/human_virus.fa" - viral_bwa_prefix: "/projects/rmorin/projects/DLBCL_DHITsig_genomes/reference/gridss/refgenomes/human_virus/human_virus.fa" - pon_dir: "/projects/rmorin/reference/hmftools-references/gridss/pon" - - strelka: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - # if using manta output, use vcf file in the 99-outputs subdirectory and ensure manta version corresponds to the loaded module - candidate_small_indels: "results/manta-2.3/99-outputs/vcf/{seq_type}--{genome_build}/candidateSmallIndels/{tumour_id}--{normal_id}--{pair_status}.candidateSmallIndels.vcf" - - utils: - inputs: - bed: - grch37: "data/exome_bed/hg19/target_regions.nochr.bed" # make sure this corresponds with config["lcr-modules"]["picard_qc"]["inputs"]["intervals"] - # if testing on GSC, use this file: "/projects/dscott_prj/CCSRI_1500/exomes/ref/agilent/hg19/target_regions.nochr.bed" - mem_mb: - bam_sort: 48000 - threads: - bam_sort: 12 - - picard_qc: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - switches: - capture_intervals: - _default: "reference/exomes/grch37/interval/target_regions.nochr_intervals.txt" - # if 'capture_kit_id' is a column in samples.tsv and contain more than one kit_id, specify each kit using the values in the column. e.g. 
and add the corresponding bed file if needed - # S07604624: "reference/exomes/grch37/interval/S07604624_intervals.txt" - # : "reference/exomes/grch38/interval/_intervals.txt" - - bwa_mem: - inputs: - sample_fastq_1: "results/bam2fastq-1.2/99-outputs/{seq_type}/{sample_id}.read1.fastq.gz" - sample_fastq_2: "results/bam2fastq-1.2/99-outputs/{seq_type}/{sample_id}.read2.fastq.gz" - scratch_subdirectories: [] - - controlfreec: - inputs: - sample_bam: "data/{sample_id}.bam" - sample_bai: "data/{sample_id}.bam.bai" - scratch_subdirectories: [] # mpileup should be in scratch space - - starfish: - dirs: - _parent: "results/starfish-2.0_strelka-1.1_lofreq-1.0" - inputs: - names: ["strelka", "lofreq"] - paths: - [ - "results/strelka-1.1/99-outputs/vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.strelka.combined.vcf.gz", - "results/lofreq-1.0/99-outputs/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.lofreq.snvs.vcf.gz" - ] - - sage: - inputs: - # Available wildcards: {seq_type} {genome_build} {sample_id} - sample_bam: "data/{sample_id}.bam" - - # include here any additional flags to modify default parameters - options: - sage_run: "" diff --git a/demo/data/TCRBOA7-N-WGS.bam b/demo/data/TCRBOA7-N-WGS.bam new file mode 100644 index 000000000..e69de29bb diff --git a/demo/data/TCRBOA7-N-WGS.bam.bai b/demo/data/TCRBOA7-N-WGS.bam.bai new file mode 100644 index 000000000..e69de29bb diff --git a/demo/data/TCRBOA7-T-WGS.bam b/demo/data/TCRBOA7-T-WGS.bam new file mode 100644 index 000000000..e69de29bb diff --git a/demo/data/TCRBOA7-T-WGS.bam.bai b/demo/data/TCRBOA7-T-WGS.bam.bai new file mode 100644 index 000000000..e69de29bb diff --git a/demo/data/samples.tsv b/demo/data/samples.tsv index 1efd24dc1..158f6972b 100755 --- a/demo/data/samples.tsv +++ b/demo/data/samples.tsv @@ -2,3 +2,5 @@ sample_id seq_type patient_id tissue_status genome_build strand read_length TCRBOA7-N-WEX capture TCRBOA7 normal grch37 positive 100 TCRBOA7-T-WEX capture 
TCRBOA7 tumour grch37 positive 100 TCRBOA7-T-RNA mrna TCRBOA7 tumour grch37 positive 100 +TCRBOA7-N-WGS genome TCRBOA7 normal grch37 positive 100 +TCRBOA7-T-WGS genome TCRBOA7 tumour grch37 positive 100 diff --git a/demo/dry-run.sh b/demo/dry-run.sh index 326a267e7..eba908957 100755 --- a/demo/dry-run.sh +++ b/demo/dry-run.sh @@ -1,6 +1,18 @@ #!/bin/bash +# Launches a snakefile of your choice in dry run mode (for debugging) +# Usage: ./dry-run.sh <snakefile.smk> <target_rule> "<snakemake_flags>" +# Example: ./dry-run.sh example.smk example_all +# snakefile.smk: The snakefile you want to run +# target_rule: The name of one of the target rules specified in one of the included Snakefiles (defaults to `all`) +# snakemake_flags: One or more extra flags to pass to snakemake, specified inside quotation marks + + # Default to all targets -TARGETS=${@:-all} +snakefile=$1 +TARGETS=${2:-all} +snakemake_flags=$3 + +snakemake --dryrun --cores 24 $snakemake_flags -s $snakefile --printshellcmds --reason --use-conda $TARGETS + -snakemake --dryrun --cores 24 --printshellcmds --reason --use-conda $TARGETS diff --git a/demo/Snakefile b/demo/genome_Snakefile.smk similarity index 54% rename from demo/Snakefile rename to demo/genome_Snakefile.smk index 6f1872750..13928f35d 100755 --- a/demo/Snakefile +++ b/demo/genome_Snakefile.smk @@ -1,12 +1,15 @@ #!/usr/bin/env snakemake - +''' +This Snakefile is made to run all the modules compatible with Genome workflow. +Compatibility of a workflow can be checked by referring to the pairing_config parameter present in a default.yaml file of that module. 
+''' ##### SETUP ##### import oncopipe as op SAMPLES = op.load_samples("data/samples.tsv") -CAPTURE = op.filter_samples(SAMPLES, seq_type = "capture") +GENOME = op.filter_samples(SAMPLES, seq_type = "genome") ##### REFERENCE_FILES WORKFLOW ##### @@ -24,58 +27,49 @@ subworkflow reference_files: ##### CONFIGURATION FILES ##### + # Load module-specific configuration -configfile: "../modules/utils/2.1/config/default.yaml" + configfile: "../modules/picard_qc/1.0/config/default.yaml" -configfile: "../modules/salmon/1.1/config/default.yaml" configfile: "../modules/bam2fastq/1.2/config/default.yaml" -configfile: "../modules/star/1.4/config/default.yaml" -configfile: "../modules/manta/2.3/config/default.yaml" -configfile: "../modules/gridss/1.1/config/default.yaml" -configfile: "../modules/vcf2maf/1.2/config/default.yaml" configfile: "../modules/sequenza/1.4/config/default.yaml" -configfile: "../modules/strelka/1.1/config/default.yaml" configfile: "../modules/bwa_mem/1.1/config/default.yaml" -configfile: "../modules/controlfreec/1.1/config/default.yaml" -configfile: "../modules/lofreq/1.0/config/default.yaml" -configfile: "../modules/starfish/2.0/config/default.yaml" -configfile: "../modules/sage/1.0/config/default.yaml" +configfile: "../modules/controlfreec/1.2/config/default.yaml" configfile: "../modules/slms_3/1.0/config/default.yaml" +configfile: "../modules/gridss/1.1/config/default.yaml" +configfile: "../modules/liftover/1.2/config/default.yaml" +configfile: "../modules/battenberg/1.2/config/default.yaml" +configfile: "../modules/pathseq/1.0/config/default.yaml" +configfile: "../modules/utils/2.1/config/default.yaml" # Load project-specific config, which includes the shared # configuration and some module-specific config updates -configfile: "config.yaml" +configfile: "genome_config.yaml" ##### CONFIGURATION UPDATES ##### # Use all samples as a default sample list for each module -config["lcr-modules"]["_shared"]["samples"] = SAMPLES 
-config["lcr-modules"]["starfish"]["samples"] = CAPTURE +config["lcr-modules"]["_shared"]["samples"] = GENOME ##### MODULE SNAKEFILES ##### # Load module-specific snakefiles -include: "../modules/slms_3/1.0/slms_3.smk" -include: "../modules/utils/2.1/utils.smk" include: "../modules/picard_qc/1.0/picard_qc.smk" -include: "../modules/salmon/1.1/salmon.smk" -include: "../modules/star/1.4/star.smk" -include: "../modules/manta/2.3/manta.smk" -include: "../modules/vcf2maf/1.2/vcf2maf.smk" +include: "../modules/bam2fastq/1.2/bam2fastq.smk" include: "../modules/sequenza/1.4/sequenza.smk" -include: "../modules/strelka/1.1/strelka.smk" include: "../modules/bwa_mem/1.1/bwa_mem.smk" +include: "../modules/controlfreec/1.2/controlfreec.smk" +include: "../modules/slms_3/1.0/slms_3.smk" include: "../modules/gridss/1.1/gridss.smk" -include: "../modules/bam2fastq/1.2/bam2fastq.smk" -include: "../modules/controlfreec/1.1/controlfreec.smk" -include: "../modules/lofreq/1.0/lofreq.smk" -include: "../modules/starfish/2.0/starfish.smk" -include: "../modules/sage/1.0/sage.smk" +include: "../modules/liftover/1.2/liftover.smk" +include: "../modules/battenberg/1.2/battenberg.smk" +include: "../modules/pathseq/1.0/pathseq.smk" +include: "../modules/utils/2.1/utils.smk" ##### TARGETS ###### @@ -83,17 +77,12 @@ include: "../modules/sage/1.0/sage.smk" rule all: input: rules._picard_qc_all.input, - rules._salmon_all.input, rules._bam2fastq_all.input, - rules._star_all.input, - rules._manta_all.input, rules._sequenza_all.input, - rules._lofreq_all.input, - rules._strelka_all.input, rules._bwa_mem_all.input, - rules._gridss_all.input, rules._controlfreec_all.input, - rules._starfish_all.input, - rules._vcf2maf_all.input, - rules._sage_all.input, - rules._slms_3_all.input + rules._slms_3_all.input, + rules._gridss_all.input, + rules._liftover_all.input, + rules._battenberg_all.input, + rules._pathseq_all.input diff --git a/demo/genome_config.yaml b/demo/genome_config.yaml new file mode 100644 index 
000000000..a92c9c78f --- /dev/null +++ b/demo/genome_config.yaml @@ -0,0 +1,90 @@ +lcr-modules: + _shared: + lcr-modules: "../" + lcr-scripts: "../../lcr-scripts/" + root_output_dir: "results/" + scratch_directory: "scratch/" + unmatched_normal_ids: + genome--grch37: "TCRBOA7-N-WGS" + + picard_qc: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + switches: + capture_intervals: + _default: "reference/exomes/grch37/interval/target_regions.nochr_intervals.txt" + # if 'capture_kit_id' is a column in samples.tsv and contain more than one kit_id, specify each kit using the values in the column. e.g. and add the corresponding bed file if needed + # S07604624: "reference/exomes/grch37/interval/S07604624_intervals.txt" + # : "reference/exomes/grch38/interval/_intervals.txt" + + bam2fastq: + inputs: + sample_bam: "data/{sample_id}.bam" + temp_outputs: True # fastq outputs will be temporary + + sequenza: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + scratch_subdirectories: [] + + bwa_mem: + inputs: + sample_fastq_1: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read1.fastq.gz" + sample_fastq_2: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read2.fastq.gz" + scratch_subdirectories: [] + + controlfreec: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + scratch_subdirectories: [] # mpileup should be in scratch space + + slms_3: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + + gridss: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + + mixcr: + inputs: + sample_fastq_1: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read1.fastq.gz" + sample_fastq_2: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read2.fastq.gz" + # Path to the directory where MIXCR should be installed + mixcr_exec: "data" + + liftover: + tool: "sequenza" + dirs: + _parent: 
"results/sequenza-1.4_liftover-1.2" + inputs: + sample_seg: "results/sequenza-1.4/99-outputs/filtered_seg/{seq_type}--{genome_build}/{tumour_sample_id}--{normal_sample_id}--matched.igv.seg" + + battenberg: + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + sample_bam: "data/{sample_id}.bam" + + pathseq: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + + options: + min_read_length: 49 + ebv_cutoff: [0.00004, 0.00008] + + utils: + inputs: + bed: + grch37: "data/exome_bed/hg19/target_regions.nochr.bed" # make sure this corresponds with config["lcr-modules"]["picard_qc"]["inputs"]["intervals"] + # if testing on GSC, use this file: "/projects/dscott_prj/CCSRI_1500/exomes/ref/agilent/hg19/target_regions.nochr.bed" + mem_mb: + bam_sort: 48000 + threads: + bam_sort: 12 diff --git a/demo/mrna_Snakefile.smk b/demo/mrna_Snakefile.smk new file mode 100755 index 000000000..096adbf2c --- /dev/null +++ b/demo/mrna_Snakefile.smk @@ -0,0 +1,80 @@ +#!/usr/bin/env snakemake + +''' +This Snakefile is made to run all the modules compatible with Mrna workflow. +Compatibility of a workflow can be checked by referring to the pairing_config parameter present in a default.yaml file of that module. 
+''' +##### SETUP ##### + +import oncopipe as op + +SAMPLES = op.load_samples("data/samples.tsv") +MRNA = op.filter_samples(SAMPLES, seq_type = "mrna") + + +##### REFERENCE_FILES WORKFLOW ##### + + +subworkflow reference_files: + workdir: + "reference/" + snakefile: + "../workflows/reference_files/2.4/reference_files.smk" + configfile: + "../workflows/reference_files/2.4/config/default.yaml" + + +##### CONFIGURATION FILES ##### + + +# Load module-specific configuration + +configfile: "../modules/picard_qc/1.0/config/default.yaml" +configfile: "../modules/salmon/1.1/config/default.yaml" +configfile: "../modules/star/1.4/config/default.yaml" +configfile: "../modules/bam2fastq/1.2/config/default.yaml" +configfile: "../modules/manta/2.3/config/default.yaml" +configfile: "../modules/mixcr/1.1/config/default.yaml" +configfile: "../modules/pathseq/1.0/config/default.yaml" +configfile: "../modules/stringtie/1.0/config/default.yaml" +configfile: "../modules/utils/2.1/config/default.yaml" + + +# Load project-specific config, which includes the shared +# configuration and some module-specific config updates +configfile: "mrna_config.yaml" + + +##### CONFIGURATION UPDATES ##### + + +# Use all samples as a default sample list for each module +config["lcr-modules"]["_shared"]["samples"] = MRNA + +##### MODULE SNAKEFILES ##### + + +# Load module-specific snakefiles + +include: "../modules/picard_qc/1.0/picard_qc.smk" +include: "../modules/salmon/1.1/salmon.smk" +include: "../modules/star/1.4/star.smk" +include: "../modules/bam2fastq/1.2/bam2fastq.smk" +include: "../modules/manta/2.3/manta.smk" +include: "../modules/mixcr/1.1/mixcr.smk" +include: "../modules/pathseq/1.0/pathseq.smk" +include: "../modules/stringtie/1.0/stringtie.smk" +include: "../modules/utils/2.1/utils.smk" + +##### TARGETS ###### + +rule all: + input: + rules._picard_qc_all.input, + rules._salmon_all.input, + rules._star_all.input, + rules._bam2fastq_all.input, + rules._manta_all.input, + rules._mixcr_all.input, 
+ rules._pathseq_all.input, + rules._stringtie_all.input diff --git a/demo/mrna_config.yaml b/demo/mrna_config.yaml new file mode 100644 index 000000000..29f459ff3 --- /dev/null +++ b/demo/mrna_config.yaml @@ -0,0 +1,85 @@ +lcr-modules: + _shared: + lcr-modules: "../" + lcr-scripts: "../../lcr-scripts/" + root_output_dir: "results/" + scratch_directory: "scratch/" + + picard_qc: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + switches: + capture_intervals: + _default: "reference/exomes/grch37/interval/target_regions.nochr_intervals.txt" + # if 'capture_kit_id' is a column in samples.tsv and contain more than one kit_id, specify each kit using the values in the column. e.g. and add the corresponding bed file if needed + # S07604624: "reference/exomes/grch37/interval/S07604624_intervals.txt" + # : "reference/exomes/grch38/interval/_intervals.txt" + + salmon: + inputs: + sample_fastq_1: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read1.fastq.gz" + sample_fastq_2: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read2.fastq.gz" + transcriptome: + quant_to: "hg38" + + star: + inputs: + sample_fastq_1: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read1.fastq.gz" + sample_fastq_2: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read2.fastq.gz" + scratch_subdirectories: [] + + bam2fastq: + inputs: + sample_bam: "data/{sample_id}.bam" + temp_outputs: True # fastq outputs will be temporary + + manta: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + + mixcr: + inputs: + sample_fastq_1: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read1.fastq.gz" + sample_fastq_2: "results/bam2fastq-1.2/01-fastq/{seq_type}/{sample_id}.read2.fastq.gz" + # Path to the directory where MIXCR should be installed + mixcr_exec: "data" + + pathseq: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + options: + min_read_length: 49 + 
ebv_cutoff: [0.00004, 0.00008] + + stringtie: + inputs: + sample_bam: "data/{sample_id}.bam" + sample_bai: "data/{sample_id}.bam.bai" + XS_script: "{MODSDIR}/src/tagXSstrandedData.awk" + options: + stringtie_run: "" + conda_envs: + stringtie: "{MODSDIR}/envs/stringtie.yaml" + threads: + stringtie_run: 16 + resources: + stringtie_run: + mem_mb: 15000 + pairing_config: + mrna: + run_paired_tumours: False + run_unpaired_tumours_with: "no_normal" + run_paired_tumours_as_unpaired: True + + utils: + inputs: + bed: + grch37: "data/exome_bed/hg19/target_regions.nochr.bed" # make sure this corresponds with config["lcr-modules"]["picard_qc"]["inputs"]["intervals"] + # if testing on GSC, use this file: "/projects/dscott_prj/CCSRI_1500/exomes/ref/agilent/hg19/target_regions.nochr.bed" + mem_mb: + bam_sort: 48000 + threads: + bam_sort: 12 diff --git a/demo/run.sh b/demo/run.sh index aeda0d1eb..8e711e965 100755 --- a/demo/run.sh +++ b/demo/run.sh @@ -1,7 +1,18 @@ #!/bin/bash + +# Launches a snakefile of your choice (actual run, not a dry run; see dry-run.sh for debugging) +# Usage: ./run.sh <snakefile.smk> <target_rule> "<snakemake_flags>" +# Example: ./run.sh example.smk example_all +# snakefile.smk: The snakefile you want to run +# target_rule: The name of one of the target rules specified in one of the included Snakefiles (defaults to `all`) +# snakemake_flags: One or more extra flags to pass to snakemake, specified inside quotation marks + + # Default to all targets -TARGETS=${@:-all} +snakefile=$1 +TARGETS=${2:-all} +snakemake_flags=$3 # Determine the number of available cores for parallelization NUM_CORES=$(grep -c '^processor' /proc/cpuinfo) @@ -17,4 +28,6 @@ if (( $CORES_AVAILABLE <= 0 )); then echo "Check out top/htop to see what other jobs are currently running." 
exit 1 fi -nice -n 10 snakemake --cores "${CORES_AVAILABLE}" --keep-going --latency-wait 120 --use-conda "$TARGETS" +nice -n 10 snakemake --cores "${CORES_AVAILABLE}" $snakemake_flags -s $snakefile --keep-going --latency-wait 120 --use-conda $TARGETS + + diff --git a/docs/source/for_developers.rst b/docs/source/for_developers.rst index 71a17ffe8..dfe1a86fb 100644 --- a/docs/source/for_developers.rst +++ b/docs/source/for_developers.rst @@ -21,7 +21,7 @@ Getting Started # conda create -n lcr-modules "python>=3.6" # conda activate lcr-modules - conda install cookiecutter git + conda install -c conda-forge cookiecutter 4. Clone the `lcr-modules repository`_ and the `lcr-scripts repository`_. diff --git a/envs/gridss/gridss-2.12.0.yaml b/envs/gridss/gridss-2.12.0.yaml new file mode 100644 index 000000000..11a3f1c90 --- /dev/null +++ b/envs/gridss/gridss-2.12.0.yaml @@ -0,0 +1,341 @@ +name: gridss-2.12.0 +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - alsa-lib=1.2.3 + - bcftools=1.12 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bioconductor-annotationdbi=1.52.0 + - bioconductor-biobase=2.50.0 + - bioconductor-biocfilecache=1.14.0 + - bioconductor-biocgenerics=0.36.0 + - bioconductor-biocparallel=1.24.1 + - bioconductor-biomart=2.46.3 + - bioconductor-biostrings=2.58.0 + - bioconductor-bsgenome=1.58.0 + - bioconductor-delayedarray=0.16.3 + - bioconductor-genomeinfodb=1.26.4 + - bioconductor-genomeinfodbdata=1.2.4 + - bioconductor-genomicalignments=1.26.0 + - bioconductor-genomicfeatures=1.42.2 + - bioconductor-genomicranges=1.42.0 + - bioconductor-iranges=2.24.1 + - bioconductor-matrixgenerics=1.2.1 + - bioconductor-rhtslib=1.22.0 + - bioconductor-rsamtools=2.6.0 + - bioconductor-rtracklayer=1.50.0 + - bioconductor-s4vectors=0.28.1 + - bioconductor-structuralvariantannotation=1.6.0 + - bioconductor-summarizedexperiment=1.20.0 + - 
bioconductor-variantannotation=1.36.0 + - bioconductor-xvector=0.30.0 + - bioconductor-zlibbioc=1.36.0 + - blast=2.11.0 + - bwa=0.7.17 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.7.5 + - cached-property=1.5.2 + - cached_property=1.5.2 + - cairo=1.16.0 + - curl=7.76.0 + - entrez-direct=13.9 + - expat=2.3.0 + - fontconfig=2.13.1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - giflib=5.2.1 + - gmp=6.1.2 + - gnutls=3.5.19 + - graphite2=1.3.14 + - gridss=2.12.0 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - h5py=3.3.0 + - harfbuzz=2.8.0 + - hdf5=1.10.6 + - hmmer=3.3.2 + - htslib=1.12 + - icu=68.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - kraken2=2.1.1 + - krb5=1.17.2 + - lcms2=2.12 + - ld_impl_linux-64=2.35.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.76.0 + - libdeflate=1.7 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=11.1.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.0 + - libgomp=11.1.0 + - libiconv=1.16 + - libidn2=2.3.0 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.12 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.2.0 + - libunistring=0.9.10 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.10 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - nettle=3.3 + - numpy=1.21.2 + - openjdk=11.0.9.1 + - openssl=1.1.1k + - pandoc=2.12 + - pango=1.48.4 + - pcre=8.44 + - pcre2=10.36 + - perl=5.26.2 + - perl-app-cpanminus=1.7044 + - perl-archive-tar=2.32 + - perl-base=2.23 + - perl-business-isbn=3.004 + - perl-business-isbn-data=20140910.003 + - perl-carp=1.38 + - perl-common-sense=3.74 + - perl-compress-raw-bzip2=2.087 + - perl-compress-raw-zlib=2.087 + - perl-constant=1.33 + - perl-data-dumper=2.173 + - 
perl-digest-hmac=1.03 + - perl-digest-md5=2.55 + - perl-encode=2.88 + - perl-encode-locale=1.05 + - perl-exporter=5.72 + - perl-exporter-tiny=1.002001 + - perl-extutils-makemaker=7.36 + - perl-file-listing=6.04 + - perl-file-path=2.16 + - perl-file-temp=0.2304 + - perl-html-parser=3.72 + - perl-html-tagset=3.20 + - perl-html-tree=5.07 + - perl-http-cookies=6.04 + - perl-http-daemon=6.01 + - perl-http-date=6.02 + - perl-http-message=6.18 + - perl-http-negotiate=6.01 + - perl-io-compress=2.087 + - perl-io-html=1.001 + - perl-io-socket-ssl=2.066 + - perl-io-zlib=1.10 + - perl-json=4.02 + - perl-json-xs=2.34 + - perl-libwww-perl=6.39 + - perl-list-moreutils=0.428 + - perl-list-moreutils-xs=0.428 + - perl-lwp-mediatypes=6.04 + - perl-lwp-protocol-https=6.07 + - perl-mime-base64=3.15 + - perl-mozilla-ca=20180117 + - perl-net-http=6.19 + - perl-net-ssleay=1.88 + - perl-ntlm=1.09 + - perl-parent=0.236 + - perl-pathtools=3.75 + - perl-scalar-list-utils=1.52 + - perl-socket=2.027 + - perl-storable=3.15 + - perl-test-requiresinternet=0.05 + - perl-time-local=1.28 + - perl-try-tiny=0.30 + - perl-types-serialiser=1.0 + - perl-uri=1.76 + - perl-www-robotrules=6.02 + - perl-xml-namespacesupport=1.12 + - perl-xml-parser=2.44_01 + - perl-xml-sax=1.02 + - perl-xml-sax-base=1.09 + - perl-xml-sax-expat=0.51 + - perl-xml-simple=2.25 + - perl-xsloader=0.24 + - pip=21.2.4 + - pixman=0.40.0 + - popt=1.16 + - python=3.9.6 + - python_abi=3.9 + - r-argparser=0.7.1 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.2.1 + - r-base=4.0.3 + - r-base64enc=0.1_3 + - r-bh=1.75.0_0 + - r-bit=4.0.4 + - r-bit64=4.0.5 + - r-bitops=1.0_6 + - r-blob=1.2.1 + - r-brio=1.1.1 + - r-broom=0.7.6 + - r-cachem=1.0.4 + - r-callr=3.6.0 + - r-cellranger=1.1.0 + - r-cli=2.4.0 + - r-clipr=0.7.1 + - r-colorspace=2.0_0 + - r-cpp11=0.2.7 + - r-crayon=1.4.1 + - r-curl=4.3 + - r-dbi=1.1.1 + - r-dbplyr=2.1.1 + - r-desc=1.3.0 + - r-diffobj=0.3.3 + - r-digest=0.6.27 + - r-dplyr=1.0.5 + - r-ellipsis=0.3.1 + - 
r-evaluate=0.14 + - r-fansi=0.4.2 + - r-farver=2.1.0 + - r-fastmap=1.1.0 + - r-forcats=0.5.1 + - r-formatr=1.8 + - r-fs=1.5.0 + - r-futile.logger=1.4.3 + - r-futile.options=1.0.1 + - r-generics=0.1.0 + - r-ggplot2=3.3.3 + - r-glue=1.4.2 + - r-gtable=0.3.0 + - r-haven=2.3.1 + - r-highr=0.8 + - r-hms=1.0.0 + - r-htmltools=0.5.1.1 + - r-httr=1.4.2 + - r-isoband=0.2.4 + - r-jsonlite=1.7.2 + - r-knitr=1.31 + - r-labeling=0.4.2 + - r-lambda.r=1.2.4 + - r-lattice=0.20_41 + - r-lifecycle=1.0.0 + - r-lubridate=1.7.10 + - r-magrittr=2.0.1 + - r-markdown=1.1 + - r-mass=7.3_53.1 + - r-matrix=1.3_2 + - r-matrixstats=0.58.0 + - r-memoise=2.0.0 + - r-mgcv=1.8_34 + - r-mime=0.10 + - r-modelr=0.1.8 + - r-munsell=0.5.0 + - r-nlme=3.1_152 + - r-openssl=1.4.3 + - r-pillar=1.5.1 + - r-pkgconfig=2.0.3 + - r-pkgload=1.2.1 + - r-plogr=0.2.0 + - r-plyr=1.8.6 + - r-praise=1.0.0 + - r-prettyunits=1.1.1 + - r-processx=3.5.1 + - r-progress=1.2.2 + - r-ps=1.6.0 + - r-purrr=0.3.4 + - r-r6=2.5.0 + - r-rappdirs=0.3.3 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.6 + - r-rcurl=1.98_1.3 + - r-readr=1.4.0 + - r-readxl=1.3.1 + - r-rematch=1.0.1 + - r-rematch2=2.1.2 + - r-reprex=1.0.0 + - r-reshape2=1.4.4 + - r-rlang=0.4.10 + - r-rmarkdown=2.7 + - r-rprojroot=2.0.2 + - r-rsqlite=2.2.5 + - r-rstudioapi=0.13 + - r-rvest=1.0.0 + - r-scales=1.1.1 + - r-selectr=0.4_2 + - r-snow=0.4_3 + - r-stringdist=0.9.6.3 + - r-stringi=1.5.3 + - r-stringr=1.4.0 + - r-sys=3.4 + - r-testthat=3.0.2 + - r-tibble=3.1.0 + - r-tidyr=1.1.3 + - r-tidyselect=1.1.0 + - r-tidyverse=1.3.0 + - r-tinytex=0.31 + - r-utf8=1.2.1 + - r-vctrs=0.3.7 + - r-viridislite=0.3.0 + - r-waldo=0.2.5 + - r-withr=2.4.1 + - r-xfun=0.20 + - r-xml=3.99_0.6 + - r-xml2=1.3.2 + - r-yaml=2.2.1 + - r-zeallot=0.1.0 + - readline=8.1 + - repeatmasker=4.1.2.p1 + - rmblast=2.10.0 + - rsync=3.2.3 + - samtools=1.12 + - sed=4.8 + - setuptools=57.4.0 + - sqlite=3.36.0 + - sysroot_linux-64=2.12 + - tar=1.34 + - tk=8.6.10 + - tktable=2.10 + - trf=4.09.1 + - tzdata=2021a + - 
wget=1.20.1 + - wheel=0.37.0 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.6.12 + - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xxhash=0.8.0 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.4.9 +prefix: /home/lhilton/miniconda3/envs/gridss-2.11.1 diff --git a/envs/hmftools/hmftools-amber-3.5.yaml b/envs/hmftools/hmftools-amber-3.5.yaml new file mode 100644 index 000000000..1d76d1931 --- /dev/null +++ b/envs/hmftools/hmftools-amber-3.5.yaml @@ -0,0 +1,119 @@ +name: hmftools-amber-3.5 +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - alsa-lib=1.2.3 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bioconductor-biocgenerics=0.36.0 + - bioconductor-copynumber=1.30.0 + - bioconductor-genomeinfodb=1.26.4 + - bioconductor-genomeinfodbdata=1.2.4 + - bioconductor-genomicranges=1.42.0 + - bioconductor-iranges=2.24.1 + - bioconductor-s4vectors=0.28.1 + - bioconductor-xvector=0.30.0 + - bioconductor-zlibbioc=1.36.0 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.4.13 + - cairo=1.16.0 + - curl=7.76.1 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - giflib=5.2.1 + - graphite2=1.3.14 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.1 + - hmftools-amber=3.5 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 
+ - krb5=1.19.1 + - lcms2=2.12 + - ld_impl_linux-64=2.35.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.76.1 + - libdeflate=1.7 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.2 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.15 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openjdk=11.0.9.1 + - openssl=1.1.1k + - pango=1.48.5 + - pcre=8.44 + - pcre2=10.36 + - pixman=0.40.0 + - r-base=4.0.5 + - r-bitops=1.0_7 + - r-rcurl=1.98_1.3 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.1 + - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/hmftools-amber-3.5 diff --git a/envs/hmftools/hmftools-cobalt-1.11.yaml b/envs/hmftools/hmftools-cobalt-1.11.yaml new file mode 100644 index 000000000..df0964fa4 --- /dev/null +++ b/envs/hmftools/hmftools-cobalt-1.11.yaml @@ -0,0 +1,119 @@ +name: hmftools-cobalt-1.11 +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - alsa-lib=1.2.3 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bioconductor-biocgenerics=0.36.0 + - bioconductor-copynumber=1.30.0 + - bioconductor-genomeinfodb=1.26.4 + - 
bioconductor-genomeinfodbdata=1.2.4 + - bioconductor-genomicranges=1.42.0 + - bioconductor-iranges=2.24.1 + - bioconductor-s4vectors=0.28.1 + - bioconductor-xvector=0.30.0 + - bioconductor-zlibbioc=1.36.0 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.4.13 + - cairo=1.16.0 + - curl=7.76.1 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - giflib=5.2.1 + - graphite2=1.3.14 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.1 + - hmftools-cobalt=1.11 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.19.1 + - lcms2=2.12 + - ld_impl_linux-64=2.35.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.76.1 + - libdeflate=1.7 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.2 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.15 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openjdk=11.0.9.1 + - openssl=1.1.1k + - pango=1.48.5 + - pcre=8.44 + - pcre2=10.36 + - pixman=0.40.0 + - r-base=4.0.5 + - r-bitops=1.0_7 + - r-rcurl=1.98_1.3 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.1 + - xorg-libxext=1.3.4 + - 
xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/hmftools-cobalt-1.11 diff --git a/envs/hmftools/hmftools-gripss-1.11.yaml b/envs/hmftools/hmftools-gripss-1.11.yaml new file mode 100644 index 000000000..7d37fcaf1 --- /dev/null +++ b/envs/hmftools/hmftools-gripss-1.11.yaml @@ -0,0 +1,58 @@ +name: gripss-1.11 +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - alsa-lib=1.2.3 + - cairo=1.16.0 + - fontconfig=2.13.1 + - freetype=2.10.4 + - gettext=0.21.0 + - giflib=5.2.1 + - graphite2=1.3.14 + - harfbuzz=2.9.0 + - hmftools-gripss=1.11 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - lcms2=2.12 + - lerc=2.2.1 + - libdeflate=1.7 + - libffi=3.3 + - libgcc-ng=11.1.0 + - libglib=2.68.4 + - libgomp=11.1.0 + - libiconv=1.16 + - libpng=1.6.37 + - libstdcxx-ng=11.1.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.1 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - openjdk=11.0.9.1 + - pcre=8.45 + - pixman=0.40.0 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.2 + - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 + - xorg-libxrender=0.9.10 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/gripss-1.11 diff --git a/envs/hmftools/hmftools-linx-1.15.yaml b/envs/hmftools/hmftools-linx-1.15.yaml new file mode 100644 index 000000000..18ca525eb --- /dev/null +++ b/envs/hmftools/hmftools-linx-1.15.yaml @@ -0,0 +1,303 @@ +name: hmftools-linx-1.15 +channels: + - conda-forge + - bioconda + - defaults 
+dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - alsa-lib=1.2.3 + - atk-1.0=2.36.0 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bioconductor-annotationdbi=1.52.0 + - bioconductor-annotationfilter=1.14.0 + - bioconductor-biobase=2.50.0 + - bioconductor-biocfilecache=1.14.0 + - bioconductor-biocgenerics=0.36.0 + - bioconductor-biocparallel=1.24.1 + - bioconductor-biomart=2.46.3 + - bioconductor-biostrings=2.58.0 + - bioconductor-biovizbase=1.38.0 + - bioconductor-bsgenome=1.58.0 + - bioconductor-delayedarray=0.16.3 + - bioconductor-ensembldb=2.14.0 + - bioconductor-genomeinfodb=1.26.4 + - bioconductor-genomeinfodbdata=1.2.4 + - bioconductor-genomicalignments=1.26.0 + - bioconductor-genomicfeatures=1.42.2 + - bioconductor-genomicranges=1.42.0 + - bioconductor-gviz=1.34.1 + - bioconductor-iranges=2.24.1 + - bioconductor-matrixgenerics=1.2.1 + - bioconductor-protgenerics=1.22.0 + - bioconductor-rhtslib=1.22.0 + - bioconductor-rsamtools=2.6.0 + - bioconductor-rtracklayer=1.50.0 + - bioconductor-s4vectors=0.28.1 + - bioconductor-summarizedexperiment=1.20.0 + - bioconductor-variantannotation=1.36.0 + - bioconductor-xvector=0.30.0 + - bioconductor-zlibbioc=1.36.0 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.4.13 + - cairo=1.16.0 + - circos=0.69.6 + - curl=7.76.1 + - expat=2.3.0 + - fftw=3.3.9 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gdk-pixbuf=2.42.6 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - ghostscript=9.54.0 + - giflib=5.2.1 + - graphite2=1.3.14 + - graphviz=2.47.1 + - gsl=2.6 + - gtk2=2.24.33 + - gts=0.7.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.1 + - hmftools-linx=1.15 
+ - icu=68.1 + - imagemagick=7.0.11_13 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.19.1 + - lcms2=2.12 + - ld_impl_linux-64=2.35.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.76.1 + - libdeflate=1.7 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgd=2.3.2 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.2 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.15 + - libpng=1.6.37 + - librsvg=2.50.5 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.3.0 + - libtool=2.4.6 + - libuuid=2.32.1 + - libwebp=1.2.0 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openjdk=11.0.9.1 + - openjpeg=2.4.0 + - openssl=1.1.1k + - pango=1.48.5 + - pcre=8.44 + - pcre2=10.36 + - perl=5.26.2 + - perl-autoloader=5.74 + - perl-carp=1.38 + - perl-clone=0.42 + - perl-config-general=2.63 + - perl-digest-perl-md5=1.9 + - perl-dynaloader=1.25 + - perl-exporter=5.72 + - perl-exporter-tiny=1.002001 + - perl-extutils-makemaker=7.36 + - perl-font-ttf=1.06 + - perl-gd=2.68 + - perl-io-string=1.08 + - perl-list-moreutils=0.428 + - perl-list-moreutils-xs=0.428 + - perl-math-bezier=0.01 + - perl-math-round=0.07 + - perl-math-vecstat=0.08 + - perl-module-implementation=0.09 + - perl-module-runtime=0.016 + - perl-number-format=1.75 + - perl-params-validate=1.29 + - perl-pathtools=3.75 + - perl-readonly=2.05 + - perl-regexp-common=2017060201 + - perl-scalar-list-utils=1.52 + - perl-set-intspan=1.19 + - perl-statistics-basic=1.6611 + - perl-svg=2.84 + - perl-text-format=0.59 + - perl-time-hires=1.9760 + - perl-try-tiny=0.30 + - perl-xml-parser=2.44_01 + - perl-xsloader=0.24 + - pixman=0.40.0 + - pkg-config=0.29.2 + - r-acepack=1.4.1 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.2.1 + - r-base=4.0.5 + - r-base64enc=0.1_3 + - 
r-bh=1.75.0_0 + - r-bit=4.0.4 + - r-bit64=4.0.5 + - r-bitops=1.0_7 + - r-blob=1.2.1 + - r-brio=1.1.2 + - r-cachem=1.0.5 + - r-callr=3.7.0 + - r-checkmate=2.0.0 + - r-cli=2.5.0 + - r-cluster=2.1.2 + - r-colorspace=2.0_1 + - r-cowplot=1.1.1 + - r-crayon=1.4.1 + - r-curl=4.3.1 + - r-data.table=1.14.0 + - r-dbi=1.1.1 + - r-dbplyr=2.1.1 + - r-desc=1.3.0 + - r-dichromat=2.0_0 + - r-diffobj=0.3.4 + - r-digest=0.6.27 + - r-dplyr=1.0.6 + - r-ellipsis=0.3.2 + - r-evaluate=0.14 + - r-fansi=0.4.2 + - r-farver=2.1.0 + - r-fastmap=1.1.0 + - r-foreign=0.8_81 + - r-formatr=1.9 + - r-formula=1.2_4 + - r-futile.logger=1.4.3 + - r-futile.options=1.0.1 + - r-generics=0.1.0 + - r-ggplot2=3.3.3 + - r-glue=1.4.2 + - r-gridextra=2.3 + - r-gtable=0.3.0 + - r-highr=0.9 + - r-hmisc=4.5_0 + - r-hms=1.1.0 + - r-htmltable=2.2.1 + - r-htmltools=0.5.1.1 + - r-htmlwidgets=1.5.3 + - r-httr=1.4.2 + - r-isoband=0.2.4 + - r-jpeg=0.1_8.1 + - r-jsonlite=1.7.2 + - r-knitr=1.33 + - r-labeling=0.4.2 + - r-lambda.r=1.2.4 + - r-lattice=0.20_44 + - r-latticeextra=0.6_29 + - r-lazyeval=0.2.2 + - r-lifecycle=1.0.0 + - r-magick=2.7.2 + - r-magrittr=2.0.1 + - r-markdown=1.1 + - r-mass=7.3_54 + - r-matrix=1.3_3 + - r-matrixstats=0.58.0 + - r-memoise=2.0.0 + - r-mgcv=1.8_35 + - r-mime=0.10 + - r-munsell=0.5.0 + - r-nlme=3.1_152 + - r-nnet=7.3_16 + - r-openssl=1.4.4 + - r-pillar=1.6.1 + - r-pkgconfig=2.0.3 + - r-pkgload=1.2.1 + - r-plogr=0.2.0 + - r-png=0.1_7 + - r-praise=1.0.0 + - r-prettyunits=1.1.1 + - r-processx=3.5.2 + - r-progress=1.2.2 + - r-ps=1.6.0 + - r-purrr=0.3.4 + - r-r6=2.5.0 + - r-rappdirs=0.3.3 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.6 + - r-rcurl=1.98_1.3 + - r-rematch2=2.1.2 + - r-rlang=0.4.11 + - r-rpart=4.1_15 + - r-rprojroot=2.0.2 + - r-rsqlite=2.2.5 + - r-rstudioapi=0.13 + - r-scales=1.1.1 + - r-snow=0.4_3 + - r-stringi=1.6.2 + - r-stringr=1.4.0 + - r-survival=3.2_11 + - r-sys=3.4 + - r-testthat=3.0.2 + - r-tibble=3.1.2 + - r-tidyr=1.1.3 + - r-tidyselect=1.1.1 + - r-utf8=1.2.1 + - r-vctrs=0.3.8 
+ - r-viridis=0.6.1 + - r-viridislite=0.4.0 + - r-waldo=0.2.5 + - r-withr=2.4.2 + - r-xfun=0.23 + - r-xml=3.99_0.6 + - r-xml2=1.3.2 + - r-yaml=2.2.1 + - r-zeallot=0.1.0 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.6.12 + - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/hmftools-linx-1.15 diff --git a/envs/hmftools/hmftools-purple-2.54.yaml b/envs/hmftools/hmftools-purple-2.54.yaml new file mode 100644 index 000000000..0d97751d1 --- /dev/null +++ b/envs/hmftools/hmftools-purple-2.54.yaml @@ -0,0 +1,260 @@ +name: hmftools-purple-2.54 +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - alsa-lib=1.2.3 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bioconductor-annotationdbi=1.52.0 + - bioconductor-biobase=2.50.0 + - bioconductor-biocfilecache=1.14.0 + - bioconductor-biocgenerics=0.36.0 + - bioconductor-biocparallel=1.24.1 + - bioconductor-biomart=2.46.3 + - bioconductor-biostrings=2.58.0 + - bioconductor-bsgenome=1.58.0 + - bioconductor-delayedarray=0.16.3 + - bioconductor-genomeinfodb=1.26.4 + - bioconductor-genomeinfodbdata=1.2.4 + - bioconductor-genomicalignments=1.26.0 + - bioconductor-genomicfeatures=1.42.2 + - bioconductor-genomicranges=1.42.0 + - bioconductor-iranges=2.24.1 + - bioconductor-matrixgenerics=1.2.1 + - bioconductor-rhtslib=1.22.0 + - bioconductor-rsamtools=2.6.0 + - bioconductor-rtracklayer=1.50.0 + - bioconductor-s4vectors=0.28.1 + - bioconductor-summarizedexperiment=1.20.0 + - 
bioconductor-variantannotation=1.36.0 + - bioconductor-xvector=0.30.0 + - bioconductor-zlibbioc=1.36.0 + - bwidget=1.9.14 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.4.13 + - cairo=1.16.0 + - circos=0.69.8 + - curl=7.76.1 + - expat=2.3.0 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - giflib=5.2.1 + - graphite2=1.3.14 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.1 + - hmftools-purple=2.54 + - icu=68.1 + - jbig=2.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.19.1 + - lcms2=2.12 + - ld_impl_linux-64=2.35.1 + - lerc=2.2.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.76.1 + - libdeflate=1.7 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgd=2.3.2 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.2 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.15 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp=1.2.0 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openjdk=11.0.9.1 + - openssl=1.1.1k + - pango=1.48.5 + - pcre=8.44 + - pcre2=10.36 + - perl=5.26.2 + - perl-autoloader=5.74 + - perl-carp=1.38 + - perl-clone=0.42 + - perl-config-general=2.63 + - perl-digest-perl-md5=1.9 + - perl-dynaloader=1.25 + - perl-exporter=5.72 + - perl-exporter-tiny=1.002001 + - perl-extutils-makemaker=7.36 + - perl-font-ttf=1.06 + - perl-gd=2.68 + - perl-io-string=1.08 + - perl-list-moreutils=0.428 + - perl-list-moreutils-xs=0.428 + 
- perl-math-bezier=0.01 + - perl-math-round=0.07 + - perl-math-vecstat=0.08 + - perl-module-implementation=0.09 + - perl-module-runtime=0.016 + - perl-number-format=1.75 + - perl-params-validate=1.29 + - perl-pathtools=3.75 + - perl-readonly=2.05 + - perl-regexp-common=2017060201 + - perl-scalar-list-utils=1.52 + - perl-set-intspan=1.19 + - perl-statistics-basic=1.6611 + - perl-svg=2.84 + - perl-text-format=0.59 + - perl-time-hires=1.9760 + - perl-try-tiny=0.30 + - perl-xml-parser=2.44_01 + - perl-xsloader=0.24 + - pixman=0.40.0 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.2.1 + - r-base=4.0.5 + - r-bh=1.75.0_0 + - r-bit=4.0.4 + - r-bit64=4.0.5 + - r-bitops=1.0_7 + - r-blob=1.2.1 + - r-brio=1.1.2 + - r-cachem=1.0.5 + - r-cairo=1.5_12.2 + - r-callr=3.7.0 + - r-cli=2.5.0 + - r-colorspace=2.0_1 + - r-cowplot=1.1.1 + - r-crayon=1.4.1 + - r-curl=4.3.1 + - r-dbi=1.1.1 + - r-dbplyr=2.1.1 + - r-desc=1.3.0 + - r-diffobj=0.3.4 + - r-digest=0.6.27 + - r-dplyr=1.0.6 + - r-ellipsis=0.3.2 + - r-evaluate=0.14 + - r-fansi=0.4.2 + - r-farver=2.1.0 + - r-fastmap=1.1.0 + - r-formatr=1.9 + - r-futile.logger=1.4.3 + - r-futile.options=1.0.1 + - r-generics=0.1.0 + - r-ggplot2=3.3.3 + - r-glue=1.4.2 + - r-gtable=0.3.0 + - r-hms=1.1.0 + - r-httr=1.4.2 + - r-isoband=0.2.4 + - r-jsonlite=1.7.2 + - r-labeling=0.4.2 + - r-lambda.r=1.2.4 + - r-lattice=0.20_44 + - r-lifecycle=1.0.0 + - r-magrittr=2.0.1 + - r-mass=7.3_54 + - r-matrix=1.3_3 + - r-matrixstats=0.58.0 + - r-memoise=2.0.0 + - r-mgcv=1.8_35 + - r-mime=0.10 + - r-munsell=0.5.0 + - r-nlme=3.1_152 + - r-openssl=1.4.4 + - r-pillar=1.6.1 + - r-pkgconfig=2.0.3 + - r-pkgload=1.2.1 + - r-plogr=0.2.0 + - r-praise=1.0.0 + - r-prettyunits=1.1.1 + - r-processx=3.5.2 + - r-progress=1.2.2 + - r-ps=1.6.0 + - r-purrr=0.3.4 + - r-r6=2.5.0 + - r-rappdirs=0.3.3 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.6 + - r-rcurl=1.98_1.3 + - r-rematch2=2.1.2 + - r-rlang=0.4.11 + - r-rprojroot=2.0.2 + - r-rsqlite=2.2.5 + - r-rstudioapi=0.13 + - 
r-scales=1.1.1 + - r-snow=0.4_3 + - r-stringi=1.6.2 + - r-stringr=1.4.0 + - r-sys=3.4 + - r-testthat=3.0.2 + - r-tibble=3.1.2 + - r-tidyr=1.1.3 + - r-tidyselect=1.1.1 + - r-utf8=1.2.1 + - r-vctrs=0.3.8 + - r-viridislite=0.4.0 + - r-waldo=0.2.5 + - r-withr=2.4.2 + - r-xml=3.99_0.6 + - r-xml2=1.3.2 + - r-zeallot=0.1.0 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.1 + - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/hmftools-purple-2.54 diff --git a/envs/rainstorm/rainstorm-0.3.yaml b/envs/rainstorm/rainstorm-0.3.yaml new file mode 100644 index 000000000..3494be888 --- /dev/null +++ b/envs/rainstorm/rainstorm-0.3.yaml @@ -0,0 +1,169 @@ +name: /projects/rmorin_scratch/conda_environments/rainstormR +channels: + - r + - bioconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.0 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bioconductor-biocgenerics=0.32.0 + - bioconductor-genomeinfodb=1.22.0 + - bioconductor-genomeinfodbdata=1.2.2 + - bioconductor-genomicranges=1.38.0 + - bioconductor-iranges=2.20.0 + - bioconductor-maftools=2.2.0 + - bioconductor-massspecwavelet=1.52.0 + - bioconductor-s4vectors=0.24.0 + - bioconductor-xvector=0.26.0 + - bioconductor-zlibbioc=1.32.0 + - blas=1.0 + - bwidget=1.9.11 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2020.12.5 + - cairo=1.16.0 + - certifi=2020.12.5 + - click=7.1.2 + - coloredlogs=15.0 + - curl=7.75.0 + - cyvcf2=0.11.7 + - fontconfig=2.13.1 + - freetype=2.10.4 + - fribidi=1.0.10 + - 
gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - glib=2.68.0 + - glib-tools=2.68.0 + - graphite2=1.3.14 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.0 + - htslib=1.9 + - humanfriendly=9.1 + - icu=68.1 + - intel-openmp=2020.2 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.17.1 + - ld_impl_linux-64=2.35.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.75.0 + - libdeflate=1.2 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.0 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.13 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.2.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.10 + - lz4-c=1.9.3 + - make=4.2.1 + - mkl=2020.2 + - mkl-service=2.3.0 + - mkl_fft=1.3.0 + - mkl_random=1.1.1 + - natsort=7.1.1 + - ncls=0.0.57 + - ncurses=6.2 + - numpy=1.19.2 + - numpy-base=1.19.2 + - openssl=1.1.1j + - pandas=1.2.3 + - pango=1.42.4 + - pcre=8.44 + - pip=21.0.1 + - pixman=0.40.0 + - pyranges=0.0.95 + - pyrle=0.0.32 + - python=3.7.10 + - python-dateutil=2.8.1 + - python_abi=3.7 + - pytz=2021.1 + - r-argparse=2.0.1 + - r-assertthat=0.2.1 + - r-base=3.6.3 + - r-bitops=1.0_6 + - r-cli=1.1.0 + - r-codetools=0.2_16 + - r-colorspace=1.4_1 + - r-crayon=1.3.4 + - r-data.table=1.12.2 + - r-dichromat=2.0_0 + - r-digest=0.6.18 + - r-doparallel=1.0.14 + - r-fansi=0.4.0 + - r-findpython=1.0.5 + - r-foreach=1.4.4 + - r-ggplot2=3.1.1 + - r-glue=1.3.1 + - r-gtable=0.3.0 + - r-iterators=1.0.10 + - r-jsonlite=1.6 + - r-labeling=0.3 + - r-lattice=0.20_38 + - r-lazyeval=0.2.2 + - r-magrittr=1.5 + - r-mass=7.3_51.3 + - r-matrix=1.3_2 + - r-mgcv=1.8_28 + - r-munsell=0.5.0 + - r-nlme=3.1_152 + - r-pillar=1.3.1 + - 
r-pkgconfig=2.0.2 + - r-plyr=1.8.4 + - r-r6=2.4.0 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.1 + - r-rcurl=1.95_4.12 + - r-reshape2=1.4.3 + - r-rlang=0.3.4 + - r-scales=1.0.0 + - r-stringi=1.4.3 + - r-stringr=1.4.0 + - r-survival=2.44_1.1 + - r-tibble=2.1.1 + - r-utf8=1.1.4 + - r-viridislite=0.3.0 + - r-waveslim=1.8.2 + - r-withr=2.1.2 + - r-wordcloud=2.6 + - readline=8.1 + - scikit-misc=0.1.3 + - sed=4.8 + - setuptools=52.0.0 + - six=1.15.0 + - sorted_nearest=0.0.32 + - sqlite=3.35.2 + - sysroot_linux-64=2.12 + - tabulate=0.8.9 + - tk=8.6.10 + - tktable=2.10 + - wheel=0.36.2 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.4.5 +prefix: /projects/rmorin_scratch/conda_environments/rainstormR diff --git a/envs/sigprofiler/sigprofiler-1.1.yaml b/envs/sigprofiler/sigprofiler-1.1.yaml new file mode 100644 index 000000000..44cc4d3bf --- /dev/null +++ b/envs/sigprofiler/sigprofiler-1.1.yaml @@ -0,0 +1,112 @@ +name: sigprofiler +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - arrow=1.1.1 + - binaryornot=0.4.4 + - brotlipy=0.7.0 + - bzip2=1.0.8 + - c-ares=1.17.2 + - ca-certificates=2021.5.30 + - certifi=2021.5.30 + - cffi=1.14.6 + - chardet=4.0.0 + - click=8.0.1 + - cookiecutter=1.7.3 + - cryptography=3.4.7 + - curl=7.78.0 + - expat=2.4.1 + - gettext=0.19.8.1 + - git=2.32.0 + - jinja2=2.11.3 + - jinja2-time=0.2.0 + - krb5=1.19.2 + - ld_impl_linux-64=2.36.1 + - libcurl=7.78.0 + - libedit=3.1.20191231 + - libev=4.33 + - libffi=3.3 + - libgcc-ng=11.1.0 + - libgomp=11.1.0 + - libiconv=1.16 + - libnghttp2=1.43.0 + - libssh2=1.9.0 + - libstdcxx-ng=11.1.0 + - markupsafe=1.1.1 + - ncurses=6.2 + - openssl=1.1.1k + - pcre2=10.37 + - perl=5.32.1 + - pip=21.2.4 + - poyo=0.5.0 + - pycparser=2.20 + - pyopenssl=20.0.1 + - pysocks=1.7.1 + - python=3.9.6 + - python-dateutil=2.8.2 + - python-slugify=5.0.2 + - python_abi=3.9 + - readline=8.1 + - requests=2.26.0 + - setuptools=49.6.0 + - six=1.16.0 + - sqlite=3.36.0 + - text-unidecode=1.3 
+ - tk=8.6.10 + - typing_extensions=3.10.0.0 + - tzdata=2021a + - unidecode=1.2.0 + - urllib3=1.26.6 + - wheel=0.37.0 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - appdirs==1.4.4 + - attrs==21.2.0 + - charset-normalizer==2.0.4 + - configargparse==1.5.2 + - cycler==0.10.0 + - datrie==0.8.2 + - docutils==0.17.1 + - gitdb==4.0.7 + - gitpython==3.1.18 + - idna==3.2 + - ipython-genutils==0.2.0 + - joblib==1.0.1 + - jsonschema==3.2.0 + - jupyter-core==4.7.1 + - kiwisolver==1.3.1 + - matplotlib==3.4.3 + - nbformat==5.1.3 + - nimfa==1.4.0 + - numpy==1.21.2 + - pandas==1.3.2 + - patsy==0.5.1 + - pillow==8.3.1 + - psutil==5.8.0 + - pyparsing==2.4.7 + - pypdf2==1.26.0 + - pyrsistent==0.18.0 + - pytz==2021.1 + - pyyaml==5.4.1 + - ratelimiter==1.2.0.post0 + - reportlab==3.6.1 + - scikit-learn==0.24.2 + - scipy==1.7.1 + - seaborn==0.11.2 + - sigprofilerextractor==1.1.3 + - sigprofilermatrixgenerator==1.1.31 + - sigprofilerplotting==1.1.17 + - smmap==4.0.0 + - snakemake==5.18.1 + - statsmodels==0.12.2 + - threadpoolctl==2.2.0 + - toposort==1.6 + - torch==1.9.0 + - traitlets==5.0.5 + - wrapt==1.12.1 +prefix: /home/prasathp/miniconda3/envs/sigprofiler diff --git a/envs/stringtie/stringtie.yaml b/envs/stringtie/stringtie.yaml new file mode 100644 index 000000000..e9591fcbd --- /dev/null +++ b/envs/stringtie/stringtie.yaml @@ -0,0 +1,43 @@ +name: stringtie +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_gnu + - bzip2=1.0.8=h7f98852_4 + - c-ares=1.18.1=h7f98852_0 + - ca-certificates=2021.10.8=ha878542_0 + - htslib=1.12=h9093b5e_1 + - krb5=1.19.2=hcc1bbae_3 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - libcurl=7.81.0=h2574ce0_0 + - libdeflate=1.7=h7f98852_5 + - libedit=3.1.20191231=he28a2e2_2 + - libev=4.33=h516909a_1 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=11.2.0=h1d223b6_11 + - libgomp=11.2.0=h1d223b6_11 + - libnghttp2=1.43.0=h812cca2_1 + - libnsl=2.0.0=h7f98852_0 + - libssh2=1.10.0=ha56f1ee_2 + - 
libstdcxx-ng=11.2.0=he4da1e4_11 + - libuuid=2.32.1=h7f98852_1000 + - libzlib=1.2.11=h36c2ea0_1013 + - ncurses=6.2=h58526e2_4 + - openssl=1.1.1l=h7f98852_0 + - pip=21.3.1=pyhd8ed1ab_0 + - python=3.10.1=h62f1059_2_cpython + - python_abi=3.10=2_cp310 + - readline=8.1=h46c0cb4_0 + - samtools=1.12=h9aed4be_1 + - setuptools=60.5.0=py310hff52083_0 + - sqlite=3.37.0=h9cd32fc_0 + - stringtie=2.2.0=h3198e80_0 + - tk=8.6.11=h27826a3_1 + - tzdata=2021e=he74cb21_0 + - wheel=0.37.1=pyhd8ed1ab_0 + - xz=5.2.5=h516909a_1 + - zlib=1.2.11=h36c2ea0_1013 +prefix: /home/kcoyle/miniconda3/envs/stringtie diff --git a/envs/sv_annotation/sv_annotation-1.6.0.yaml b/envs/sv_annotation/sv_annotation-1.6.0.yaml new file mode 100644 index 000000000..e1f86913e --- /dev/null +++ b/envs/sv_annotation/sv_annotation-1.6.0.yaml @@ -0,0 +1,242 @@ +name: sva +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bioconductor-annotationdbi=1.52.0 + - bioconductor-biobase=2.50.0 + - bioconductor-biocfilecache=1.14.0 + - bioconductor-biocgenerics=0.36.0 + - bioconductor-biocparallel=1.24.1 + - bioconductor-biomart=2.46.3 + - bioconductor-biostrings=2.58.0 + - bioconductor-bsgenome=1.58.0 + - bioconductor-delayedarray=0.16.3 + - bioconductor-genomeinfodb=1.26.4 + - bioconductor-genomeinfodbdata=1.2.4 + - bioconductor-genomicalignments=1.26.0 + - bioconductor-genomicfeatures=1.42.2 + - bioconductor-genomicranges=1.42.0 + - bioconductor-iranges=2.24.1 + - bioconductor-matrixgenerics=1.2.1 + - bioconductor-rhtslib=1.22.0 + - bioconductor-rsamtools=2.6.0 + - bioconductor-rtracklayer=1.50.0 + - bioconductor-s4vectors=0.28.1 + - bioconductor-structuralvariantannotation=1.6.0 + - bioconductor-summarizedexperiment=1.20.0 + - bioconductor-variantannotation=1.36.0 + - bioconductor-xvector=0.30.0 + - bioconductor-zlibbioc=1.36.0 + - bwidget=1.9.14 + - bzip2=1.0.8 + - 
c-ares=1.17.1 + - ca-certificates=2021.4.13 + - cairo=1.16.0 + - curl=7.76.1 + - font-ttf-dejavu-sans-mono=2.37 + - font-ttf-inconsolata=3.000 + - font-ttf-source-code-pro=2.038 + - font-ttf-ubuntu=0.83 + - fontconfig=2.13.1 + - fonts-conda-ecosystem=1 + - fonts-conda-forge=1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.21.0 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - graphite2=1.3.14 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.1 + - icu=68.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.19.1 + - ld_impl_linux-64=2.35.1 + - libblas=3.9.0 + - libcblas=3.9.0 + - libcurl=7.76.1 + - libedit=3.1.20210216 + - libev=4.33 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.68.2 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.9.0 + - libnghttp2=1.43.0 + - libopenblas=0.3.15 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.3.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.14 + - libxml2=2.9.12 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1k + - pandoc=2.13 + - pango=1.48.5 + - pcre=8.44 + - pcre2=10.36 + - pixman=0.40.0 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.2.1 + - r-base=4.0.5 + - r-base64enc=0.1_3 + - r-bh=1.75.0_0 + - r-bit=4.0.4 + - r-bit64=4.0.5 + - r-bitops=1.0_7 + - r-blob=1.2.1 + - r-brio=1.1.2 + - r-broom=0.7.6 + - r-cachem=1.0.5 + - r-callr=3.7.0 + - r-cellranger=1.1.0 + - r-cli=2.5.0 + - r-clipr=0.7.1 + - r-colorspace=2.0_1 + - r-cpp11=0.2.7 + - r-crayon=1.4.1 + - r-curl=4.3.1 + - r-data.table=1.14.0 + - r-dbi=1.1.1 + - r-dbplyr=2.1.1 + - r-desc=1.3.0 + - r-diffobj=0.3.4 + - r-digest=0.6.27 + - r-dplyr=1.0.6 + - r-dtplyr=1.1.0 + - r-ellipsis=0.3.2 + - r-evaluate=0.14 + - r-fansi=0.4.2 + - r-farver=2.1.0 + - r-fastmap=1.1.0 + - r-forcats=0.5.1 + - 
r-formatr=1.9 + - r-fs=1.5.0 + - r-futile.logger=1.4.3 + - r-futile.options=1.0.1 + - r-gargle=1.1.0 + - r-generics=0.1.0 + - r-ggplot2=3.3.3 + - r-glue=1.4.2 + - r-googledrive=1.0.1 + - r-googlesheets4=0.3.0 + - r-gtable=0.3.0 + - r-haven=2.4.1 + - r-highr=0.9 + - r-hms=1.1.0 + - r-htmltools=0.5.1.1 + - r-httr=1.4.2 + - r-ids=1.0.1 + - r-isoband=0.2.4 + - r-jsonlite=1.7.2 + - r-knitr=1.33 + - r-labeling=0.4.2 + - r-lambda.r=1.2.4 + - r-lattice=0.20_44 + - r-lifecycle=1.0.0 + - r-lubridate=1.7.10 + - r-magrittr=2.0.1 + - r-markdown=1.1 + - r-mass=7.3_54 + - r-matrix=1.3_3 + - r-matrixstats=0.58.0 + - r-memoise=2.0.0 + - r-mgcv=1.8_35 + - r-mime=0.10 + - r-modelr=0.1.8 + - r-munsell=0.5.0 + - r-nlme=3.1_152 + - r-openssl=1.4.4 + - r-pillar=1.6.1 + - r-pkgconfig=2.0.3 + - r-pkgload=1.2.1 + - r-plogr=0.2.0 + - r-plyr=1.8.6 + - r-praise=1.0.0 + - r-prettyunits=1.1.1 + - r-processx=3.5.2 + - r-progress=1.2.2 + - r-ps=1.6.0 + - r-purrr=0.3.4 + - r-r6=2.5.0 + - r-rappdirs=0.3.3 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.6 + - r-rcurl=1.98_1.3 + - r-readr=1.4.0 + - r-readxl=1.3.1 + - r-rematch=1.0.1 + - r-rematch2=2.1.2 + - r-reprex=2.0.0 + - r-reshape2=1.4.4 + - r-rlang=0.4.11 + - r-rmarkdown=2.8 + - r-rprojroot=2.0.2 + - r-rsqlite=2.2.5 + - r-rstudioapi=0.13 + - r-rvest=1.0.0 + - r-scales=1.1.1 + - r-selectr=0.4_2 + - r-snow=0.4_3 + - r-stringi=1.6.2 + - r-stringr=1.4.0 + - r-sys=3.4 + - r-testthat=3.0.2 + - r-tibble=3.1.2 + - r-tidyr=1.1.3 + - r-tidyselect=1.1.1 + - r-tidyverse=1.3.1 + - r-tinytex=0.31 + - r-utf8=1.2.1 + - r-uuid=0.1_4 + - r-vctrs=0.3.8 + - r-viridislite=0.4.0 + - r-waldo=0.2.5 + - r-withr=2.4.2 + - r-xfun=0.23 + - r-xml=3.99_0.6 + - r-xml2=1.3.2 + - r-yaml=2.2.1 + - r-zeallot=0.1.0 + - readline=8.1 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.1 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - 
xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.5.0 +prefix: /home/lhilton/miniconda3/envs/sva diff --git a/images/module_levels.png b/images/module_levels.png index f1cbbe41f..15a362aa6 100644 Binary files a/images/module_levels.png and b/images/module_levels.png differ diff --git a/modules/bam2fastq/1.2/bam2fastq.smk b/modules/bam2fastq/1.2/bam2fastq.smk index 43e0a244d..161cd387a 100644 --- a/modules/bam2fastq/1.2/bam2fastq.smk +++ b/modules/bam2fastq/1.2/bam2fastq.smk @@ -15,6 +15,27 @@ # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. 
Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["bam2fastq"]` CFG = op.setup_module( @@ -57,7 +78,7 @@ rule _bam2fastq_input_bam: output: bam = CFG["dirs"]["inputs"] + "{seq_type}--{genome_build}/{sample_id}.bam" run: - op.relative_symlink(input, output.bam) + op.absolute_symlink(input, output.bam) # Conditional rules depending on whether or not fastq outputs will be temporary @@ -126,8 +147,8 @@ rule _bam2fastq_output: fastq_1 = CFG["dirs"]["outputs"] + "{seq_type}/{sample_id}.read1.fastq.gz", fastq_2 = CFG["dirs"]["outputs"] + "{seq_type}/{sample_id}.read2.fastq.gz" run: - op.relative_symlink(input.fastq_1, output.fastq_1) - op.relative_symlink(input.fastq_2, output.fastq_2) + op.relative_symlink(input.fastq_1, output.fastq_1, in_module = True) + op.relative_symlink(input.fastq_2, output.fastq_2, in_module = True) rule _bam2fastq_all: diff --git a/modules/bam2fastq/1.2/bam2fastq_grouped.smk b/modules/bam2fastq/1.2/bam2fastq_grouped.smk index df5f56572..741ff1a42 100644 --- a/modules/bam2fastq/1.2/bam2fastq_grouped.smk +++ b/modules/bam2fastq/1.2/bam2fastq_grouped.smk @@ -15,6 +15,27 @@ # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. 
Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["bam2fastq"]` CFG = op.setup_module( diff --git a/modules/battenberg/1.0/battenberg.smk b/modules/battenberg/1.0/battenberg.smk index b2da0809d..c2a48d825 100644 --- a/modules/battenberg/1.0/battenberg.smk +++ b/modules/battenberg/1.0/battenberg.smk @@ -15,6 +15,26 @@ import oncopipe as op import glob +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. 
Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["battenberg"]` CFG = op.setup_module( @@ -47,8 +67,8 @@ rule _battenberg_input_bam: bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai" run: - op.relative_symlink(input.bam, output.bam) - op.relative_symlink(input.bam + ".bai", output.bai) + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bam + ".bai", output.bai) # Installs the Battenberg R dependencies and associated software (impute2, alleleCounter) # Currently I think this rule has to be run twice for it to work properly because the conda environment is created here. 
@@ -158,9 +178,9 @@ rule _battenberg_output_seg: plots = glob.glob(params.batt_dir + "/*.png") for png in plots: bn = os.path.basename(png) - op.relative_symlink(png, params.png_dir + "/" + bn) - op.relative_symlink(input.seg, output.seg) - op.relative_symlink(input.sub, output.sub) + op.relative_symlink(png, params.png_dir + "/" + bn, in_module = True) + op.relative_symlink(input.seg, output.seg, in_module = True) + op.relative_symlink(input.sub, output.sub, in_module = True) # Generates the target sentinels for each run, which generate the symlinks rule _battenberg_all: diff --git a/modules/battenberg/1.0/config/default.yaml b/modules/battenberg/1.0/config/default.yaml index f69b5d001..647d82bb1 100644 --- a/modules/battenberg/1.0/config/default.yaml +++ b/modules/battenberg/1.0/config/default.yaml @@ -6,7 +6,7 @@ lcr-modules: sample_bam: "__UPDATE__" battenberg_script: "{MODSDIR}/src/R/battenberg_wgs_hg38.R" calc_sex_status: "{MODSDIR}/src/bash/calc_sex_status.sh" - cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.0/cnv2igv.py" + cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.4/cnv2igv.py" #TODO: this should be tested with v1.2 of cnv2igv.py scratch_subdirectories: [] diff --git a/modules/battenberg/1.1/config/default.yaml b/modules/battenberg/1.1/config/default.yaml index 384415efc..5ae3a3179 100644 --- a/modules/battenberg/1.1/config/default.yaml +++ b/modules/battenberg/1.1/config/default.yaml @@ -5,7 +5,7 @@ lcr-modules: # Available wildcards: {seq_type} {genome_build} {sample_id} sample_bam: "__UPDATE__" battenberg_script: "{MODSDIR}/src/battenberg_wgs_hg38.R" - cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.3/cnv2igv.py" + cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.4/cnv2igv.py" src_dir: "{MODSDIR}/src/" scratch_subdirectories: [] diff --git a/modules/battenberg/1.2/battenberg.smk b/modules/battenberg/1.2/battenberg.smk new file mode 100644 index 000000000..370c80339 --- /dev/null +++ b/modules/battenberg/1.2/battenberg.smk @@ -0,0 +1,297 @@ +#!/usr/bin/env snakemake + + +##### ATTRIBUTION ##### + + +# 
Original Author: Ryan Morin +# Module Author: Ryan Morin +# Contributors: N/A + +##### SETUP ##### + + +# Import package with useful functions for developing analysis modules +import oncopipe as op +import glob + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + +# Setup module and store module-specific configuration in `CFG` +# `CFG` is a shortcut to `config["lcr-modules"]["battenberg"]` +CFG = op.setup_module( + name = "battenberg", + version = "1.2", + subdirectories = ["inputs", "infer_sex","battenberg", "outputs"], +) + +#set variable for prepending to PATH based on config +BATTENBERG_SCRIPT_PATH = CFG['inputs']['src_dir'] +#this is used in place of the shell.prefix() because that was not working consistently. This is not ideal. 
+
+#this preserves the variable when using lambda functions
+_battenberg_CFG = CFG
+
+# Define rules to be run locally when using a compute cluster
+localrules:
+    _battenberg_all
+
+# Maps each supported {genome_build} wildcard value to the build label
+# ("grch37" or "hg38") used in the reference file names downloaded below.
+BATTENBERG_VERSION_MAP = {
+    "hg19": "grch37",
+    "grch37": "grch37",
+    "hs37d5": "grch37",
+    "hg38": "hg38",
+    "grch38": "hg38",
+    "grch38-legacy": "hg38"
+
+}
+
+##### RULES #####
+
+# Downloads the reference files into the module results directory (under '00-inputs/') from https://www.bcgsc.ca/downloads/morinlab/reference/ .
+# Four tarballs are streamed through `tar -xvz -C {params.folder}`; impute_info.txt and probloci.txt.gz
+# are fetched directly with `wget -O`. After impute_info.txt is downloaded, reference_correction.py
+# (from src_dir) is run with the genome build and the directory containing impute_info.txt.
+# NOTE(review): `tar -xvz > {output.<dir>}` redirects tar's stdout to a path that is declared as a
+# directory() output -- confirm this is intended and does not clash with the extracted directory.
+# NOTE(review): this rule has no `conda:` directive although the module config lists a `wget`
+# environment -- presumably wget/tar/python come from the calling environment; confirm.
+rule _battenberg_get_reference:
+    output:
+        battenberg_impute = directory(CFG["dirs"]["inputs"] + "reference/{genome_build}/battenberg_impute_v3"),
+        impute_info = CFG["dirs"]["inputs"] + "reference/{genome_build}/impute_info.txt",
+        probloci = CFG["dirs"]["inputs"] + "reference/{genome_build}/probloci.txt.gz",
+        battenberg_wgs_replic_correction = directory(CFG["dirs"]["inputs"] + "reference/{genome_build}/battenberg_wgs_replic_correction_1000g_v3"),
+        battenberg_gc_correction = directory(CFG["dirs"]["inputs"] + "reference/{genome_build}/battenberg_wgs_gc_correction_1000g_v3"),
+        genomesloci = directory(CFG["dirs"]["inputs"] + "reference/{genome_build}/battenberg_1000genomesloci2012_v3")
+    params:
+        url = "https://www.bcgsc.ca/downloads/morinlab/reference",
+        # Build label used in the remote file names (looked up from the wildcard)
+        alt_build = lambda w: BATTENBERG_VERSION_MAP[w.genome_build],
+        folder = CFG["dirs"]["inputs"] + "reference/{genome_build}",
+        build = "{genome_build}",
+        battenberg_path = CFG['inputs']['src_dir']
+    resources:
+        **CFG["resources"]["reference"]
+    threads:
+        CFG["threads"]["reference"]
+    shell:
+        op.as_one_line("""
+        wget -qO- {params.url}/battenberg_impute_{params.alt_build}.tar.gz |
+        tar -xvz > {output.battenberg_impute} -C {params.folder}
+        &&
+        wget -qO- {params.url}/battenberg_{params.alt_build}_gc_correction.tar.gz |
+        tar -xvz > {output.battenberg_gc_correction} -C {params.folder}
+        &&
+        wget -qO- {params.url}/battenberg_1000genomesloci_{params.alt_build}.tar.gz |
+        tar -xvz > {output.genomesloci} -C {params.folder}
+        &&
+        wget -O {output.impute_info} {params.url}/impute_info_{params.alt_build}.txt
+        &&
+        python {params.battenberg_path}/reference_correction.py {params.build} $(dirname $(readlink -f {output.impute_info}))
+        &&
+        wget -qO- {params.url}/battenberg_{params.alt_build}_replic_correction.tar.gz |
+        tar -xvz > {output.battenberg_wgs_replic_correction} -C {params.folder}
+        &&
+        wget -O {output.probloci} {params.url}/probloci_{params.alt_build}.txt.gz
+
+        """)
+
+
+# Symlinks the input files into the module results directory (under '00-inputs/')
+# Absolute symlinks are created for the BAM itself and for two index names (.bam.bai and .bam.crai).
+# NOTE(review): both index links point at `input.bam + ".bai"`, including the `.crai` output --
+# confirm whether a `.crai` source path was intended for CRAM inputs.
+rule _battenberg_input_bam:
+    input:
+        bam = CFG["inputs"]["sample_bam"]
+    output:
+        bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam",
+        bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai",
+        crai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.crai"
+    group: "setup_run"
+    run:
+        op.absolute_symlink(input.bam, output.bam)
+        op.absolute_symlink(input.bam + ".bai", output.bai)
+        op.absolute_symlink(input.bam + ".bai", output.crai)
+
+# Installs the Battenberg R dependencies and associated software (impute2, alleleCounter)
+# Currently I think this rule has to be run twice for it to work properly because the conda environment is created here.
+# I am open to suggestions for how to get around this.
+# The two GitHub installs append their stdout to {log.input}; the sentinel file is only
+# touched when both `R` commands succeed (they are chained with `&&`).
+rule _install_battenberg:
+    output:
+        complete = CFG["dirs"]["inputs"] + "battenberg_dependencies_installed.success"
+    conda:
+        CFG["conda_envs"]["battenberg"]
+    log:
+        input = CFG["logs"]["inputs"] + "input.log"
+    shell:
+        """
+        R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' >> {log.input} && ##move some of this to config?
+        R -q -e 'devtools::install_github("morinlab/battenberg")' >> {log.input} && ##move some of this to config?
+        touch {output.complete}"""
+
+# this process is very fast on bam files and painfully slow on cram files.
+# The result of calc_sex_status.sh is stored in a file to avoid having to rerun it unnecessarily
+# Runs calc_sex_status.sh on the normal BAM with the genome FASTA and the normal sample ID;
+# the script's stdout becomes the .sex file and its stderr is appended to the rule log.
+# NOTE(review): `threads` is hard-coded to 8 here rather than read from CFG["threads"] -- confirm.
+rule _infer_patient_sex:
+    input:
+        normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
+        fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa")
+    output: sex_result = CFG["dirs"]["infer_sex"] + "{seq_type}--{genome_build}/{normal_id}.sex"
+    resources:
+        **CFG["resources"]["infer_sex"]
+    log:
+        stderr = CFG["logs"]["infer_sex"] + "{seq_type}--{genome_build}/{normal_id}_infer_sex_stderr.log"
+    conda:
+        CFG["conda_envs"]["samtools"]
+    group: "setup_run"
+    threads: 8
+    # BATTENBERG_SCRIPT_PATH (the configured src_dir) is prepended to PATH so that
+    # calc_sex_status.sh can be invoked by name in the command below.
+    shell:
+        op.as_one_line("""
+        PATH={BATTENBERG_SCRIPT_PATH}:$PATH;
+        echo "running {rule} for {wildcards.normal_id} on $(hostname) at $(date)" > {log.stderr} ;
+        calc_sex_status.sh {input.normal_bam} {input.fasta} {wildcards.normal_id} > {output.sex_result} 2>> {log.stderr} &&
+        echo "DONE running {rule} for {wildcards.normal_id} on $(hostname) at $(date)" >> {log.stderr}
+        """)
+
+
+# This rule runs the entire Battenberg pipeline. Eventually we may want to set this rule up to allow re-starting
+# of partially completed jobs (e.g. 
if they run out of RAM and are killed by the cluster, they can automatically retry)
+# Inputs: the symlinked tumour/normal BAMs, the dependency-install sentinel, the inferred-sex file,
+# the genome FASTA and the downloaded impute_info.txt (which ties this rule to _battenberg_get_reference).
+# The shell command detects whether the FASTA uses "chr"-prefixed names from its first four bytes,
+# reads the sex from column 4 of the last line of the .sex file, then runs the Battenberg R script.
+# Every statement inside op.as_one_line() must be explicitly terminated (`;` or `&&`) because the
+# helper collapses the newlines; the `echo "$chr_prefixed"` line was missing its terminator and
+# was being merged into the following echo (and redirected into the log).
+# NOTE(review): the R script declares --chr_prefixed_genome as a store_true flag, yet a
+# true/false value is passed after it here -- confirm the value is actually honoured.
+rule _run_battenberg:
+    input:
+        tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam",
+        normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
+        installed = CFG["dirs"]["inputs"] + "battenberg_dependencies_installed.success",
+        sex_result = CFG["dirs"]["infer_sex"] + "{seq_type}--{genome_build}/{normal_id}.sex",
+        fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
+        impute_info = str(rules._battenberg_get_reference.output.impute_info)
+
+    output:
+        refit=CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_refit_suggestion.txt",
+        sub=CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_subclones.txt",
+        ac=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_alleleCounts.tab"),
+        mb=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_mutantBAF.tab"),
+        mlrg=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_mutantLogR_gcCorrected.tab"),
+        mlr=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_mutantLogR.tab"),
+        nlr=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_normalLogR.tab"),
+        nb=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_normalBAF.tab"),
+        cp=CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_cellularity_ploidy.txt"
+    log:
+        stdout = CFG["logs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_battenberg.stdout.log",
+        stderr = CFG["logs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_battenberg.stderr.log"
+    params:
+        fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
+        script = CFG["inputs"]["battenberg_script"],
+        out_dir = CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}",
+        ref = CFG["dirs"]["inputs"] + "reference/{genome_build}"
+    conda:
+        CFG["conda_envs"]["battenberg"]
+    resources:
+        **CFG["resources"]["battenberg"]
+    threads:
+        CFG["threads"]["battenberg"]
+    shell:
+        op.as_one_line("""
+        if [[ $(head -c 4 {params.fasta}) == ">chr" ]]; then chr_prefixed='true'; else chr_prefixed='false'; fi;
+        echo "$chr_prefixed";
+        echo "running {rule} for {wildcards.tumour_id}--{wildcards.normal_id} on $(hostname) at $(date)" > {log.stdout};
+        sex=$(cut -f 4 {input.sex_result}| tail -n 1);
+        echo "setting sex as $sex";
+        Rscript {params.script} -t {wildcards.tumour_id}
+        -n {wildcards.normal_id} --tb $(readlink -f {input.tumour_bam}) --nb $(readlink -f {input.normal_bam}) -f {input.fasta} --reference $(readlink -f {params.ref})
+        -o {params.out_dir} --chr_prefixed_genome $chr_prefixed --sex $sex --cpu {threads} >> {log.stdout} 2>> {log.stderr} &&
+        echo "DONE {rule} for {wildcards.tumour_id}--{wildcards.normal_id} on $(hostname) at $(date)" >> {log.stdout};
+        """)
+
+
+# Convert the subclones.txt (best fit) to igv-friendly SEG files.
+# Runs the configured cnv2igv script in "battenberg" mode on the subclones file; the script's
+# stdout is written to the .igv.seg output and its stderr is appended to the rule log.
+rule _battenberg_to_igv_seg:
+    input:
+        sub = rules._run_battenberg.output.sub,
+        cnv2igv = CFG["inputs"]["cnv2igv"]
+    output:
+        seg = CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_subclones.igv.seg"
+    log:
+        stderr = CFG["logs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_seg2igv.stderr.log"
+    threads: 1
+    group: "post_process"
+    shell:
+        op.as_one_line("""
+        echo "running {rule} for {wildcards.tumour_id}--{wildcards.normal_id} on $(hostname) at $(date)" > {log.stderr};
+        python {input.cnv2igv} --mode battenberg --sample {wildcards.tumour_id}
+        {input.sub} > {output.seg} 2>> {log.stderr}
+        """)
+
+
+#due to the large number of files (several per chromosome) that are not explicit outputs, do some glob-based cleaning in the output directory
+# Takes the SEG file as input, so it is scheduled only after the SEG conversion has completed.
+# Files matching the four glob patterns are removed from the directory holding the sentinel output.
+rule _battenberg_cleanup:
+    input:
+        rules._battenberg_to_igv_seg.output.seg
+    output:
+        complete = CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_cleanup_complete.txt"
+    group: "post_process"
+    shell:
+        op.as_one_line("""
+        d=$(dirname {output});
+        rm -f $d/*impute_input* &&
+        rm -f $d/*alleleFrequencies* &&
+        rm -f $d/*aplotype* &&
+        rm -f $d/*BAFsegmented* &&
+        touch {output.complete}
+        """)
+
+# Symlinks the final output files into the module results directory (under '99-outputs/')
+# All plots generated by Battenberg are symlinked using a glob for convenience
+# The PNG links are created at run time from whatever the glob finds; they are not declared outputs.
+# NOTE(review): `batt_dir` inserts a "/" after CFG["dirs"]["battenberg"], whereas other rules
+# concatenate that directory without one -- presumably this yields a harmless "//"; confirm.
+
+rule _battenberg_output_seg:
+    input:
+        seg = rules._battenberg_to_igv_seg.output.seg,
+        sub = rules._run_battenberg.output.sub,
+        cp = rules._run_battenberg.output.cp
+    output:
+        seg = CFG["dirs"]["outputs"] + "seg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.igv.seg",
+        sub = CFG["dirs"]["outputs"] + "txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt",
+        cp = CFG["dirs"]["outputs"] + "txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt"
+    params:
+        batt_dir = CFG["dirs"]["battenberg"] + "/{seq_type}--{genome_build}/{tumour_id}--{normal_id}",
+        png_dir = CFG["dirs"]["outputs"] + "png/{seq_type}--{genome_build}"
+    group: "post_process"
+    run:
+        plots = glob.glob(params.batt_dir + "/*.png")
+        for png in plots:
+            bn = os.path.basename(png)
+            op.relative_symlink(png, params.png_dir + "/" + bn,in_module=True)
+        op.relative_symlink(input.seg, output.seg,in_module=True)
+        op.relative_symlink(input.sub, output.sub,in_module=True)
+        op.relative_symlink(input.cp, output.cp,in_module=True)
+
+# Generates the target sentinels for each run, which generate the symlinks
+# Requests, per run, the subclones file, the symlinked SEG file and the cleanup sentinel.
+# NOTE(review): `pair_status` is passed to expand() although none of the three patterns
+# visibly contains a {pair_status} wildcard -- presumably unused; confirm.
+rule _battenberg_all:
+    input:
+        expand(
+            [
+                rules._run_battenberg.output.sub,
+                rules._battenberg_output_seg.output.seg,
+                rules._battenberg_cleanup.output.complete
+            ],
+            zip,  # Run expand() with zip(), not product()
+            seq_type=CFG["runs"]["tumour_seq_type"],
+            genome_build=CFG["runs"]["tumour_genome_build"],
+            tumour_id=CFG["runs"]["tumour_sample_id"],
+            normal_id=CFG["runs"]["normal_sample_id"],
+            pair_status=CFG["runs"]["pair_status"])
+
+
+##### CLEANUP #####
+
+
+# Perform some clean-up tasks, including storing the module-specific
+# configuration on disk and deleting the `CFG` variable
+op.cleanup_module(CFG)
diff --git a/modules/battenberg/1.2/config/default.yaml b/modules/battenberg/1.2/config/default.yaml
new file mode 100644
index 000000000..85c65aa64
--- /dev/null
+++ b/modules/battenberg/1.2/config/default.yaml
@@ -0,0 +1,40 @@
+lcr-modules:
+    battenberg:
+        inputs:
+            # Available wildcards: {seq_type} {genome_build} {sample_id}
+            sample_bam: "__UPDATE__"
+            battenberg_script: "{MODSDIR}/src/battenberg_wgs_hg38.R"
+            cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.4/cnv2igv.py"
+            src_dir: "{MODSDIR}/src/"
+
+        scratch_subdirectories: []
+
+        conda_envs:
+            battenberg: "{MODSDIR}/envs/battenberg-1.1.yaml"
+            wget: "{MODSDIR}/envs/wget-1.20.1.yaml"
+            samtools: "{MODSDIR}/envs/samtools-1.9.yaml"
+
+        resources:
+            battenberg:
+                mem_mb: 200000
+                bam: 1
+            infer_sex:
+                mem_mb: 20000
+ bam: 1 + reference: + mem_mb: 8000 + + threads: + battenberg: 24 + reference: 2 + #ideal for processing all chromosomes at once + + pairing_config: + genome: + run_paired_tumours: True + run_unpaired_tumours_with: null + run_paired_tumours_as_unpaired: False + capture: + run_paired_tumours: True + run_unpaired_tumours_with: null + run_paired_tumours_as_unpaired: False diff --git a/modules/battenberg/1.2/envs/battenberg-1.1.yaml b/modules/battenberg/1.2/envs/battenberg-1.1.yaml new file mode 100644 index 000000000..e6f3a0dab --- /dev/null +++ b/modules/battenberg/1.2/envs/battenberg-1.1.yaml @@ -0,0 +1,222 @@ +channels: + - conda-forge + - bioconda + - r + - defaults +dependencies: + - r-biocmanager=1.30.10 + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.34 + - binutils_linux-64=2.34 + - bwidget=1.9.14 + - bzip2=1.0.8 + - ca-certificates=2020.4.5.1 + - cairo=1.16.0 + - cancerit-allelecount=4.0.2 + - certifi=2020.4.5.1 + - curl=7.69.1 + - fontconfig=2.13.1 + - freetype=2.10.1 + - fribidi=1.0.9 + - gcc_impl_linux-64=7.3.0 + - gcc_linux-64=7.3.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=7.3.0 + - gfortran_linux-64=7.3.0 + - glib=2.64.2 + - graphite2=1.3.13 + - gsl=2.6 + - gxx_impl_linux-64=7.3.0 + - gxx_linux-64=7.3.0 + - harfbuzz=2.4.0 + - htslib=1.9 + - icu=64.2 + - impute2=2.3.2 + - jpeg=9c + - krb5=1.17.1 + - ld_impl_linux-64=2.34 + - libblas=3.8.0 + - libcblas=3.8.0 + - libcurl=7.69.1 + - libdeflate=1.2 + - libedit=3.1.20170329 + - libffi=3.2.1 + - libgcc-ng=9.2.0 + - libgfortran-ng=7.3.0 + - libgomp=9.2.0 + - libiconv=1.15 + - liblapack=3.8.0 + - libopenblas=0.3.9 + - libpng=1.6.37 + - libssh2=1.8.2 + - libstdcxx-ng=9.2.0 + - libtiff=4.1.0 + - libuuid=2.32.1 + - libwebp-base=1.1.0 + - libxcb=1.13 + - libxml2=2.9.10 + - llvm-openmp=10.0.0 + - lz4-c=1.9.2 + - make=4.3 + - ncurses=6.1 + - openssl=1.1.1g + - pandoc=2.9.2.1 + - pango=1.42.4 + - parallel=20200322 + - pcre=8.44 + - perl=5.26.2 + - pip=20.1 + - 
pixman=0.38.0 + - pthread-stubs=0.4 + - python=3.8.2 + - python_abi=3.8 + - r-askpass=1.1 + - r-assertthat=0.2.1 + - r-backports=1.1.6 + - r-base=3.6.3 + - r-base64enc=0.1_3 + - r-brew=1.0_6 + - r-broom=0.5.6 + - r-callr=3.4.3 + - r-cellranger=1.1.0 + - r-cli=2.0.2 + - r-clipr=0.7.0 + - r-codetools=0.2_16 + - r-colorspace=1.4_1 + - r-commonmark=1.7 + - r-covr=3.5.0 + - r-crayon=1.3.4 + - r-crosstalk=1.1.0.1 + - r-curl=4.3 + - r-dbi=1.1.0 + - r-dbplyr=1.4.3 + - r-desc=1.2.0 + - r-devtools=2.3.0 + - r-digest=0.6.25 + - r-doparallel=1.0.15 + - r-dplyr=0.8.5 + - r-dt=0.13 + - r-ellipsis=0.3.0 + - r-evaluate=0.14 + - r-fansi=0.4.1 + - r-farver=2.0.3 + - r-forcats=0.5.0 + - r-foreach=1.5.0 + - r-fs=1.4.1 + - r-generics=0.0.2 + - r-getopt=1.20.3 + - r-ggplot2=3.3.0 + - r-gh=1.1.0 + - r-git2r=0.26.1 + - r-glue=1.4.0 + - r-gridextra=2.3 + - r-gtable=0.3.0 + - r-gtools=3.8.2 + - r-haven=2.2.0 + - r-highr=0.8 + - r-hms=0.5.3 + - r-htmltools=0.4.0 + - r-htmlwidgets=1.5.1 + - r-httr=1.4.1 + - r-ini=0.3.1 + - r-isoband=0.2.1 + - r-iterators=1.0.12 + - r-jsonlite=1.6.1 + - r-knitr=1.28 + - r-labeling=0.3 + - r-later=1.0.0 + - r-lattice=0.20_41 + - r-lazyeval=0.2.2 + - r-lifecycle=0.2.0 + - r-lubridate=1.7.8 + - r-magrittr=1.5 + - r-markdown=1.1 + - r-mass=7.3_51.6 + - r-matrix=1.2_18 + - r-memoise=1.1.0 + - r-mgcv=1.8_31 + - r-mime=0.9 + - r-modelr=0.1.6 + - r-munsell=0.5.0 + - r-nlme=3.1_147 + - r-openssl=1.4.1 + - r-optparse=1.6.6 + - r-pillar=1.4.3 + - r-pkgbuild=1.0.7 + - r-pkgconfig=2.0.3 + - r-pkgload=1.0.2 + - r-plogr=0.2.0 + - r-plyr=1.8.6 + - r-praise=1.0.0 + - r-prettyunits=1.1.1 + - r-processx=3.4.2 + - r-progress=1.2.2 + - r-promises=1.1.0 + - r-ps=1.3.2 + - r-purrr=0.3.4 + - r-r6=2.4.1 + - r-rcmdcheck=1.3.3 + - r-rcolorbrewer=1.1_2 + - r-rcpp=1.0.4.6 + - r-readr=1.3.1 + - r-readxl=1.3.1 + - r-rematch=1.0.1 + - r-rematch2=2.1.1 + - r-remotes=2.1.1 + - r-reprex=0.3.0 + - r-reshape2=1.4.4 + - r-rex=1.2.0 + - r-rlang=0.4.5 + - r-rmarkdown=2.1 + - r-roxygen2=7.1.0 + - 
r-rprojroot=1.3_2 + - r-rstudioapi=0.11 + - r-rversions=2.0.1 + - r-rvest=0.3.5 + - r-scales=1.1.0 + - r-selectr=0.4_2 + - r-sessioninfo=1.1.1 + - r-stringi=1.4.6 + - r-stringr=1.4.0 + - r-sys=3.3 + - r-testthat=2.3.2 + - r-tibble=3.0.1 + - r-tidyr=1.0.2 + - r-tidyselect=1.0.0 + - r-tidyverse=1.3.0 + - r-tinytex=0.22 + - r-usethis=1.6.1 + - r-utf8=1.1.4 + - r-vctrs=0.2.4 + - r-viridislite=0.3.0 + - r-whisker=0.4 + - r-withr=2.2.0 + - r-xfun=0.13 + - r-xml2=1.3.2 + - r-xopen=1.0.0 + - r-yaml=2.2.1 + - r-zeallot=0.1.0 + - readline=8.0 + - sed=4.7 + - setuptools=46.1.3 + - sqlite=3.30.1 + - tk=8.6.10 + - tktable=2.10 + - wheel=0.34.2 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.6.9 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.4.4 + diff --git a/modules/battenberg/1.2/envs/samtools-1.9.yaml b/modules/battenberg/1.2/envs/samtools-1.9.yaml new file mode 120000 index 000000000..ab29288bb --- /dev/null +++ b/modules/battenberg/1.2/envs/samtools-1.9.yaml @@ -0,0 +1 @@ +../../../../envs/samtools/samtools-1.9.yaml \ No newline at end of file diff --git a/modules/battenberg/1.2/envs/wget-1.20.1.yaml b/modules/battenberg/1.2/envs/wget-1.20.1.yaml new file mode 120000 index 000000000..86501e72a --- /dev/null +++ b/modules/battenberg/1.2/envs/wget-1.20.1.yaml @@ -0,0 +1 @@ +../../../../envs/wget/wget-1.20.1.yaml \ No newline at end of file diff --git a/modules/battenberg/1.2/schemas/base-1.0.yaml b/modules/battenberg/1.2/schemas/base-1.0.yaml new file mode 120000 index 000000000..0a69d1ceb --- /dev/null +++ b/modules/battenberg/1.2/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/battenberg/1.2/src/battenberg_wgs_hg38.R b/modules/battenberg/1.2/src/battenberg_wgs_hg38.R new file mode 100755 index 
000000000..5f3a27074 --- /dev/null +++ b/modules/battenberg/1.2/src/battenberg_wgs_hg38.R @@ -0,0 +1,135 @@ +library(Battenberg) +library(optparse) +#source("./src/R/battenberg/R/clonal_ascat.R") +#source("./src/R/battenberg/R/impute.R") +#devtools::load_all(path="/projects/rmorin/projects/gambl-repos/gambl-rmorin/src/R/battenberg") +#source("/projects/rmorin/projects/gambl-repos/gambl-rmorin/src/R/prepare_wgs.R") +option_list = list( + make_option(c("-t", "--tumourname"), type="character", default=NULL, help="Samplename of the tumour", metavar="character"), + make_option(c("-n", "--normalname"), type="character", default=NULL, help="Samplename of the normal", metavar="character"), + make_option(c("--tb"), type="character", default=NULL, help="Tumour BAM file", metavar="character"), + make_option(c("--nb"), type="character", default=NULL, help="Normal BAM file", metavar="character"), + make_option(c("--sex"), type="character", default=NULL, help="Sex of the sample", metavar="character"), + make_option(c("-o", "--output"), type="character", default=NULL, help="Directory where output will be written", metavar="character"), + make_option(c("--skip_allelecount"), type="logical", default=FALSE, action="store_true", help="Provide when alleles don't have to be counted. This expects allelecount files on disk", metavar="character"), + make_option(c("--skip_preprocessing"), type="logical", default=FALSE, action="store_true", help="Provide when pre-processing has previously completed. This expects the files on disk", metavar="character"), + make_option(c("--skip_phasing"), type="logical", default=FALSE, action="store_true", help="Provide when phasing has previously completed. 
This expects the files on disk", metavar="character"), + make_option(c("--cpu"), type="numeric", default=8, help="The number of CPU cores to be used by the pipeline (Default: 8)", metavar="character"), + make_option(c("--bp"), type="character", default=NULL, help="Optional two column file (chromosome and position) specifying prior breakpoints to be used during segmentation", metavar="character"), + make_option(c("--reference"), type="character", default=NULL, help="Path to reference file", metavar="character"), + make_option(c("-f","--reference_fasta"), type="character", default=NULL, help="Path to indexed genome fasta file (needed for CRAM compatability)", metavar="character"), + make_option(c("--chr_prefixed_genome"), type="logical", default=FALSE, action="store_true", help="Flag to specify if the genome has chr prefixes in chromosome names", metavar="character"), + make_option(c("--impute_log"), type="character", default="./", help="Full path for where to store impute logs. If blank, these will be written to the main output directory and cleared.") +) + +opt_parser = OptionParser(option_list=option_list) +opt = parse_args(opt_parser) +original_dir = getwd() + +REFERENCE_BASE = opt$reference +TUMOURNAME = opt$tumourname +NORMALNAME = opt$normalname + +REFERENCE_FASTA = opt$reference_fasta +print(paste("using fasta:",REFERENCE_FASTA)) +IS.MALE = opt$sex=="male" | opt$sex=="Male" +RUN_DIR = opt$o +CHR_PREFIXED = opt$chr_prefixed_genome +print(paste("chr prefix present?",CHR_PREFIXED)) +SKIP_ALLELECOUNTING = opt$skip_allelecount +SKIP_PREPROCESSING = opt$skip_preprocessing +SKIP_PHASING = opt$skip_phasing +NTHREADS = opt$cpu +PRIOR_BREAKPOINTS_FILE = opt$bp +IMPUTE_LOG = opt$impute_log +verbose = TRUE +############################################################################### +# 2018-11-01 +# A pure R Battenberg v2.2.9 WGS pipeline implementation. 
+# sd11 [at] sanger.ac.uk +############################################################################### + +# General static +IMPUTEINFOFILE = paste0(REFERENCE_BASE,"/impute_info.txt") +print(IMPUTEINFOFILE) +G1000PREFIX = paste0(REFERENCE_BASE,"/battenberg_1000genomesloci2012_v3/1000genomesAlleles2012_chr") +G1000PREFIX_AC = paste0(REFERENCE_BASE,"/battenberg_1000genomesloci2012_v3/1000genomesloci2012_chr") +GCCORRECTPREFIX = paste0(REFERENCE_BASE,"/battenberg_wgs_gc_correction_1000g_v3/1000_genomes_GC_corr_chr_") +REPLICCORRECTPREFIX = paste0(REFERENCE_BASE,"/battenberg_wgs_replic_correction_1000g_v3/1000_genomes_replication_timing_chr_") +IMPUTE_EXE = "impute2" #install using conda + +PLATFORM_GAMMA = 1 +PHASING_GAMMA = 1 +SEGMENTATION_GAMMA = 10 +SEGMENTATIIN_KMIN = 3 +PHASING_KMIN = 1 +CLONALITY_DIST_METRIC = 0 +ASCAT_DIST_METRIC = 1 +MIN_PLOIDY = 1.6 +MAX_PLOIDY = 4.8 +MIN_RHO = 0.1 +MIN_GOODNESS_OF_FIT = 0.63 +BALANCED_THRESHOLD = 0.51 +MIN_NORMAL_DEPTH = 10 +MIN_BASE_QUAL = 20 +MIN_MAP_QUAL = 35 +CALC_SEG_BAF_OPTION = 3 + +# WGS specific static +ALLELECOUNTER = "alleleCounter" #conda package that should have this: cancerit-allelecount +PROBLEMLOCI = paste0(REFERENCE_BASE, "/probloci.txt.gz") + +print(PROBLEMLOCI); + +# Change to work directory and load the chromosome information +setwd(RUN_DIR) +NORMALBAM = opt$nb +TUMOURBAM = opt$tb + + +#this should be the full path to the files after changing directories + +#debugging lines added here: +#SKIP_ALLELECOUNTING = TRUE +#SKIP_PREPROCESSING = TRUE +#SKIP_PHASING = FALSE + +battenberg(tumourname=TUMOURNAME, + normalname=NORMALNAME, + tumour_data_file=TUMOURBAM, + normal_data_file=NORMALBAM, + ismale=IS.MALE, + imputeinfofile=IMPUTEINFOFILE, + g1000prefix=G1000PREFIX, + g1000allelesprefix=G1000PREFIX_AC, + gccorrectprefix=GCCORRECTPREFIX, + repliccorrectprefix=REPLICCORRECTPREFIX, + problemloci=PROBLEMLOCI, + data_type="wgs", + impute_exe=IMPUTE_EXE, + allelecounter_exe=ALLELECOUNTER, + nthreads=NTHREADS, + 
platform_gamma=PLATFORM_GAMMA, + phasing_gamma=PHASING_GAMMA, + segmentation_gamma=SEGMENTATION_GAMMA, + segmentation_kmin=SEGMENTATIIN_KMIN, + phasing_kmin=PHASING_KMIN, + clonality_dist_metric=CLONALITY_DIST_METRIC, + ascat_dist_metric=ASCAT_DIST_METRIC, + min_ploidy=MIN_PLOIDY, + max_ploidy=MAX_PLOIDY, + min_rho=MIN_RHO, + min_goodness=MIN_GOODNESS_OF_FIT, + uninformative_BAF_threshold=BALANCED_THRESHOLD, + min_normal_depth=MIN_NORMAL_DEPTH, + min_base_qual=MIN_BASE_QUAL, + min_map_qual=MIN_MAP_QUAL, + calc_seg_baf_option=CALC_SEG_BAF_OPTION, + skip_allele_counting=SKIP_ALLELECOUNTING, + skip_preprocessing=SKIP_PREPROCESSING, + skip_phasing=SKIP_PHASING, + prior_breakpoints_file=PRIOR_BREAKPOINTS_FILE, + chr_prefixed=CHR_PREFIXED, + verbose=verbose, + logfile_prefix=IMPUTE_LOG, + ref_fasta=REFERENCE_FASTA) diff --git a/modules/battenberg/1.2/src/calc_sex_status.sh b/modules/battenberg/1.2/src/calc_sex_status.sh new file mode 100755 index 000000000..3d750a05b --- /dev/null +++ b/modules/battenberg/1.2/src/calc_sex_status.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +# Infer sex of a patient from a normal genome bam using the ratio of X and Y chromosome reads +# If provided, the names of chrX and chrY will be used, otherwise they will be inferred from the header (this is not guaranteed to work) + +set -euf -o pipefail + +BAM="$1" +REF="$2" +SAMPLE="${3:-UNKNOWN}" +X_CHROM="${4:-MISSING}" +Y_CHROM="${5:-MISSING}" +DEBUG="${6:-MISSING}" + +if [[ $X_CHROM == "MISSING" ]] +then + X_CHROM=$(samtools view -H ${BAM} |\ + sed -r 's/\S+:\S+/\n&/g' | perl -ne 's/\s+//g;print "$_\n"' | awk 'BEGIN{FS=":"} $1=="SN" && $2 ~ /X$/ {print $2}') +fi +if [[ $Y_CHROM == "MISSING" ]] +then + Y_CHROM=$(samtools view -H ${BAM} |\ + sed -r 's/\S+:\S+/\n&/g' | perl -ne 's/\s+//g;print "$_\n"' | awk 'BEGIN{FS=":"} $1=="SN" && $2 ~ /Y$/ {print $2}') +fi +if [[ ! 
$DEBUG == "MISSING" ]] +then + echo "DEBUG: x chromosome is named >$X_CHROM< and y chromosome is named >$Y_CHROM<" +fi + + +X_READS=$(samtools view -@ 8 -T $REF $BAM $X_CHROM | wc -l) +Y_READS=$(samtools view -@ 8 -T $REF $BAM $Y_CHROM | wc -l) + + +ratio=$((100 * $Y_READS/$X_READS)) +sex="female" +if [[ $ratio -gt 10 ]] +then + sex="male" +fi +printf "sample\tchrX_count\tchrY_count\tsex\n" +printf "$SAMPLE\t$X_READS\t$Y_READS\t$sex\n" diff --git a/modules/battenberg/1.2/src/reference_correction.py b/modules/battenberg/1.2/src/reference_correction.py new file mode 100644 index 000000000..be482fe75 --- /dev/null +++ b/modules/battenberg/1.2/src/reference_correction.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + + +##### ATTRIBUTION ##### + + +# Original Author: Lakshay Sethi + +### Battenberg reference file corrector ### +# Replaces the placeholder value in the impute_info.txt with the correct path +# where the downloaded reference files are stored. + +# +# Usage: +# python /reference_correction.py +# +# Notes: +# This script is intended for use with the Battenberg-1.2 module in LCR-modules. +# It expects to find the genome build at the input path, following +# the pattern reference_correction.py {genome_build}. These files should be in the +# 00-inputs subdirectory of the battenberg-1.2 directory present in the results directory. +# +# The file is made to be present in the src sub directory of the module. +# +# The sample table should adhere to LCR-modules guidelines.
+ +import os +import sys + +cwd = sys.argv[2] + +fileIN = open( + cwd + + "/impute_info.txt", + "r", +) +filedata = fileIN.read() +fileIN.close() + +newdata = filedata.replace( + "", + cwd + + "/battenberg_impute_v3", +) + +fileOut = open( + cwd + + "/impute_info.txt", + "w", +) +fileOut.write(newdata) +fileOut.close() + diff --git a/modules/battenberg/CHANGELOG.md b/modules/battenberg/CHANGELOG.md index b2797d359..d42c99386 100644 --- a/modules/battenberg/CHANGELOG.md +++ b/modules/battenberg/CHANGELOG.md @@ -6,6 +6,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.2] - 2021-04-23 + +This release was authored by Lakshay Sethi. +This overhaul began with the addition of an automatic way to download reference files, as a way to handle the grievances +from version 1.0. To achieve this I created a new rule, _battenberg_get_reference, through which I downloaded the files from the GSC web portal and used a script called reference_correction.py to automatically replace placeholders with the correct paths. After this I automated the way chr prefixes are used, using a regex statement to read the prefix directly from the genome.fa file of the respective genome_build. To increase the scalability of Battenberg, the ability to use different genomes was added by making a VERSION_MAP dictionary. Then, to make the log files more informative, I shifted the output generated by rule _install_battenberg from the terminal to a log file called input.log. +As a result of both reference file downloading and chr prefix reading becoming automatic, the variables reference_path and chr_prefixed_reference were removed from the config file. + ## [1.1] - 2020-12-22 This release was authored by Ryan Morin.
diff --git a/modules/bwa_mem/1.1/bwa_mem.smk b/modules/bwa_mem/1.1/bwa_mem.smk index beba1e7f5..5d17a32a9 100644 --- a/modules/bwa_mem/1.1/bwa_mem.smk +++ b/modules/bwa_mem/1.1/bwa_mem.smk @@ -16,6 +16,26 @@ import os # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. 
Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["bwa_mem"]` CFG = op.setup_module( @@ -48,8 +68,8 @@ rule _bwa_mem_input_fastq: fastq_1 = CFG["dirs"]["inputs"] + "fastq/{seq_type}--{genome_build}/{sample_id}.R1.fastq.gz", fastq_2 = CFG["dirs"]["inputs"] + "fastq/{seq_type}--{genome_build}/{sample_id}.R2.fastq.gz", run: - op.relative_symlink(input.fastq_1, output.fastq_1) - op.relative_symlink(input.fastq_2, output.fastq_2) + op.absolute_symlink(input.fastq_1, output.fastq_1) + op.absolute_symlink(input.fastq_2, output.fastq_2) rule _bwa_mem_run: @@ -118,7 +138,7 @@ rule _bwa_mem_symlink_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) + op.absolute_symlink(input.bam, output.bam) rule _bwa_mem_symlink_sorted_bam: @@ -130,7 +150,7 @@ rule _bwa_mem_symlink_sorted_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) + op.absolute_symlink(input.bam, output.bam) os.remove(input.bwa_mem_bam) shell("touch {input.bwa_mem_bam}.deleted") @@ -146,8 +166,8 @@ rule _bwa_mem_output_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) - op.relative_symlink(input.bai, output.bam + ".bai") + op.relative_symlink(input.bam, output.bam, in_module=True) + op.relative_symlink(input.bai, output.bam + ".bai", in_module=True) os.remove(input.sorted_bam) shell("touch {input.sorted_bam}.deleted") diff --git a/modules/bwa_mem/1.1/bwa_mem_grouped.smk b/modules/bwa_mem/1.1/bwa_mem_grouped.smk index 22bc7229f..05afba605 100644 --- a/modules/bwa_mem/1.1/bwa_mem_grouped.smk +++ 
b/modules/bwa_mem/1.1/bwa_mem_grouped.smk @@ -16,6 +16,26 @@ import os # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["bwa_mem"]` CFG = op.setup_module( @@ -50,8 +70,8 @@ rule _bwa_mem_input_fastq: group: CFG["group"]['bwa-mem'] run: - op.relative_symlink(input.fastq_1, output.fastq_1) - op.relative_symlink(input.fastq_2, output.fastq_2) + op.absolute_symlink(input.fastq_1, output.fastq_1) + op.absolute_symlink(input.fastq_2, output.fastq_2) rule _bwa_mem_run: @@ -121,7 +141,7 @@ rule _bwa_mem_symlink_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) + op.absolute_symlink(input.bam, output.bam) rule _bwa_mem_symlink_sorted_bam: @@ -133,7 +153,7 @@ rule 
_bwa_mem_symlink_sorted_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) + op.absolute_symlink(input.bam, output.bam) os.remove(input.bwa_mem_bam) shell("touch {input.bwa_mem_bam}.deleted") @@ -149,8 +169,8 @@ rule _bwa_mem_output_bam: wildcard_constraints: sample_id = "|".join(sample_ids_bwa_mem) run: - op.relative_symlink(input.bam, output.bam) - op.relative_symlink(input.bai, output.bam + ".bai") + op.relative_symlink(input.bam, output.bam, in_module=True) + op.relative_symlink(input.bai, output.bam + ".bai", in_module=True) os.remove(input.sorted_bam) shell("touch {input.sorted_bam}.deleted") diff --git a/modules/controlfreec/1.2/config/default.yaml b/modules/controlfreec/1.2/config/default.yaml index bf32925ee..79fbc09d2 100755 --- a/modules/controlfreec/1.2/config/default.yaml +++ b/modules/controlfreec/1.2/config/default.yaml @@ -28,18 +28,19 @@ lcr-modules: # 3: make a separate fragment of the unknown region and attach to left/right, choosing the longer one, BUT known region should make at least half size of the unknown region # 4: make a separate fragment of the unknown region and do not assign any copy number to this region at all coefficientOfVariation: 0.062 # coefficient used to evaluate window size - the lower, the more windows - contaminationAdjustment: TRUE # if "contamination" value is not provided, it will automaticaly evaluate - degree: '3\&4' # degree of polynomial - 3&4 for WGS (GC-based normalization); 1 for WES (control-read-count-based normalization) + contaminationAdjustment: TRUE # if "contamination" value is not provided, it will automaticaly evaluate. For bugs where contamination detection is stalled, just set contaminationAdjustment to FALSE. + degree: '3\&4' # degree of polynomial - 3&4 for WGS (GC-based normalization); 1 for WES (control-read-count-based normalization). You can comment out degree to let control-freec choose. 
forceGCcontentNormalization: 1 #0 for WGS; 1 for WES # 0 forces control-base normalization, 1 forces GC intercept: 1 # 0 for control-based (paired) ; 1 for GC-content (unpaired) minCNAlength: 8 # minimum number of consecutive windows to call a CNA #default 1 for WGS; 3 for WES - minMappabilityPerWindow: 0.9 # minimum fraction of mappable positions for a window to be considered + minMappabilityPerWindow: 0.3 # minimum fraction of mappable positions for a window to be considered # set this lower if you want to also use a hard-masked mappability file minimalSubclonePresence: 20 # detects subclones present in x% of cell population - 20 for WGS; 30 for WES (100 means "do not look for subclones") noisyData: TRUE #set TRUE for exomes/FFPE libs to avoid false positives due to non-uniform capture + readCountThreshold: 10 # threshold on the minimal number of reads per window (used for exome-seq or targeted sequencing) (recommended 50 for WES) ploidy: 2 #will select the ploidy that explains the most CNAs (a range can be added and control-freec will assign ploidy based on best fit, ex. 
2,3,4) printNA: FALSE telocentromeric: 50000 # size of pre-telomeric and pre-centromeric regions to exclude - uniqueMatch: TRUE # uses mappability profile to correct read counts + uniqueMatch: FALSE # uses mappability profile to correct read counts #optional options: (uncomment these options in config_WGS.txt to implement them) #if implemented, contamination will overrule contaminationAdjustment @@ -54,21 +55,36 @@ lcr-modules: minQualityPerPosition: 20 # for BAF: minimum base quality shiftInQuality: 0 # basis for Phred quality + #GEM options: (for generating hard-masked mappability files) + hard_masked: True # set True if using a hard-masked mappability file + kmer: 100 # kmer size + mismatch: 2 # maximum number of mismatches allowed + maxBigIndel: 5 # The GEM mapper implements a special algorithm that, in addition to ordinary matches, is sometimes able to find a single long indel - this is the max size + maxEditDistance: 0 # maximum number of edit operations allowed while verifying candidate matches by dynamic programming (can be a float 0-1, which represents differences of size n% of length, or a non-negative integer, which is a fixed number of edits) + strata: 0 # a stratum is a set of matches all having the same string distance from the query, GEM mapper will try to find n amount of matches to explore + software: - FREEC_sig: "{MODSDIR}/etc/scripts/assess_significance.R" - FREEC_graph: "{MODSDIR}/etc/scripts/makeGraph.R" - FREEC_graph_chr: "{MODSDIR}/etc/scripts/makeGraph_Chromosome.R" - freec2bed: "{MODSDIR}/etc/scripts/freec2bed.pl" + FREEC_sig: "{MODSDIR}/src/assess_significance.R" + FREEC_graph: "{MODSDIR}/src/makeGraph.R" + FREEC_graph_chr: "{MODSDIR}/src/makeGraph_Chromosome.R" + freec2bed: "{MODSDIR}/src/freec2bed.pl" + freec2circos: "{MODSDIR}/src/freec2circos.pl" + cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.4/cnv2igv.py" threads: + gem: 24 controlfreec_run: 24 calc_sig: 1 plot: 1 freec2bed: 1 + freec2circos: 1 + cnv2igv: 1 resources: + gem: + mem_mb: 16000 
mpileup: mem_mb: 8000 cat: @@ -83,6 +99,10 @@ lcr-modules: mem_mb: 1000 freec2bed: mem_mb: 1000 + freec2circos: + mem_mb: 1000 + cnv2igv: + mem_mb: 1000 pairing_config: diff --git a/modules/controlfreec/1.2/config/freec/config_WGS.txt b/modules/controlfreec/1.2/config/freec/config_WGS.txt index 44f46c954..1837430a7 100644 --- a/modules/controlfreec/1.2/config/freec/config_WGS.txt +++ b/modules/controlfreec/1.2/config/freec/config_WGS.txt @@ -26,6 +26,7 @@ minimalSubclonePresence = minimumSubclonePresenceValue noisyData = booNoise printNA = naBoo ploidy = ploidyInput +readCountThreshold = rcCountThresold #step = stepValue telocentromeric = teloValue uniqueMatch = uniqBoo @@ -48,7 +49,6 @@ mateOrientation = FR [BAF] -fastaFile = fastaPath shiftInQuality = phredQuality SNPfile = DBsnpFile minimalCoveragePerPosition = minCovPerPos diff --git a/modules/controlfreec/1.2/controlfreec.smk b/modules/controlfreec/1.2/controlfreec.smk index af1da74b0..bb7e84ce6 100755 --- a/modules/controlfreec/1.2/controlfreec.smk +++ b/modules/controlfreec/1.2/controlfreec.smk @@ -15,6 +15,26 @@ # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. 
Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["controlfreec"]` CFG = op.setup_module( @@ -35,45 +55,136 @@ localrules: ##### RULES ##### +#### Rules for mappability reference +# to generate and use hard-masked mappability (i.e. 
recommended for FFPE genomes) if CFG["options"]["hard_masked"] == True +# to use the default genome's mappability file (downloaded from their website), set it CFG["options"]["hard_masked"] == False +if CFG["options"]["hard_masked"] == True: + CFG["runs"]["masked"] = "_masked" +else: + CFG["runs"]["masked"] = "" + +wildcard_constraints: + masked = ".{0}|_masked", + genome_build = ".+(?\S+).+/$1/;print;' > {output.reference} " + +def get_genome_fasta(wildcards): + CFG = config["lcr-modules"]["controlfreec"] + if "grch" in str({wildcards.genome_build}): + return CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/genome_header.fa" + else: + return reference_files("genomes/{genome_build}{masked}/genome_fasta/genome.fa") + +if CFG["options"]["hard_masked"] == True: + rule _generate_gem_index: + input: + software = CFG["dirs"]["inputs"] + "references/GEM/.done", + reference = get_genome_fasta + output: + index = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all_index.gem" + params: + gemDir = CFG["dirs"]["inputs"] + "references/GEM/GEM-binaries-Linux-x86_64-core_i3-20130406-045632/bin", + idxpref = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all_index" + threads: CFG["threads"]["gem"] + resources: **CFG["resources"]["gem"] + log: CFG["logs"]["inputs"] + "gem/{genome_build}{masked}/gem_index.stderr.log" + shell: + "PATH=$PATH:{params.gemDir}; {params.gemDir}/gem-indexer -T {threads} -c dna -i {input.reference} -o {params.idxpref} > {log} 2>&1 " + +if CFG["options"]["hard_masked"] == True: + rule _generate_mappability: + input: + software = CFG["dirs"]["inputs"] + "references/GEM/.done", + index = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all_index.gem" + output: + mappability = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all.gem.mappability" + params: + gemDir = 
CFG["dirs"]["inputs"] + "references/GEM/GEM-binaries-Linux-x86_64-core_i3-20130406-045632/bin", + pref = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all.gem", + kmer = CFG["options"]["kmer"], + mismatch = CFG["options"]["mismatch"], + maxEditDistance = CFG["options"]["maxEditDistance"], + maxBigIndel = CFG["options"]["maxBigIndel"], + strata = CFG["options"]["strata"] + threads: CFG["threads"]["gem"] + resources: **CFG["resources"]["gem"] + log: CFG["logs"]["inputs"] + "gem/{genome_build}{masked}/gem_map.stderr.log" + shell: + "PATH=$PATH:{params.gemDir}; {params.gemDir}/gem-mappability -T {threads} -I {input.index} -l {params.kmer} -m {params.mismatch} -t disable --mismatch-alphabet ACGNT -e {params.maxEditDistance} --max-big-indel-length {params.maxBigIndel} -s {params.strata} -o {params.pref} > {log} 2>&1 " + +if CFG["options"]["hard_masked"] == True: + rule _symlink_map: + input: + mappability = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.hardmask.all.gem.mappability" + output: + mappability = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/out100m2_{genome_build}.gem" + resources: **CFG["resources"]["gem"] + shell: + "ln -srf {input.mappability} {output.mappability} " + +#### Rule for setting chromosome names (chr-prefix or not) # no chr for grch37 and grch38 # chr for hg19 and hg38 -# Symlink chromosomes used (i.e. chr1-22,X,Y) -checkpoint _controlfreec_input_chrs: - input: - chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes_withY.txt") - output: - chrs = CFG["dirs"]["inputs"] + "references/{genome_build}/main_chromosomes_withY.txt" - run: - op.relative_symlink(input.chrs, output.chrs) +# chromosomes used (i.e. 
chr1-22,X,Y) def _controlfreec_get_chr_fastas(wildcards): CFG = config["lcr-modules"]["controlfreec"] - chrs = checkpoints._controlfreec_input_chrs.get(**wildcards).output.chrs + chrs = reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes_withY.txt") with open(chrs) as file: chromosome = file.read().rstrip("\n").split("\n") fastas = expand( @@ -86,9 +197,11 @@ def _controlfreec_get_chr_fastas(wildcards): #generates file with chromomsome lengths from genome.fa.fai rule _controlfreec_generate_chrLen: input: - fai = reference_files("genomes/{genome_build}/genome_fasta/genome.fa.fai") + fai = reference_files("genomes/{genome_build}{masked}/genome_fasta/genome.fa.fai"), + main = reference_files("genomes/{genome_build}{masked}/genome_fasta/main_chromosomes_withY.txt") output: - chrLen = CFG["dirs"]["inputs"] + "references/{genome_build}/freec/{genome_build}.len" + chrLen = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.len" + resources: **CFG["resources"]["gem"] shell: op.as_one_line(""" grep -P '^chr[0-9,X,Y]+\t|^[0-9,X,Y]' {input.fai} | awk '{{print $1"\t"$2}}' > {output.chrLen} @@ -102,6 +215,7 @@ rule _controlfreec_generate_chrFasta: fasta = CFG["dirs"]["inputs"] + "references/{genome_build}/freec/chr/{chromosome}.fa" conda: CFG["conda_envs"]["controlfreec"] + resources: **CFG["resources"]["gem"] shell: "samtools faidx {input.fasta} {wildcards.chromosome} > {output.fasta} " @@ -118,6 +232,7 @@ rule _controlfreec_dbsnp_to_bed: vcf = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz") output: bed = CFG["dirs"]["inputs"] + "references/{genome_build}/freec/dbsnp.common_all-151.bed" + resources: **CFG["resources"]["gem"] shell: op.as_one_line(""" gunzip -c {input.vcf} | awk {{'printf ("%s\\t%s\\t%s\\n", $1,$2-1,$2)'}} | zgrep -v -h "^#" > {output.bed} """) @@ -129,16 +244,18 @@ rule _controlfreec_input_bam: bai = CFG["inputs"]["sample_bai"] output: bam = CFG["dirs"]["inputs"] + 
"{seq_type}--{genome_build}/{sample_id}.bam", - bai = CFG["dirs"]["inputs"] + "{seq_type}--{genome_build}/{sample_id}.bai" + bai = CFG["dirs"]["inputs"] + "{seq_type}--{genome_build}/{sample_id}.bai", + crai = CFG["dirs"]["inputs"] + "{seq_type}--{genome_build}/{sample_id}.crai" run: - op.relative_symlink(input.bam, output.bam) - op.relative_symlink(input.bai, output.bai) + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) + op.absolute_symlink(input.bai, output.crai) #### set-up mpileups for BAF calling #### def _controlfreec_get_chr_mpileups(wildcards): CFG = config["lcr-modules"]["controlfreec"] - chrs = checkpoints._controlfreec_input_chrs.get(**wildcards).output.chrs + chrs = reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes_withY.txt") with open(chrs) as file: chrs = file.read().rstrip("\n").split("\n") mpileups = expand( @@ -165,17 +282,16 @@ rule _controlfreec_mpileup_per_chrom: shell: "samtools mpileup -l {input.bed} -r {wildcards.chrom} -Q 20 -f {input.fastaFile} {input.bam} | gzip -c > {output.pileup} 2> {log.stderr}" - rule _controlfreec_concatenate_pileups: input: - _controlfreec_get_chr_mpileups + mpileup = _controlfreec_get_chr_mpileups output: mpileup = temp(CFG["dirs"]["mpileup"] + "{seq_type}--{genome_build}/{sample_id}.bam_minipileup.pileup.gz") resources: **CFG["resources"]["cat"] group: "controlfreec" shell: - "cat {input} > {output.mpileup} " + "cat {input.mpileup} > {output.mpileup} " #### Run control-FREEC #### @@ -185,19 +301,18 @@ rule _controlfreec_config: input: tumour_bam = CFG["dirs"]["mpileup"] + "{seq_type}--{genome_build}/{tumour_id}.bam_minipileup.pileup.gz", normal_bam = CFG["dirs"]["mpileup"] + "{seq_type}--{genome_build}/{normal_id}.bam_minipileup.pileup.gz", - fastaFile = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), - reference = CFG["dirs"]["inputs"] + "references/{genome_build}/freec/out100m2_{genome_build}.gem", - chrLen = 
CFG["dirs"]["inputs"] + "references/{genome_build}/freec/{genome_build}.len", + reference = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/out100m2_{genome_build}.gem", + chrLen = CFG["dirs"]["inputs"] + "references/{genome_build}{masked}/freec/{genome_build}.len", done = CFG["dirs"]["inputs"] + "references/{genome_build}/freec/chr/.all_done" output: - CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/config_WGS.txt" + CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/config_WGS.txt" conda: CFG["conda_envs"]["controlfreec"] params: config = CFG["options"]["configFile"], dbSNP = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), shiftInQuality = CFG["options"]["shiftInQuality"], - outdir = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/", + outdir = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/", window = CFG["options"]["window"], ploidy = CFG["options"]["ploidy"], breakPointValue = CFG["options"]["breakPointThreshold"], @@ -217,6 +332,7 @@ rule _controlfreec_config: minimumSubclonePresence = CFG["options"]["minimalSubclonePresence"], naBoo = CFG["options"]["printNA"], noisyData = CFG["options"]["noisyData"], + readCountThreshold = CFG["options"]["readCountThreshold"], step = CFG["options"]["step"], telocentromeric = CFG["options"]["telocentromeric"], threads = CFG["threads"]["controlfreec_run"], @@ -231,7 +347,6 @@ rule _controlfreec_config: "sed \"s|BAMFILE|{input.tumour_bam}|g\" {params.config} | " "sed \"s|CONTROLFILE|{input.normal_bam}|g\" | " "sed \"s|OUTDIR|{params.outdir}|g\" | " - "sed \"s|fastaPath|{input.fastaFile}|g\" | " "sed \"s|DBsnpFile|{params.dbSNP}|g\" | " "sed \"s|phredQuality|{params.shiftInQuality}|g\" | " "sed \"s|windowSize|{params.window}|g\" | " @@ -257,6 +372,7 @@ rule _controlfreec_config: "sed 
\"s|minQualPerPos|{params.minimalQualityPerPosition}|g\" | " "sed \"s|booNoise|{params.noisyData}|g\" | " "sed \"s|stepValue|{params.step}|g\" | " + "sed \"s|rcCountThresold|{params.readCountThreshold}|g\" | " "sed \"s|teloValue|{params.telocentromeric}|g\" | " "sed \"s|uniqBoo|{params.uniqBoo}|g\" | " "sed \"s|naBoo|{params.naBoo}|g\" | " @@ -266,81 +382,116 @@ rule _controlfreec_config: rule _controlfreec_run: input: - config = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/config_WGS.txt", + config = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/config_WGS.txt", tumour_bam = CFG["dirs"]["mpileup"] + "{seq_type}--{genome_build}/{tumour_id}.bam_minipileup.pileup.gz", normal_bam = CFG["dirs"]["mpileup"] + "{seq_type}--{genome_build}/{normal_id}.bam_minipileup.pileup.gz", output: - info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt", - ratio = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", - CNV = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs", - BAF = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt" + info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt", + ratio = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", + CNV = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs", + BAF = CFG["dirs"]["run"] + 
"{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt" conda: CFG["conda_envs"]["controlfreec"] threads: CFG["threads"]["controlfreec_run"] resources: **CFG["resources"]["controlfreec_run"] log: - stdout = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/run.stdout.log", - stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/run.stderr.log" + stdout = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/run.stdout.log", + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/run.stderr.log" shell: "freec -conf {input.config} > {log.stdout} 2> {log.stderr} " rule _controlfreec_calc_sig: input: - CNVs = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs", - ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", + CNVs = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs", + ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", output: - txt = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs.p.value.txt" + txt = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs.p.value.txt" params: calc_sig = CFG["software"]["FREEC_sig"] threads: CFG["threads"]["calc_sig"] resources: **CFG["resources"]["calc_sig"] conda: CFG["conda_envs"]["controlfreec"] log: - stdout = CFG["logs"]["run"] + 
"{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/calc_sig.stdout.log", - stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/calc_sig.stderr.log" + stdout = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/calc_sig.stdout.log", + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/calc_sig.stderr.log" shell: "cat {params.calc_sig} | R --slave --args {input.CNVs} {input.ratios} > {log.stdout} 2> {log.stderr}" rule _controlfreec_plot: input: - ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", - BAF = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt", - info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt" + ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", + BAF = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt", + info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt" output: - plot = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt.png", - log2plot = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt.log2.png", - bafplot = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt.png" + plot = 
CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt.png", + log2plot = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt.log2.png", + bafplot = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_BAF.txt.png" params: plot = CFG["software"]["FREEC_graph"] threads: CFG["threads"]["plot"] resources: **CFG["resources"]["plot"] conda: CFG["conda_envs"]["controlfreec"] log: - stdout = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/plot.stdout.log", - stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/plot.stderr.log" + stdout = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/plot.stdout.log", + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/plot.stderr.log" shell: "cat {params.plot} | R --slave --args `grep \"Output_Ploidy\" {input.info} | cut -f 2` {input.ratios} {input.BAF} > {log.stdout} 2> {log.stderr} " rule _controlfreec_freec2bed: input: - ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", - info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt" + ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", + info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt" output: - bed = CFG["dirs"]["run"] + 
"{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bed" + bed = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bed" params: freec2bed = CFG["software"]["freec2bed"] threads: CFG["threads"]["freec2bed"] resources: **CFG["resources"]["freec2bed"] conda: CFG["conda_envs"]["controlfreec"] log: - stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/freec2bed.stderr.log" + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/freec2bed.stderr.log" shell: "ploidy=$(grep Output_Ploidy {input.info} | cut -f 2); " "perl {params.freec2bed} -f {input.ratios} -p $ploidy > {output.bed} 2> {log.stderr}" +rule _controlfreec_freec2circos: + input: + ratios = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_ratio.txt", + info = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_info.txt" + output: + circos = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.circos.bed" + params: + freec2circos = CFG["software"]["freec2circos"] + threads: CFG["threads"]["freec2circos"] + resources: **CFG["resources"]["freec2circos"] + conda: CFG["conda_envs"]["controlfreec"] + log: + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/freec2circos.stderr.log" + shell: + "ploidy=$(grep Output_Ploidy {input.info} | cut -f 2); " + "perl {params.freec2circos} -f {input.ratios} -p $ploidy > {output.circos} 2> {log.stderr}" + + +rule _controlfreec_cnv2igv: + input: + cnv = CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.bam_minipileup.pileup.gz_CNVs.p.value.txt" + output: + seg = 
CFG["dirs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.CNVs.seg" + params: + tumour_id = "{tumour_id}", + cnv2igv = CFG["software"]["cnv2igv"] + threads: CFG["threads"]["cnv2igv"] + resources: **CFG["resources"]["cnv2igv"] + conda: CFG["conda_envs"]["controlfreec"] + log: + stderr = CFG["logs"]["run"] + "{seq_type}--{genome_build}{masked}/{tumour_id}--{normal_id}--{pair_status}/cnv2igv.stderr.log" + shell: + "python3 {params.cnv2igv} --mode controlfreec --sample {params.tumour_id} {input.cnv} > {output.seg} 2> {log.stderr} " + + # Symlinks the final output files into the module results directory (under '99-outputs/') rule _controlfreec_output: input: @@ -350,23 +501,29 @@ rule _controlfreec_output: bed = str(rules._controlfreec_freec2bed.output.bed), BAF = str(rules._controlfreec_run.output.BAF), BAFgraph = str(rules._controlfreec_plot.output.bafplot), - ratio = str(rules._controlfreec_run.output.ratio) + ratio = str(rules._controlfreec_run.output.ratio), + circos = str(rules._controlfreec_freec2circos.output.circos), + igv = str(rules._controlfreec_cnv2igv.output.seg) output: - plot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/plots/{tumour_id}--{normal_id}--{pair_status}.ratio.png", - log2plot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/log2plots/{tumour_id}--{normal_id}--{pair_status}.ratio.log2.png", - CNV = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/CNV/{tumour_id}--{normal_id}--{pair_status}.CNVs.p.value.txt", - bed = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/bed/{tumour_id}--{normal_id}--{pair_status}.CNVs.bed", - BAF = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/BAF/{tumour_id}--{normal_id}--{pair_status}.BAF.txt", - BAFgraph = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/BAFplot/{tumour_id}--{normal_id}--{pair_status}.BAF.png", - ratio = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/ratio/{tumour_id}--{normal_id}--{pair_status}.ratio.txt" 
+ plot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/plots/{tumour_id}--{normal_id}--{pair_status}.ratio.png", + log2plot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/log2plots/{tumour_id}--{normal_id}--{pair_status}.ratio.log2.png", + CNV = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/CNV/{tumour_id}--{normal_id}--{pair_status}.CNVs.p.value.txt", + bed = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/bed/{tumour_id}--{normal_id}--{pair_status}.CNVs.bed", + BAF = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/BAF/{tumour_id}--{normal_id}--{pair_status}.BAF.txt", + BAFgraph = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/BAFplot/{tumour_id}--{normal_id}--{pair_status}.BAF.png", + ratio = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/ratio/{tumour_id}--{normal_id}--{pair_status}.ratio.txt", + circos = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/circos/{tumour_id}--{normal_id}--{pair_status}.circos.bed", + igv = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}{masked}/igv/{tumour_id}--{normal_id}--{pair_status}.igv.seg" run: - op.relative_symlink(input.plot, output.plot) - op.relative_symlink(input.log2plot, output.log2plot) - op.relative_symlink(input.CNV, output.CNV) - op.relative_symlink(input.bed, output.bed) - op.relative_symlink(input.BAF, output.BAF) - op.relative_symlink(input.BAFgraph, output.BAFgraph) - op.relative_symlink(input.ratio, output.ratio) + op.relative_symlink(input.plot, output.plot, in_module = True) + op.relative_symlink(input.log2plot, output.log2plot, in_module = True) + op.relative_symlink(input.CNV, output.CNV, in_module = True) + op.relative_symlink(input.bed, output.bed, in_module = True) + op.relative_symlink(input.BAF, output.BAF, in_module = True) + op.relative_symlink(input.BAFgraph, output.BAFgraph, in_module = True) + op.relative_symlink(input.ratio, output.ratio, in_module = True) + 
op.relative_symlink(input.circos, output.circos, in_module = True) + op.relative_symlink(input.igv, output.igv, in_module = True) # Generates the target sentinels for each run, which generate the symlinks @@ -380,14 +537,17 @@ rule _controlfreec_all: str(rules._controlfreec_output.output.bed), str(rules._controlfreec_output.output.BAF), str(rules._controlfreec_output.output.BAFgraph), - str(rules._controlfreec_output.output.ratio) + str(rules._controlfreec_output.output.ratio), + str(rules._controlfreec_output.output.circos), + str(rules._controlfreec_output.output.igv) ], zip, # Run expand() with zip(), not product() seq_type=CFG["runs"]["tumour_seq_type"], genome_build=CFG["runs"]["tumour_genome_build"], pair_status=CFG["runs"]["pair_status"], tumour_id=CFG["runs"]["tumour_sample_id"], - normal_id=CFG["runs"]["normal_sample_id"]) + normal_id=CFG["runs"]["normal_sample_id"], + masked=CFG["runs"]["masked"]) diff --git a/modules/controlfreec/1.2/etc/scripts/_makeGraph_Chromosome.R b/modules/controlfreec/1.2/src/_makeGraph_Chromosome.R similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/_makeGraph_Chromosome.R rename to modules/controlfreec/1.2/src/_makeGraph_Chromosome.R diff --git a/modules/controlfreec/1.2/etc/scripts/assess_significance.R b/modules/controlfreec/1.2/src/assess_significance.R similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/assess_significance.R rename to modules/controlfreec/1.2/src/assess_significance.R diff --git a/modules/controlfreec/1.2/etc/scripts/freec2bed.pl b/modules/controlfreec/1.2/src/freec2bed.pl similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/freec2bed.pl rename to modules/controlfreec/1.2/src/freec2bed.pl diff --git a/modules/controlfreec/1.2/etc/scripts/freec2circos.pl b/modules/controlfreec/1.2/src/freec2circos.pl similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/freec2circos.pl rename to modules/controlfreec/1.2/src/freec2circos.pl diff --git 
a/modules/controlfreec/1.2/etc/scripts/get_fasta_lengths.pl b/modules/controlfreec/1.2/src/get_fasta_lengths.pl similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/get_fasta_lengths.pl rename to modules/controlfreec/1.2/src/get_fasta_lengths.pl diff --git a/modules/controlfreec/1.2/etc/scripts/makeGraph.R b/modules/controlfreec/1.2/src/makeGraph.R similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/makeGraph.R rename to modules/controlfreec/1.2/src/makeGraph.R diff --git a/modules/controlfreec/1.2/etc/scripts/makeGraph_Chromosome.R b/modules/controlfreec/1.2/src/makeGraph_Chromosome.R similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/makeGraph_Chromosome.R rename to modules/controlfreec/1.2/src/makeGraph_Chromosome.R diff --git a/modules/controlfreec/1.2/etc/scripts/vcf2snpFreec.pl b/modules/controlfreec/1.2/src/vcf2snpFreec.pl similarity index 100% rename from modules/controlfreec/1.2/etc/scripts/vcf2snpFreec.pl rename to modules/controlfreec/1.2/src/vcf2snpFreec.pl diff --git a/modules/controlfreec/CHANGELOG.md b/modules/controlfreec/CHANGELOG.md index e1b646c2e..d20964b8f 100755 --- a/modules/controlfreec/CHANGELOG.md +++ b/modules/controlfreec/CHANGELOG.md @@ -58,4 +58,9 @@ Notably, in paired mode, with BAF mode on, FREEC normalizes with GC-content, and This implementation has been tested on unmatched samples too using a high coverage, normal FFPE sample, and it has shown to display clean profiles in these cases too. -Note: this version is not meant for capture/exome data. \ No newline at end of file +Note: this version is not meant for capture/exome data. + +## [1.2] patch 2021-02-25 +Added GEM mappability features - can now use/generate a hard-masked mappability file (useful for FFPE genomes) with the setting "hard_masked" = True. If this is set, GEM will be installed and run on your reference genome of choice. + +Also added freec2circos function. 
\ No newline at end of file diff --git a/modules/gridss/1.1/gridss.smk b/modules/gridss/1.1/gridss.smk index 51483f749..b4935ce10 100644 --- a/modules/gridss/1.1/gridss.smk +++ b/modules/gridss/1.1/gridss.smk @@ -15,6 +15,26 @@ # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. 
Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["gridss"]` @@ -24,13 +44,13 @@ CFG = op.setup_module( subdirectories = ["inputs", "preprocess", "gridss", "viral_annotation", "gripss", "outputs"], ) -VERSION_MAP = { +GRIDSS_VERSION_MAP = { "grch37": "hg19", "hs37d5": "hg19", "hg38": "hg38" } -possible_genome_builds = VERSION_MAP.keys() +possible_genome_builds = GRIDSS_VERSION_MAP.keys() for genome_build in CFG["runs"]["tumour_genome_build"]: assert genome_build in possible_genome_builds, ( "Samples table includes genome builds not yet compatible with this module. " @@ -85,7 +105,7 @@ rule _gridss_get_pon: pon_breakend = CFG["dirs"]["inputs"] + "references/{genome_build}/pon/gridss_pon_single_breakend.bed", known_pairs = CFG["dirs"]["inputs"] + "references/{genome_build}/pon/KnownFusionPairs.bedpe" params: - alt_build = lambda w: VERSION_MAP[w.genome_build], + alt_build = lambda w: GRIDSS_VERSION_MAP[w.genome_build], url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/gridss/pon" shell: op.as_one_line(""" @@ -162,8 +182,8 @@ rule _gridss_input_bam: sample_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", sample_bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai" run: - op.relative_symlink(input.sample_bam, output.sample_bam) - op.relative_symlink(input.sample_bai, output.sample_bai) + op.absolute_symlink(input.sample_bam, output.sample_bam) + op.absolute_symlink(input.sample_bai, output.sample_bai) # Preprocess unmatched normal bams rule _gridss_preprocess_unmatched_normal: @@ -206,12 +226,12 @@ rule _gridss_symlink_preprocessed_normal: input: workdir = 
str(rules._gridss_preprocess_unmatched_normal.output.workdir) output: - workdir = temp(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{sample_id}.bam.gridss.working") + workdir = temp(directory(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{sample_id}.bam.gridss.working")) priority: 0 wildcard_constraints: sample_id = "|".join(unmatched_normal_ids) run: - op.relative_symlink(input.workdir, output.workdir) + op.absolute_symlink(input.workdir, output.workdir) # Preprocess all other bams as part of the group job rule _gridss_preprocess: @@ -488,9 +508,9 @@ rule _gridss_output_viral_vcf: tbi = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_viral_annotation_filtered.vcf.gz.tbi", bedpe = CFG["dirs"]["outputs"] + "bedpe/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_viral_annotation_filtered.bedpe" run: - op.relative_symlink(input.vcf, output.vcf) - op.relative_symlink(input.tbi, output.tbi) - op.relative_symlink(input.bedpe, output.bedpe) + op.relative_symlink(input.vcf, output.vcf, in_module=True) + op.relative_symlink(input.tbi, output.tbi, in_module=True) + op.relative_symlink(input.bedpe, output.bedpe, in_module=True) rule _gridss_output_somatic_vcf: input: @@ -506,11 +526,11 @@ rule _gridss_output_somatic_vcf: filtered_tbi = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic_filtered.vcf.gz.tbi", bedpe = CFG["dirs"]["outputs"] + "bedpe/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic_filtered.bedpe" run: - op.relative_symlink(input.somatic, output.somatic) - op.relative_symlink(input.somatic_tbi, output.somatic_tbi) - op.relative_symlink(input.filtered, output.filtered) - op.relative_symlink(input.filtered_tbi, output.filtered_tbi) - op.relative_symlink(input.bedpe, output.bedpe) + 
op.relative_symlink(input.somatic, output.somatic, in_module=True) + op.relative_symlink(input.somatic_tbi, output.somatic_tbi, in_module=True) + op.relative_symlink(input.filtered, output.filtered, in_module=True) + op.relative_symlink(input.filtered_tbi, output.filtered_tbi, in_module=True) + op.relative_symlink(input.bedpe, output.bedpe, in_module=True) def _gridss_predict_output(wildcards): """Request symlinks for all VCF files. diff --git a/modules/gridss/2.0/config/default.yaml b/modules/gridss/2.0/config/default.yaml new file mode 100644 index 000000000..d1cf1624f --- /dev/null +++ b/modules/gridss/2.0/config/default.yaml @@ -0,0 +1,62 @@ +lcr-modules: + + gridss: + + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + sample_bam: "__UPDATE__" + sample_bai: "__UPDATE__" + + scratch_subdirectories: [] # Recommended: ["gridss", "preprocess"] + + options: + gridss: + --picardoptions VALIDATION_STRINGENCY=SILENT + filter_unpaired: + gripss: + # Hard filters remove variants from output VCF + # Soft filters add flags to output VCF + # These flags don't work with the current version of GRIPSS + # A fix is being prepared by the developers + -hard_max_normal_absolute_support 3 + -hard_max_normal_relative_support 0.06 + -soft_max_normal_relative_support 0.03 + + conda_envs: + wget: "{MODSDIR}/envs/wget-1.20.1.yaml" + gridss: "{MODSDIR}/envs/gridss-2.12.0.yaml" + gripss: "{MODSDIR}/envs/hmftools-gripss-1.11.yaml" + bcftools: "{MODSDIR}/envs/bcftools-1.10.2.yaml" + svtools: "{MODSDIR}/envs/svtools-0.5.1.yaml" + + threads: + preprocess: 8 + gridss: 24 + repeatmasker: 24 + filter_gridss: 1 + gripss: 1 # Not multi-threaded + split: 1 + + resources: + preprocess: + mem_mb: 37500 + preprocess: 1 + gridss: + mem_mb: 37500 # Recommended per GRIDSS manual + gridss: 1 + repeatmasker: + mem_mb: 100000 + gripss: + mem_mb: 20000 # May need to be increased for FFPE tumours + split: + mem_mb: 2000 + + pairing_config: + genome: + run_paired_tumours: True + 
run_unpaired_tumours_with: "unmatched_normal" + run_paired_tumours_as_unpaired: False + capture: + run_paired_tumours: True + run_unpaired_tumours_with: "unmatched_normal" + run_paired_tumours_as_unpaired: False diff --git a/modules/gridss/2.0/envs/bcftools-1.10.2.yaml b/modules/gridss/2.0/envs/bcftools-1.10.2.yaml new file mode 120000 index 000000000..72959e7bb --- /dev/null +++ b/modules/gridss/2.0/envs/bcftools-1.10.2.yaml @@ -0,0 +1 @@ +../../../../envs/bcftools/bcftools-1.10.2.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/gridss-2.12.0.yaml b/modules/gridss/2.0/envs/gridss-2.12.0.yaml new file mode 120000 index 000000000..d827cc395 --- /dev/null +++ b/modules/gridss/2.0/envs/gridss-2.12.0.yaml @@ -0,0 +1 @@ +../../../../envs/gridss/gridss-2.12.0.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/gridss-dependencies-2.9.4.yaml b/modules/gridss/2.0/envs/gridss-dependencies-2.9.4.yaml new file mode 120000 index 000000000..b7fb269d0 --- /dev/null +++ b/modules/gridss/2.0/envs/gridss-dependencies-2.9.4.yaml @@ -0,0 +1 @@ +../../../../envs/gridss/gridss-dependencies-2.9.4.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/hmftools-gripss-1.11.yaml b/modules/gridss/2.0/envs/hmftools-gripss-1.11.yaml new file mode 120000 index 000000000..6a1656b5d --- /dev/null +++ b/modules/gridss/2.0/envs/hmftools-gripss-1.11.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-gripss-1.11.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/hmftools-gripss-1.4.0.yaml b/modules/gridss/2.0/envs/hmftools-gripss-1.4.0.yaml new file mode 120000 index 000000000..ca91e8c3f --- /dev/null +++ b/modules/gridss/2.0/envs/hmftools-gripss-1.4.0.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-gripss-1.4.0.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/hmftools-gripss-1.8.yaml b/modules/gridss/2.0/envs/hmftools-gripss-1.8.yaml new file mode 120000 index 000000000..b0c2af4a4 --- /dev/null 
+++ b/modules/gridss/2.0/envs/hmftools-gripss-1.8.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-gripss-1.8.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/svtools-0.5.1.yaml b/modules/gridss/2.0/envs/svtools-0.5.1.yaml new file mode 120000 index 000000000..6dc2ec0ca --- /dev/null +++ b/modules/gridss/2.0/envs/svtools-0.5.1.yaml @@ -0,0 +1 @@ +../../../../envs/svtools/svtools-0.5.1.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/envs/wget-1.20.1.yaml b/modules/gridss/2.0/envs/wget-1.20.1.yaml new file mode 120000 index 000000000..86501e72a --- /dev/null +++ b/modules/gridss/2.0/envs/wget-1.20.1.yaml @@ -0,0 +1 @@ +../../../../envs/wget/wget-1.20.1.yaml \ No newline at end of file diff --git a/modules/gridss/2.0/gridss.smk b/modules/gridss/2.0/gridss.smk new file mode 100644 index 000000000..99cbdd275 --- /dev/null +++ b/modules/gridss/2.0/gridss.smk @@ -0,0 +1,508 @@ +#!/usr/bin/env snakemake + + +##### ATTRIBUTION ##### + + +# Original Author: Laura Hilton +# Module Author: Laura Hilton +# Contributors: N/A + + +##### SETUP ##### + + +# Import package with useful functions for developing analysis modules +import oncopipe as op + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. 
Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + + +# Setup module and store module-specific configuration in `CFG` +# `CFG` is a shortcut to `config["lcr-modules"]["gridss"]` +CFG = op.setup_module( + name = "gridss", + version = "2.0", + subdirectories = ["inputs", "preprocess", "gridss", "repeatmasker", "gripss", "outputs"], +) + +VERSION_MAP_GRIDSS = { + "grch37": "hg19", + "hs37d5": "hg19", + "hg38": "hg38" +} + +possible_genome_builds = VERSION_MAP_GRIDSS.keys() +for genome_build in CFG["runs"]["tumour_genome_build"]: + assert genome_build in possible_genome_builds, ( + "Samples table includes genome builds not yet compatible with this module. " + "This module is currently only compatible with {possible_genome_builds}. 
" + ) + +sample_ids = list(CFG['samples']['sample_id']) +unmatched_normal_ids = list(config["lcr-modules"]["_shared"]["unmatched_normal_ids"].values()) +all_other_ids = list(set(sample_ids) - set(unmatched_normal_ids)) + +# Define rules to be run locally when using a compute cluster +localrules: + _gridss_input_bam, + _gridss_input_references, + _gridss_setup_references, + _gridss_get_pon, + _gridss_symlink_preprocessed_normal, + _gridss_filter_gripss, + _gridss_gripss_to_bedpe, + _gridss_output_somatic_vcf, + _gridss_all + + + +##### RULES ##### + +# Symlink genome fasta with bwa and .fai indices to the same directory +rule _gridss_input_references: + input: + genome_fa = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), + genome_bwa_prefix = reference_files("genomes/{genome_build}/bwa_index/bwa-0.7.17/genome.fa"), + output: + genome_fa = CFG["dirs"]["inputs"] + "references/{genome_build}/genome_fa/genome.fa", + shell: + op.as_one_line(""" + ln -sf {input.genome_fa} {output.genome_fa} && + ln -sf {input.genome_fa}.fai {output.genome_fa}.fai && + ln -sf {input.genome_bwa_prefix}.* `dirname {output.genome_fa}` + """) + +# Download the panel of normals +rule _gridss_get_pon: + output: + pon_breakpoint = CFG["dirs"]["inputs"] + "references/{genome_build}/pon/gridss_pon_breakpoint.bedpe", + pon_breakend = CFG["dirs"]["inputs"] + "references/{genome_build}/pon/gridss_pon_single_breakend.bed", + known_pairs = CFG["dirs"]["inputs"] + "references/{genome_build}/pon/KnownFusionPairs.bedpe" + params: + alt_build = lambda w: VERSION_MAP_GRIDSS[w.genome_build], + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/gridss/pon" + shell: + op.as_one_line(""" + wget -O {output.pon_breakpoint} {params.url}/gridss_pon_breakpoint.{params.alt_build}.bedpe; + wget -O {output.pon_breakend} {params.url}/gridss_pon_single_breakend.{params.alt_build}.bed; + wget -O {output.known_pairs} {params.url}/KnownFusionPairs.{params.alt_build}.bedpe + """) + + +# Generate 
genome.fa.img file +rule _gridss_setup_references: + input: + fasta = str(rules._gridss_input_references.output.genome_fa), + output: + genome_img = CFG["dirs"]["inputs"] + "references/{genome_build}/genome_fa/genome.fa.img" + params: + steps = "setupreference" + conda: + CFG["conda_envs"]["gridss"] + resources: + mem_mb = 4000 + threads: 8 + shell: + op.as_one_line(""" + gridss + --reference {input.fasta} + --threads {threads} + --jvmheap 3G + --steps {params.steps} + --workingdir `dirname {output.genome_img}` + """) + + +# Symlink the input files into the module results directory (under '00-inputs/') +rule _gridss_input_bam: + input: + sample_bam = CFG["inputs"]["sample_bam"], + sample_bai = CFG["inputs"]["sample_bai"] + output: + sample_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + sample_bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai" + run: + op.absolute_symlink(input.sample_bam, output.sample_bam) + op.absolute_symlink(input.sample_bai, output.sample_bai) + +# Preprocess unmatched normal bams +rule _gridss_preprocess_unmatched_normal: + input: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + fasta = str(rules._gridss_input_references.output.genome_fa), + fasta_img = str(rules._gridss_setup_references.output.genome_img) + output: + workdir = directory(CFG["dirs"]["preprocess"] + "{seq_type}--{genome_build}/{sample_id}.bam.gridss.working") + log: CFG["logs"]["preprocess"] + "{seq_type}--{genome_build}/{sample_id}/preprocess.log" + params: + opts = CFG["options"]["gridss"], + steps = "preprocess", + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + conda: + CFG["conda_envs"]["gridss"] + threads: + CFG["threads"]["gridss"] + resources: + **CFG["resources"]["gridss"] + priority: 1 + wildcard_constraints: + sample_id="|".join(unmatched_normal_ids) + shell: + op.as_one_line(""" + gridss + --reference {input.fasta} + --workingdir $(dirname 
{output.workdir}) + --threads {threads} + --jvmheap {params.mem_mb}m + --steps {params.steps} + {params.opts} + {input.bam} + 2>&1 | tee -a {log} + """) + +# Symlink preprocessed sv.bam directories + +rule _gridss_symlink_preprocessed_normal: + input: + workdir = str(rules._gridss_preprocess_unmatched_normal.output.workdir) + output: + workdir = temp(directory(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{sample_id}.bam.gridss.working")) + priority: 0 + wildcard_constraints: + sample_id = "|".join(unmatched_normal_ids) + run: + op.absolute_symlink(input.workdir, output.workdir) + +# Preprocess all other bams as part of the group job +rule _gridss_preprocess: + input: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + fasta = str(rules._gridss_input_references.output.genome_fa), + fasta_img = str(rules._gridss_setup_references.output.genome_img) + output: + workdir = temp(directory(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{sample_id}.bam.gridss.working")) + log: CFG["logs"]["preprocess"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{sample_id}/preprocess.log" + params: + opts = CFG["options"]["gridss"], + steps = "preprocess", + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + conda: + CFG["conda_envs"]["gridss"] + threads: + CFG["threads"]["preprocess"] + resources: + **CFG["resources"]["preprocess"] + group: "enormous_bam" + wildcard_constraints: + sample_id = "|".join(all_other_ids) + shell: + op.as_one_line(""" + gridss + --reference {input.fasta} + --workingdir $(dirname {output.workdir}) + --threads {threads} + --jvmheap {params.mem_mb}m + --steps {params.steps} + {params.opts} + {input.bam} + 2>&1 | tee -a {log} + """) + +def get_input_per_patient(wildcards): + CFG = config['lcr-modules']['gridss'] + PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id) + if wildcards.pair_status in ["matched", 
"unmatched"]: + SAMPLES = PATIENT['normal_sample_id'].unique().tolist() + PATIENT['tumour_sample_id'].tolist() + bams = expand( + [ + str(rules._gridss_input_bam.output.sample_bam) + ], + zip, + sample_id = SAMPLES, + allow_missing = True + ) + preproc = expand( + [ + str(rules._gridss_preprocess.output.workdir) + ], + zip, + sample_id = SAMPLES, + allow_missing = True + ) + elif wildcards.pair_status == "no_normal": + bams = expand( + [ + str(rules._gridss_input_bam.output.sample_bam) + ], + zip, + sample_id = PATIENT["tumour_sample_id"], + allow_missing = True + ) + preproc = expand( + [ + str(rules._gridss_preprocess.output.workdir) + ], + zip, + sample_id = PATIENT["tumour_sample_id"], + allow_missing = True + ) + return {'bams': bams, 'preproc': preproc} + +def get_input_sample_ids(wildcards): + CFG = config['lcr-modules']['gridss'] + PATIENT = op.filter_samples(CFG["runs"], tumour_patient_id = wildcards.patient_id) + if wildcards.pair_status in ["matched", "unmatched"]: + ids = ",".join([",".join(PATIENT['normal_sample_id'].unique().tolist()), ",".join(PATIENT['tumour_sample_id'].tolist())]) + elif wildcards.pair_status == "no_normal": + ids = ",".join(PATIENT['tumour_sample_id']) + return ids + +# Run GRIDSS in paired mode +rule _gridss_run: + input: + unpack(get_input_per_patient), + fasta = str(rules._gridss_input_references.output.genome_fa), + fasta_img = str(rules._gridss_setup_references.output.genome_img), + blacklist = reference_files("genomes/{genome_build}/encode/encode-blacklist.{genome_build}.bed") + output: + vcf = temp(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss_raw.vcf.gz"), + assembly = temp(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/assembly.bam"), + assembly_dir = temp(directory(CFG["dirs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/assembly.bam.gridss.working")), + vcf_dir = temp(directory(CFG["dirs"]["gridss"] + 
"{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss_raw.vcf.gz.gridss.working")) + log: CFG["logs"]["gridss"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss.log" + params: + ids = lambda wildcards: get_input_sample_ids(wildcards), + opts = CFG["options"]["gridss"], + steps = "assemble,call", + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + conda: + CFG["conda_envs"]["gridss"] + threads: + CFG["threads"]["gridss"] + resources: + **CFG["resources"]["gridss"] + group: "enormous_bam" + shell: + op.as_one_line(""" + gridss + --reference {input.fasta} + --output {output.vcf} + --workingdir `dirname {output.vcf}` + --assembly {output.assembly} + --blacklist {input.blacklist} + --threads {threads} + --jvmheap {params.mem_mb}m + --labels "{params.ids}" + --steps {params.steps} + {params.opts} + {input.bams} + 2>&1 | tee -a {log} + """) + +# Annotate GRIDSS VCF with Repeatmasker +rule _gridss_annotate_repeatmasker: + input: + vcf = str(rules._gridss_run.output.vcf) + output: + vcf = temp(CFG["dirs"]["repeatmasker"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss_repeatmasker.vcf.gz"), + tbi = temp(CFG["dirs"]["repeatmasker"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss_repeatmasker.vcf.gz.tbi") + log: CFG["logs"]["repeatmasker"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/gridss_repeatmasker.log" + params: + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + conda: + CFG["conda_envs"]["gridss"] + threads: + CFG["threads"]["repeatmasker"] + resources: + **CFG["resources"]["repeatmasker"] + shell: + op.as_one_line(""" + gridss_annotate_vcf_repeatmasker + -o {output.vcf} + -t {threads} + -w $(dirname {output.vcf}) + {input.vcf} + > {log} 2>&1 + """) + +def get_split_ids(wildcards): + CFG = config['lcr-modules']['gridss'] + if wildcards.normal_id == "None": + return wildcards.tumour_id + else: + return wildcards.normal_id + "," + wildcards.tumour_id + +rule 
_gridss_split_vcf: + input: + vcf = str(rules._gridss_run.output.vcf) + output: + vcf = temp(CFG['dirs']['repeatmasker'] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.gridss_split.vcf.gz"), + tbi = temp(CFG['dirs']['repeatmasker'] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.gridss_split.vcf.gz.tbi") + log: CFG["logs"]["repeatmasker"] + "{seq_type}--{genome_build}/{patient_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.gridss_split_vcf.log" + params: + ids = lambda wildcards: get_split_ids(wildcards), + conda: + CFG["conda_envs"]["bcftools"] + threads: CFG['threads']['split'] + resources: + **CFG['resources']['split'] + shell: + op.as_one_line(""" + bcftools view -s {params.ids} -Oz -o {output.vcf} {input.vcf} 2> {log} && + tabix -p vcf {output.vcf} + """) + +def get_split_vcf(wildcards): + CFG = config['lcr-modules']['gridss'] + TUMOUR = op.filter_samples(CFG['runs'], tumour_sample_id = wildcards.tumour_id) + vcf = expand( + str(rules._gridss_split_vcf.output.vcf), + patient_id = TUMOUR['tumour_patient_id'], + allow_missing = True + ) + return {'vcf': vcf} + +def get_gripss_sample_id_cli(wildcards): + CFG = config['lcr-modules']['gridss'] + TUMOUR = op.filter_samples(CFG["runs"], tumour_sample_id = wildcards.tumour_id) + if wildcards.pair_status in ["matched", "unmatched"]: + return "-tumor " + str("".join(TUMOUR['tumour_sample_id'])) + " -reference " + str("".join(TUMOUR['normal_sample_id'])) + elif wildcards.pair_status == "no_normal": + return "-tumor " + str("".join(TUMOUR['tumour_sample_id'])) + +# Perform somatic filtering against the panel of normals +rule _gridss_run_gripss: + input: + unpack(get_split_vcf), + fasta = str(rules._gridss_input_references.output.genome_fa), + pon_breakend = str(rules._gridss_get_pon.output.pon_breakend), + pon_breakpoint = str(rules._gridss_get_pon.output.pon_breakpoint), + known_pairs = 
str(rules._gridss_get_pon.output.known_pairs) + output: + vcf = CFG["dirs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic.vcf.gz", + tbi = CFG["dirs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic.vcf.gz.tbi" + log: log = CFG["logs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gripss.log" + resources: + **CFG["resources"]["gripss"] + params: + cli = lambda wildcards: get_gripss_sample_id_cli(wildcards), + opts = CFG["options"]["gripss"], + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + conda: + CFG["conda_envs"]["gripss"] + threads: + CFG["threads"]["gripss"] + shell: + op.as_one_line(""" + gripss -Xms4G -Xmx{params.mem_mb}m + -ref_genome {input.fasta} + -breakend_pon {input.pon_breakend} + -breakpoint_pon {input.pon_breakpoint} + -breakpoint_hotspot {input.known_pairs} + -input_vcf {input.vcf} + -output_vcf {output.vcf} + {params.cli} + {params.opts} + 2>&1 | tee -a {log} + """) + +rule _gridss_filter_gripss: + input: + vcf = str(rules._gridss_run_gripss.output.vcf) + output: + vcf = CFG["dirs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz", + tbi = CFG["dirs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz.tbi" + conda: + CFG["conda_envs"]["bcftools"] + shell: + op.as_one_line(""" + zcat {input.vcf} | + awk '$7 == "PASS" || $1 ~ /^#/ ' | + bcftools view -Oz -o {output.vcf} && + tabix -p vcf {output.vcf} + """) + +rule _gridss_gripss_to_bedpe: + input: + vcf = str(rules._gridss_filter_gripss.output.vcf) + output: + bedpe = CFG["dirs"]["gripss"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.bedpe" + conda: + CFG["conda_envs"]["svtools"] + shell: + op.as_one_line(""" + zcat {input.vcf} | + awk '$1 ~ /^#/ || $5 ~ /:/' | + svtools vcftobedpe 
| grep -v "##" > {output.bedpe} + """) + + +# Symlink the final output files into the module results directory (under '99-outputs/') +rule _gridss_output_somatic_vcf: + input: + filtered = str(rules._gridss_filter_gripss.output.vcf), + filtered_tbi = str(rules._gridss_filter_gripss.output.tbi), + somatic = str(rules._gridss_run_gripss.output.vcf), + somatic_tbi = str(rules._gridss_run_gripss.output.tbi), + bedpe = str(rules._gridss_gripss_to_bedpe.output.bedpe) + output: + somatic = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic.vcf.gz", + somatic_tbi = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic.vcf.gz.tbi", + filtered = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic_filtered.vcf.gz", + filtered_tbi = CFG["dirs"]["outputs"] + "vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic_filtered.vcf.gz.tbi", + bedpe = CFG["dirs"]["outputs"] + "bedpe/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.gridss_somatic_filtered.bedpe" + run: + op.relative_symlink(input.somatic, output.somatic, in_module=True) + op.relative_symlink(input.somatic_tbi, output.somatic_tbi, in_module=True) + op.relative_symlink(input.filtered, output.filtered, in_module=True) + op.relative_symlink(input.filtered_tbi, output.filtered_tbi, in_module=True) + op.relative_symlink(input.bedpe, output.bedpe, in_module=True) + + + + +# Generates the target sentinels for each run, which generate the symlinks +rule _gridss_all: + input: + expand( + [ + str(rules._gridss_output_somatic_vcf.output.filtered), + str(rules._gridss_output_somatic_vcf.output.filtered_tbi), + str(rules._gridss_output_somatic_vcf.output.somatic), + str(rules._gridss_output_somatic_vcf.output.somatic_tbi), + str(rules._gridss_output_somatic_vcf.output.bedpe) + ], + zip, # Run expand() with 
zip(), not product() + seq_type=CFG["runs"]["tumour_seq_type"], + genome_build=CFG["runs"]["tumour_genome_build"], + tumour_id=CFG["runs"]["tumour_sample_id"], + normal_id=CFG["runs"]["normal_sample_id"], + pair_status=CFG["runs"]["pair_status"] + ) + + +##### CLEANUP ##### + + +# Perform some clean-up tasks, including storing the module-specific +# configuration on disk and deleting the `CFG` variable +op.cleanup_module(CFG) diff --git a/modules/gridss/2.0/schemas/base-1.0.yaml b/modules/gridss/2.0/schemas/base-1.0.yaml new file mode 120000 index 000000000..0a69d1ceb --- /dev/null +++ b/modules/gridss/2.0/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/gridss/CHANGELOG.md b/modules/gridss/CHANGELOG.md index f8be466c5..b260775b4 100644 --- a/modules/gridss/CHANGELOG.md +++ b/modules/gridss/CHANGELOG.md @@ -5,6 +5,11 @@ All notable changes to the `gridss` module will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [2.0] - 2021-12-29 +This release was authored by Laura Hilton. +- Implementing joint calling per patient for multi-timepoint samples. +- Module updates enable CRAM support. + ## [1.1] - 2020-10-09 This release was authored by Laura Hilton. See the [GRIDSS man page](https://github.com/PapenfussLab/gridss) for extensive documentation. - Add automatic reference file downloading from files hosted at the BCGSC [downloads page](https://bcgsc.ca/downloads/morinlab/hmftools-references/gridss/). 
diff --git a/modules/hmftools/1.0/hmftools.smk b/modules/hmftools/1.0/hmftools.smk index f8163d269..352ba8934 100644 --- a/modules/hmftools/1.0/hmftools.smk +++ b/modules/hmftools/1.0/hmftools.smk @@ -15,6 +15,26 @@ # Import package with useful functions for developing analysis modules import oncopipe as op +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + # Setup module and store module-specific configuration in `CFG` # `CFG` is a shortcut to `config["lcr-modules"]["hmftools"]` CFG = op.setup_module( @@ -40,23 +60,19 @@ localrules: _hmftools_all -VERSION_MAP = { +HMFTOOLS_VERSION_MAP = { "grch37": "hg19", "hs37d5": "hg19", "hg38": "hg38" } -possible_genome_builds = VERSION_MAP.keys() +possible_genome_builds = HMFTOOLS_VERSION_MAP.keys() for genome_build in CFG["runs"]["tumour_genome_build"]: assert genome_build in possible_genome_builds, ( "Samples table includes genome builds not yet compatible with this module. 
" "This module is currently only compatible with {possible_genome_builds}. " ) -wildcard_constraints: - genome_build = "|".join(VERSION_MAP.keys()), - pair_status = "matched|unmatched" - ##### RULES ##### @@ -69,9 +85,12 @@ rule _hmftools_input_bam: output: bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bai", + wildcard_constraints: + genome_build = "|".join(HMFTOOLS_VERSION_MAP.keys()), + pair_status = "matched|unmatched" run: - op.relative_symlink(input.bam, output.bam) - op.relative_symlink(input.bai, output.bai) + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) rule _hmftools_input_strelka: input: @@ -79,7 +98,7 @@ rule _hmftools_input_strelka: output: strelka_vcf = CFG["dirs"]["inputs"] + "strelka_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic.combined.vcf.gz" run: - op.relative_symlink(input.strelka_vcf, output.strelka_vcf) + op.absolute_symlink(input.strelka_vcf, output.strelka_vcf) rule _hmftools_input_gridss: input: @@ -93,10 +112,10 @@ rule _hmftools_input_gridss: gridss_filtered_vcf = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz", gridss_filtered_tbi = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz.tbi" run: - op.relative_symlink(input.gridss_somatic_vcf, output.gridss_somatic_vcf) - op.relative_symlink(input.gridss_somatic_tbi, output.gridss_somatic_tbi) - op.relative_symlink(input.gridss_filtered_vcf, output.gridss_filtered_vcf) - op.relative_symlink(input.gridss_filtered_tbi, output.gridss_filtered_tbi) + op.absolute_symlink(input.gridss_somatic_vcf, output.gridss_somatic_vcf) + op.absolute_symlink(input.gridss_somatic_tbi, output.gridss_somatic_tbi) + op.absolute_symlink(input.gridss_filtered_vcf, 
output.gridss_filtered_vcf) + op.absolute_symlink(input.gridss_filtered_tbi, output.gridss_filtered_tbi) # Rules to download and setup reference files @@ -121,7 +140,7 @@ rule _hmftools_get_cobalt_gc: gc = CFG["dirs"]["inputs"] + "references/{genome_build}/cobalt/GC_profile.1000bp.cnp" params: url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/cobalt", - alt_build = lambda w: VERSION_MAP[w.genome_build] + alt_build = lambda w: HMFTOOLS_VERSION_MAP[w.genome_build] conda: CFG["conda_envs"]["wget"] shell: @@ -133,7 +152,7 @@ rule _hmftools_get_amber_snps: snpcheck = CFG["dirs"]["inputs"] + "references/{genome_build}/amber/GermlineHetPon.snpcheck.vcf.gz" params: url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/amber", - alt_build = lambda w: VERSION_MAP[w.genome_build] + alt_build = lambda w: HMFTOOLS_VERSION_MAP[w.genome_build] conda: CFG["conda_envs"]["wget"] shell: @@ -146,7 +165,7 @@ rule _hmftools_get_purple_drivers: gene_panel = CFG["dirs"]["inputs"] + "references/{genome_build}/purple/DriverGenePanel.tsv" params: url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/purple", - alt_build = lambda w: VERSION_MAP[w.genome_build] + alt_build = lambda w: HMFTOOLS_VERSION_MAP[w.genome_build] conda: CFG["conda_envs"]["wget"] shell: @@ -493,7 +512,7 @@ rule _hmftools_linx: resources: **CFG["resources"]["linx"] params: - alt_build = lambda w: VERSION_MAP[w.genome_build], + alt_build = lambda w: HMFTOOLS_VERSION_MAP[w.genome_build], ensembl_build = lambda w: { "grch37": "HG37", "hs37d5": "HG37", @@ -628,7 +647,7 @@ rule _hmftools_purple_output: output: files = CFG["dirs"]["outputs"] + "purple_output/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.purple.{out_file}" run: - op.relative_symlink(input.files, output.files) + op.relative_symlink(input.files, output.files, in_module=True) rule _hmftools_purple_plots: input: @@ -636,7 +655,7 @@ rule _hmftools_purple_plots: output: plots = CFG["dirs"]["outputs"] + 
"purple_plots/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.{plot_name}.png" run: - op.relative_symlink(input.plots, output.plots) + op.relative_symlink(input.plots, output.plots, in_module=True) rule _hmftools_linx_plots: diff --git a/modules/hmftools/1.1/config/default.yaml b/modules/hmftools/1.1/config/default.yaml new file mode 100644 index 000000000..5df6e6685 --- /dev/null +++ b/modules/hmftools/1.1/config/default.yaml @@ -0,0 +1,101 @@ +lcr-modules: + + hmftools: + + # TODO: Update the list of available wildcards, if applicable + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + sample_bam: "__UPDATE__" + sample_bai: "__UPDATE__" + # Available wildcards: {seq_type} {genome_build} {tumour_id} + # Note: SLMS-3 outputs are recommended. + # The pipeline will take any VCF where the samples are labeled 'TUMOR' and 'NORMAL', + # and where the VCF is annotated with "AD" and "DP" fields. + # Must be in bgzip with `.vcf.gz` extension. + slms3_vcf: "__UPDATE__" + # Available wildcards: {seq_type} {genome_build} {tumour_id} {normal_id} {pair_status} + # Note: These are output by the gripss somatic filtering step of the gridss module + gridss_somatic: "__UPDATE__" # Output of GRIPSS + gridss_somatic_tbi: "__UPDATE__" + gridss_somatic_filtered: "__UPDATE__" # Filtered output of GRIPSS + gridss_somatic_filtered_tbi: "__UPDATE__" + + scratch_subdirectories: [] + + switches: + ensembl_url: + '37': "mysql://ensembldb.ensembl.org:3337/homo_sapiens_core_89_37" + '38': "mysql://ensembldb.ensembl.org:3306/homo_sapiens_core_98_38" + + options: + use_masked_ref: False + amber: + -validation_stringency SILENT + cobalt: + -validation_stringency SILENT + purple: "" + linx: "" + linx_viz: + -fusion_legend_height_per_row 70 + -segment_relative_size 0.5 + -outer_radius 0.85 + -min_line_size 4 + -max_line_size 18 + -min_label_size 45 + -max_label_size 50 + -glyph_size 25 + -exon_rank_radius 0.04 + -max_gene_characters 15 + linx_viz_annotate: + -fusion_legend_height_per_row
70 + -segment_relative_size 0.5 + -outer_radius 0.85 + -min_line_size 4 + -max_line_size 18 + -min_label_size 45 + -max_label_size 50 + -glyph_size 25 + -exon_rank_radius 0.04 + -max_gene_characters 15 + + conda_envs: + samtools: "{MODSDIR}/envs/samtools-1.9.yaml" + wget: "{MODSDIR}/envs/wget-1.20.1.yaml" + bcftools: "{MODSDIR}/envs/bcftools-1.10.2.yaml" + amber: "{MODSDIR}/envs/hmftools-amber-3.5.yaml" + cobalt: "{MODSDIR}/envs/hmftools-cobalt-1.11.yaml" + purple: "{MODSDIR}/envs/hmftools-purple-2.54.yaml" + linx: "{MODSDIR}/envs/hmftools-linx-1.15.yaml" + linx_annotate: "{MODSDIR}/envs/hmftools-linx-1.15.yaml" + snpeff: "{MODSDIR}/envs/snpeff-4.3.1t.yaml" + + threads: + vcf_sample_names: 1 + snpeff: 4 + amber: 16 + cobalt: 16 + purple: 8 + linx: 2 + linx_viz: 8 + + resources: + vcf_sample_names: + mem_mb: 1000 + snpeff: + mem_mb: 5000 + amber: + mem_mb: 36000 + cobalt: + mem_mb: 20000 + purple: + mem_mb: 20000 + linx: + mem_mb: 10000 + linx_viz: + mem_mb: 20000 + + pairing_config: + genome: + run_paired_tumours: True + run_unpaired_tumours_with: "unmatched_normal" + run_paired_tumours_as_unpaired: False diff --git a/modules/hmftools/1.1/envs/bcftools-1.10.2.yaml b/modules/hmftools/1.1/envs/bcftools-1.10.2.yaml new file mode 120000 index 000000000..72959e7bb --- /dev/null +++ b/modules/hmftools/1.1/envs/bcftools-1.10.2.yaml @@ -0,0 +1 @@ +../../../../envs/bcftools/bcftools-1.10.2.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-amber-3.4.yaml b/modules/hmftools/1.1/envs/hmftools-amber-3.4.yaml new file mode 120000 index 000000000..fac6fa12b --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-amber-3.4.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-amber-3.4.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-amber-3.5.yaml b/modules/hmftools/1.1/envs/hmftools-amber-3.5.yaml new file mode 120000 index 000000000..71dfe9fb6 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-amber-3.5.yaml @@ 
-0,0 +1 @@ +../../../../envs/hmftools/hmftools-amber-3.5.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-cobalt-1.11.yaml b/modules/hmftools/1.1/envs/hmftools-cobalt-1.11.yaml new file mode 120000 index 000000000..d671910e9 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-cobalt-1.11.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-cobalt-1.11.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-cobalt-1.8.yaml b/modules/hmftools/1.1/envs/hmftools-cobalt-1.8.yaml new file mode 120000 index 000000000..eb143618e --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-cobalt-1.8.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-cobalt-1.8.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-cobalt-1.9.yaml b/modules/hmftools/1.1/envs/hmftools-cobalt-1.9.yaml new file mode 120000 index 000000000..8c4af7acd --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-cobalt-1.9.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-cobalt-1.9.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-linx-1.10.yaml b/modules/hmftools/1.1/envs/hmftools-linx-1.10.yaml new file mode 120000 index 000000000..6383f8839 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-linx-1.10.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-linx-1.10.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-linx-1.11.yaml b/modules/hmftools/1.1/envs/hmftools-linx-1.11.yaml new file mode 120000 index 000000000..09c92d668 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-linx-1.11.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-linx-1.11.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-linx-1.15.yaml b/modules/hmftools/1.1/envs/hmftools-linx-1.15.yaml new file mode 120000 index 000000000..f789beede --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-linx-1.15.yaml @@ -0,0 +1 @@ 
+../../../../envs/hmftools/hmftools-linx-1.15.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-purple-2.44.yaml b/modules/hmftools/1.1/envs/hmftools-purple-2.44.yaml new file mode 120000 index 000000000..43d663aa7 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-purple-2.44.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-purple-2.44.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-purple-2.45.yaml b/modules/hmftools/1.1/envs/hmftools-purple-2.45.yaml new file mode 120000 index 000000000..dd0a26c59 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-purple-2.45.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-purple-2.45.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-purple-2.48.yaml b/modules/hmftools/1.1/envs/hmftools-purple-2.48.yaml new file mode 120000 index 000000000..bb5438531 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-purple-2.48.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-purple-2.48.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/hmftools-purple-2.54.yaml b/modules/hmftools/1.1/envs/hmftools-purple-2.54.yaml new file mode 120000 index 000000000..66e18c157 --- /dev/null +++ b/modules/hmftools/1.1/envs/hmftools-purple-2.54.yaml @@ -0,0 +1 @@ +../../../../envs/hmftools/hmftools-purple-2.54.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/samtools-1.9.yaml b/modules/hmftools/1.1/envs/samtools-1.9.yaml new file mode 120000 index 000000000..ab29288bb --- /dev/null +++ b/modules/hmftools/1.1/envs/samtools-1.9.yaml @@ -0,0 +1 @@ +../../../../envs/samtools/samtools-1.9.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/envs/snpeff-4.3.1t.yaml b/modules/hmftools/1.1/envs/snpeff-4.3.1t.yaml new file mode 120000 index 000000000..c452e525b --- /dev/null +++ b/modules/hmftools/1.1/envs/snpeff-4.3.1t.yaml @@ -0,0 +1 @@ +../../../../envs/snpeff/snpeff-4.3.1t.yaml \ No 
newline at end of file diff --git a/modules/hmftools/1.1/envs/wget-1.20.1.yaml b/modules/hmftools/1.1/envs/wget-1.20.1.yaml new file mode 120000 index 000000000..86501e72a --- /dev/null +++ b/modules/hmftools/1.1/envs/wget-1.20.1.yaml @@ -0,0 +1 @@ +../../../../envs/wget/wget-1.20.1.yaml \ No newline at end of file diff --git a/modules/hmftools/1.1/hmftools.smk b/modules/hmftools/1.1/hmftools.smk new file mode 100644 index 000000000..db45ff9e3 --- /dev/null +++ b/modules/hmftools/1.1/hmftools.smk @@ -0,0 +1,629 @@ +#!/usr/bin/env snakemake + + +##### ATTRIBUTION ##### + + +# Original Author: Laura Hilton +# Module Author: Laura Hilton +# Contributors: N/A + + +##### SETUP ##### + + +# Import package with useful functions for developing analysis modules +import oncopipe as op + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. 
Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + +# Setup module and store module-specific configuration in `CFG` +# `CFG` is a shortcut to `config["lcr-modules"]["hmftools"]` +CFG = op.setup_module( + name = "hmftools", + version = "1.1", + subdirectories = ["inputs", "prepare_slms3", "amber", "cobalt", "purple", "linx", "outputs"], +) + +# Define rules to be run locally when using a compute cluster +localrules: + _hmftools_input_bam, + _hmftools_input_slms3, + _hmftools_slms3_sample_names, + _hmftools_input_gridss, + _hmftools_input_references, + _hmftools_get_cobalt_gc, + _hmftools_get_cobalt_bed, + _hmftools_get_amber_snps, + _hmftools_get_purple_drivers, + _hmftools_get_linx_db, + _hmftools_get_ensembl_cache, + _hmftools_purple_output, + _hmftools_purple_plots, + _hmftools_all + + +VERSION_MAP_HMFTOOLS = { + "grch37": "37", + "hs37d5": "37", + "hg38": "38" +} + +possible_genome_builds = VERSION_MAP_HMFTOOLS.keys() +for genome_build in CFG["runs"]["tumour_genome_build"]: + assert genome_build in possible_genome_builds, ( + "Samples table includes genome builds not yet compatible with this module. " + f"This module is currently only compatible with {list(possible_genome_builds)}. 
" + ) + + +masked_string = "" +if CFG["options"]["use_masked_ref"]: + masked_string = "_masked" + + +##### RULES ##### + + +# Symlinks the input files into the module results directory (under '00-inputs/') +rule _hmftools_input_bam: + input: + bam = CFG["inputs"]["sample_bam"], + bai = CFG["inputs"]["sample_bai"], + output: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bai", + group: "input_and_vcf" + wildcard_constraints: + genome_build = "|".join(VERSION_MAP_HMFTOOLS.keys()), + pair_status = "matched|unmatched" + run: + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) + +rule _hmftools_input_slms3: + input: + vcf = CFG["inputs"]["slms3_vcf"], + output: + vcf = CFG["dirs"]["inputs"] + "slms3_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/slms3.vcf.gz" + group: "input_and_vcf" + run: + op.relative_symlink(input.vcf, output.vcf) + +rule _hmftools_input_gridss: + input: + gridss_somatic_vcf = CFG["inputs"]["gridss_somatic"], + gridss_somatic_tbi = CFG["inputs"]["gridss_somatic_tbi"], + gridss_filtered_vcf = CFG["inputs"]["gridss_somatic_filtered"], + gridss_filtered_tbi = CFG["inputs"]["gridss_somatic_filtered_tbi"] + output: + gridss_somatic_vcf = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic.vcf.gz", + gridss_somatic_tbi = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic.vcf.gz.tbi", + gridss_filtered_vcf = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz", + gridss_filtered_tbi = CFG["dirs"]["inputs"] + "gridss_vcf/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/gridss_somatic_filtered.vcf.gz.tbi" + group: "input_and_vcf" + run: + 
op.absolute_symlink(input.gridss_somatic_vcf, output.gridss_somatic_vcf) + op.absolute_symlink(input.gridss_somatic_tbi, output.gridss_somatic_tbi) + op.absolute_symlink(input.gridss_filtered_vcf, output.gridss_filtered_vcf) + op.absolute_symlink(input.gridss_filtered_tbi, output.gridss_filtered_tbi) + +# Rules to download and setup reference files + +rule _hmftools_input_references: + input: + genome_fa = reference_files("genomes/{genome_build}" + masked_string + "/genome_fasta/genome.fa"), + genome_fai = reference_files("genomes/{genome_build}" + masked_string + "/genome_fasta/genome.fa.fai"), + genome_dict = reference_files("genomes/{genome_build}" + masked_string + "/genome_fasta/genome.dict") + output: + genome_fa = CFG["dirs"]["inputs"] + "references/{genome_build}" + masked_string + "/genome_fa/genome.fa", + genome_fai = CFG["dirs"]["inputs"] + "references/{genome_build}" + masked_string + "/genome_fa/genome.fa.fai", + genome_dict = CFG["dirs"]["inputs"] + "references/{genome_build}" + masked_string + "/genome_fa/genome.dict" + shell: + op.as_one_line(""" + ln -s {input.genome_fa} {output.genome_fa} && + ln -s {input.genome_fai} {output.genome_fai} && + ln -s {input.genome_dict} {output.genome_dict} + """) + +rule _hmftools_get_cobalt_gc: + output: + gc = CFG["dirs"]["inputs"] + "references/{genome_build}/cobalt/GC_profile.1000bp.cnp" + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/cobalt", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -O {output.gc} {params.url}/GC_profile.1000bp.{params.alt_build}.cnp' + +rule _hmftools_get_cobalt_bed: + output: + bed = CFG["dirs"]["inputs"] + "references/{genome_build}/cobalt/DiploidRegions.bed" + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/cobalt", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -O {output.bed} 
{params.url}/DiploidRegions.{params.alt_build}.bed' + +rule _hmftools_get_amber_snps: + output: + vcf = CFG["dirs"]["inputs"] + "references/{genome_build}/amber/GermlineHetPon.vcf.gz", + snpcheck = CFG["dirs"]["inputs"] + "references/{genome_build}/amber/Amber.snpcheck.vcf.gz" + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/amber", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -O {output.vcf} {params.url}/GermlineHetPon.{params.alt_build}.vcf.gz; ' + 'wget -O {output.snpcheck} {params.url}/Amber.snpcheck.{params.alt_build}.vcf' + +rule _hmftools_get_purple_drivers: + output: + hotspots = CFG["dirs"]["inputs"] + "references/{genome_build}/purple/KnownHotspots.vcf.gz", + gene_panel = CFG["dirs"]["inputs"] + "references/{genome_build}/purple/DriverGenePanel.tsv" + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/purple", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -O {output.hotspots} {params.url}/KnownHotspots.somatic.{params.alt_build}.vcf.gz && ' + 'wget -O {output.hotspots}.tbi {params.url}/KnownHotspots.somatic.{params.alt_build}.vcf.gz.tbi && ' + 'wget -O {output.gene_panel} {params.url}/DriverGenePanel.{params.alt_build}.tsv' + +rule _hmftools_get_linx_db: + output: + directory(CFG["dirs"]["inputs"] + "references/{genome_build}/linx_db") + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/linx/Linx", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -r -np -nd -P {output} -A .bed,.csv {params.url}/{params.alt_build} && ' + 'wget -O {output}/viral_host_ref.csv {params.url}/viral_host_ref.csv' + +rule _hmftools_get_ensembl_cache: + output: + cache = directory(CFG["dirs"]["inputs"] + "references/{genome_build}/ensembl_cache/"), + complete = touch(CFG["dirs"]["inputs"] + 
"references/{genome_build}/ensembl_cache/cache.complete") + params: + url = "www.bcgsc.ca/downloads/morinlab/hmftools-references/ensembl_data_cache", + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["wget"] + shell: + 'wget -O {output.cache}/{params.alt_build}.zip {params.url}/{params.alt_build}.zip && ' + 'unzip -d {output.cache} {output.cache}/{params.alt_build}.zip' + +# Prepare SLMS-3 VCF files for use with PURPLE +# SnpEff annotation enables driver discovery logic + +rule _hmftools_slms3_sample_names: + input: + vcf = rules._hmftools_input_slms3.output.vcf + output: + vcf = temp(CFG["dirs"]["prepare_slms3"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/tmp.slms3.vcf") + log: CFG["dirs"]["prepare_slms3"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/vcf_sample_names.log" + conda: + CFG["conda_envs"]["bcftools"] + threads: CFG["threads"]["vcf_sample_names"] + resources: + **CFG["resources"]["vcf_sample_names"] + group: "input_and_vcf" + shell: + op.as_one_line(""" + bcftools view -Ov {input.vcf} | + sed 's/TUMOR/{wildcards.tumour_id}/g' | + sed 's/NORMAL/{wildcards.normal_id}/g' + > {output.vcf} + """) + +rule _hmftools_snpeff_vcf: + input: + vcf = str(rules._hmftools_slms3_sample_names.output.vcf) + output: + sample_key = temp(CFG["dirs"]["prepare_slms3"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/sample_key.txt"), + vcf = temp(CFG["dirs"]["prepare_slms3"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/slms3.snpeff.vcf.gz") + resources: + **CFG["resources"]["snpeff"] + params: + snpeff_build = lambda w: { + "grch37": "GRCh37.75", + "hs37d5": "GRCh37.75", + "hg38": "hg38" + }[w.genome_build], + config = "$(readlink -e $(which snpEff)).config", + mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8) + log: + CFG["logs"]["prepare_slms3"] + 
"{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/snpeff_slms3.log" + conda: + CFG["conda_envs"]["snpeff"] + threads: + CFG["threads"]["snpeff"] + shell: + op.as_one_line(""" + printf "{wildcards.normal_id}\t{wildcards.tumour_id}\n" > {output.sample_key} && + snpEff -Xmx{params.mem_mb}m + -c {params.config} -noStats + -cancer -cancerSamples {output.sample_key} + {params.snpeff_build} {input.vcf} | + bcftools view -Oz -o {output.vcf} - && + bcftools index -t {output.vcf} + """) + + +# Run AMBER to calculate BAFs +rule _hmftools_amber_matched: + input: + tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", + normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", + snps = str(rules._hmftools_get_amber_snps.output.vcf), + fasta = str(rules._hmftools_input_references.output.genome_fa) + output: + vcf = CFG["dirs"]["amber"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.amber.baf.vcf.gz" + resources: + **CFG["resources"]["amber"] + params: + options = CFG["options"]["amber"], + jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8) + log: CFG["logs"]["amber"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/amber.log" + wildcard_constraints: + pair_status = "matched" + conda: + CFG["conda_envs"]["amber"] + threads: + CFG["threads"]["amber"] + shell: + op.as_one_line(""" + AMBER -Xmx{params.jvmheap}m + -reference {wildcards.normal_id} -reference_bam {input.normal_bam} + -tumor {wildcards.tumour_id} -tumor_bam {input.tumour_bam} + -output_dir `dirname {output.vcf}` + -threads {threads} + -loci {input.snps} + -ref_genome {input.fasta} + {params.options} + 2>&1 | tee -a {log} + """) + +rule _hmftools_amber_unmatched: + input: + tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", + normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", + snps = 
str(rules._hmftools_get_amber_snps.output.vcf), + fasta = str(rules._hmftools_input_references.output.genome_fa) + output: + vcf = CFG["dirs"]["amber"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.amber.baf.vcf.gz" + resources: + **CFG["resources"]["amber"] + params: + options = CFG["options"]["amber"], + jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8) + log: CFG["logs"]["amber"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/amber.log" + wildcard_constraints: + pair_status = "unmatched" + conda: + CFG["conda_envs"]["amber"] + threads: + CFG["threads"]["amber"] + shell: + op.as_one_line(""" + AMBER -Xmx{params.jvmheap}m + -tumor_only + -tumor {wildcards.tumour_id} -tumor_bam {input.tumour_bam} + -output_dir `dirname {output.vcf}` + -threads {threads} + -loci {input.snps} + -ref_genome {input.fasta} + {params.options} + 2>&1 | tee -a {log} + """) + +# Run COBALT to estimate depth across the genome +rule _hmftools_cobalt: + input: + tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", + normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", + gc_profile = str(rules._hmftools_get_cobalt_gc.output.gc), + fasta = str(rules._hmftools_input_references.output.genome_fa) + output: + tumour_ratio = CFG["dirs"]["cobalt"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.cobalt.ratio.pcf", + normal_ratio = CFG["dirs"]["cobalt"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{normal_id}.cobalt.ratio.pcf", + tumour_tsv = temp(CFG["dirs"]["cobalt"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.cobalt.ratio.tsv"), + log: ratio = CFG["logs"]["cobalt"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/cobalt.log" + resources: + **CFG["resources"]["cobalt"] + params: + options = CFG["options"]["cobalt"], + jvmheap = lambda 
wildcards, resources: int(resources.mem_mb * 0.8) + wildcard_constraints: + pair_status = "matched|unmatched" + conda: + CFG["conda_envs"]["cobalt"] + threads: + CFG["threads"]["cobalt"] + shell: + op.as_one_line(""" + COBALT -Xmx{params.jvmheap}m + -reference {wildcards.normal_id} -reference_bam {input.normal_bam} + -tumor {wildcards.tumour_id} -tumor_bam {input.tumour_bam} + -ref_genome {input.fasta} + -output_dir `dirname {output.tumour_ratio}` + -threads {threads} + -gc_profile {input.gc_profile} + {params.options} + 2>&1 | tee -a {log} + """) + + +# Run PURPLE for final CNV calling + +# Define variables for output file names +purple_out = [ + "purity.tsv", + "purity.range.tsv", + "cnv.gene.tsv", + "sv.vcf.gz", +] +purple_plots = [ + "circos", + "input", + "map", + "purity.range", + "segment" + ] + +rule _hmftools_purple_matched: + input: + amber = CFG["dirs"]["amber"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.amber.baf.vcf.gz", + cobalt_tumour = str(rules._hmftools_cobalt.output.tumour_ratio), + cobalt_normal = str(rules._hmftools_cobalt.output.normal_ratio), + cobalt_tumour_tsv = str(rules._hmftools_cobalt.output.tumour_tsv), + slms3_vcf = str(rules._hmftools_snpeff_vcf.output.vcf), + gridss_somatic_vcf = str(rules._hmftools_input_gridss.output.gridss_somatic_vcf), + gridss_filtered_vcf = str(rules._hmftools_input_gridss.output.gridss_filtered_vcf), + reference_fa = str(rules._hmftools_input_references.output.genome_fa), + gene_panel = str(rules._hmftools_get_purple_drivers.output.gene_panel), + hotspots = str(rules._hmftools_get_purple_drivers.output.hotspots), + gc_profile = str(rules._hmftools_get_cobalt_gc.output.gc) + output: + files = expand(CFG["dirs"]["purple"] + "{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}/{{tumour_id}}.purple.{out_file}", + out_file = purple_out), + plots = expand(CFG["dirs"]["purple"] + 
"{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}/plot/{{tumour_id}}.{plot_name}.png", + plot_name = purple_plots) + log: CFG["logs"]["purple"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/purple.log" + resources: + **CFG["resources"]["purple"] + params: + outdir = CFG["dirs"]["purple"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}", + options = CFG["options"]["purple"], + circos = "`which circos`", + jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.9) + wildcard_constraints: + pair_status = "matched|unmatched", + out_file = "|".join(purple_out), + plot_name = "|".join(purple_plots) + conda: + CFG["conda_envs"]["purple"] + threads: + CFG["threads"]["purple"] + shell: + op.as_one_line(""" + PURPLE -Xmx{params.jvmheap}m -driver_catalog + -reference {wildcards.normal_id} + -tumor {wildcards.tumour_id} + -output_dir {params.outdir} + -amber `dirname {input.amber}` + -cobalt `dirname {input.cobalt_tumour}` + -gc_profile {input.gc_profile} + -ref_genome {input.reference_fa} + -somatic_hotspots {input.hotspots} + -driver_gene_panel {input.gene_panel} + -somatic_vcf {input.slms3_vcf} + -structural_vcf {input.gridss_filtered_vcf} + -sv_recovery_vcf {input.gridss_somatic_vcf} + -circos {params.circos} + {params.options} + -threads {threads} + 2>&1 | tee -a {log} + """) + + + + +# Run LINX to cluster and visualize CNV and SV data +rule _hmftools_linx: + input: + purple_vcf = CFG["dirs"]["purple"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.purple.sv.vcf.gz", + ensembl_cache = str(rules._hmftools_get_ensembl_cache.output.cache), + linx_db = str(rules._hmftools_get_linx_db.output) + output: + clusters = CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.linx.vis_sv_data.tsv", + svs = CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.linx.svs.tsv" + 
log: CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/linx.log" + resources: + **CFG["resources"]["linx"] + params: + ref_genome_version = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build], + jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8), + options = CFG["options"]["linx"], + cache_subdir = lambda w: config["lcr-modules"]["hmftools"]["dirs"]["inputs"] + "references/" + w.genome_build + "/ensembl_cache/" + VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["linx"] + threads: + CFG["threads"]["linx"] + shell: + op.as_one_line(""" + linx -Xmx{params.jvmheap}m + -sample {wildcards.tumour_id} + -ref_genome_version {params.ref_genome_version} + -sv_vcf {input.purple_vcf} + -purple_dir `dirname {input.purple_vcf}` + -output_dir `dirname {output.clusters}` + -gene_transcripts_dir {params.cache_subdir} + -fragile_site_file {input.linx_db}/fragile_sites_hmf.{params.ref_genome_version}.csv + -line_element_file {input.linx_db}/line_elements.{params.ref_genome_version}.csv + -viral_hosts_file {input.linx_db}/viral_host_ref.csv + -known_fusion_file {input.linx_db}/known_fusion_data.{params.ref_genome_version}.csv + -check_fusions + -check_drivers + -write_vis_data + {params.options} + 2>&1 | tee -a {log} + """) + +rule _hmftools_linx_viz: + input: + clusters = rules._hmftools_linx.output.clusters, + svs = rules._hmftools_linx.output.svs, + ensembl_cache = str(rules._hmftools_get_ensembl_cache.output.cache) + output: + plots = directory(CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/plot"), + data = directory(CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/data") + log: CFG["logs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/linx_viz.log" + resources: + **CFG["resources"]["linx_viz"] + params: + linx_jar = "$(ls $(dirname $(readlink -e $(which linx)))/*.jar)", + circos = "$(which circos)", 
+ jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8), + options = CFG["options"]["linx_viz"], + cache_subdir = lambda w: config["lcr-modules"]["hmftools"]["dirs"]["inputs"] + "references/" + w.genome_build + "/ensembl_cache/" + VERSION_MAP_HMFTOOLS[w.genome_build], + alt_build = lambda w: VERSION_MAP_HMFTOOLS[w.genome_build] + conda: + CFG["conda_envs"]["linx"] + threads: + CFG["threads"]["linx_viz"] + + shell: + op.as_one_line(""" + to_plot=$(dirname {input.svs})/to_plot.tsv; + tail -n +2 {input.svs} | awk '{{FS=OFS="\\t"}} $4 != "" {{print $3}}' | sort | uniq > $to_plot; + if [[ $(cat $to_plot | wc -l) -lt 50 ]]; then + cat $to_plot | while read cluster; do + java -Xmx{params.jvmheap}m -cp {params.linx_jar} com.hartwig.hmftools.linx.visualiser.SvVisualiser + -sample {wildcards.tumour_id} + -ref_genome_version V{params.alt_build} + -gene_transcripts_dir {params.cache_subdir} + -plot_out {output.plots} + -data_out {output.data} + -vis_file_dir $(dirname {input.clusters}) + -circos {params.circos} + -threads {threads} + -clusterId $cluster + -plot_cluster_genes + 2>&1 | tee -a {log}; + done; + else + echo "Too many clusters to plot for {wildcards.tumour_id}--{wildcards.normal_id}--{wildcards.pair_status}. See chromosome outputs and consider manually selecting clusters to plot. 
" 2>&1 | tee -a {log}; + fi; + for chrom in $(tail -n +2 {input.clusters} | cut -f8 | sort | uniq); do + java -Xmx{params.jvmheap}m -cp {params.linx_jar} com.hartwig.hmftools.linx.visualiser.SvVisualiser + -sample {wildcards.tumour_id} + -ref_genome_version V{params.alt_build} + -gene_transcripts_dir {params.cache_subdir} + -plot_out {output.plots} + -data_out {output.data} + -vis_file_dir $(dirname {input.clusters}) + -circos {params.circos} + -threads {threads} + -chromosome ${{chrom}} + 2>&1 | tee -a {log}; + done + """) + + + + +# Symlinks the final output files into the module results directory (under '99-outputs/') + +rule _hmftools_purple_output: + input: + files = CFG["dirs"]["purple"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.purple.{out_file}" + output: + files = CFG["dirs"]["outputs"] + "purple_output/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.purple.{out_file}" + wildcard_constraints: + out_file = "|".join(purple_out) + run: + op.relative_symlink(input.files, output.files, in_module=True) + +rule _hmftools_purple_plots: + input: + plots = CFG["dirs"]["purple"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/plot/{tumour_id}.{plot_name}.png" + output: + plots = CFG["dirs"]["outputs"] + "purple_plots/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.{plot_name}.png" + wildcard_constraints: + plot_name = "|".join(purple_plots) + run: + op.relative_symlink(input.plots, output.plots, in_module=True) + + +rule _hmftools_linx_plots: + input: + plots = CFG["dirs"]["linx"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/plot", + output: + plots = CFG["dirs"]["outputs"] + "linx_plots/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.symlinked" + shell: + op.as_one_line(""" + workdir=$PWD && + cd `dirname {output.plots}` && + find $workdir/{input.plots} -type f -name "*.png" -exec cp -s {{}} . 
\; && + touch $workdir/{output.plots} && + cd $workdir + """) + +rule _hmftools_dispatch: + input: + files = expand(CFG["dirs"]["outputs"] + "purple_output/{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}.purple.{out_file}", + out_file = purple_out), + plots = expand(CFG["dirs"]["outputs"] + "purple_plots/{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}.{plot_name}.png", + plot_name = purple_plots), + linx = rules._hmftools_linx_plots.output.plots + output: + dispatched = touch(CFG["dirs"]["outputs"] + "dispatched/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}.dispatched") + + +# Generates the target sentinels for each run, which generate the symlinks +rule _hmftools_all: + input: + expand( + [ + str(rules._hmftools_dispatch.output.dispatched), + ], + zip, # Run expand() with zip(), not product() + seq_type=CFG["runs"]["tumour_seq_type"], + genome_build=CFG["runs"]["tumour_genome_build"], + tumour_id=CFG["runs"]["tumour_sample_id"], + normal_id=CFG["runs"]["normal_sample_id"], + pair_status=CFG["runs"]["pair_status"]) + + +##### CLEANUP ##### + + +# Perform some clean-up tasks, including storing the module-specific +# configuration on disk and deleting the `CFG` variable +op.cleanup_module(CFG) diff --git a/modules/hmftools/1.1/schemas/base-1.0.yaml b/modules/hmftools/1.1/schemas/base-1.0.yaml new file mode 120000 index 000000000..0a69d1ceb --- /dev/null +++ b/modules/hmftools/1.1/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/hmftools/CHANGELOG.md b/modules/hmftools/CHANGELOG.md index 7239dbcac..1c038d336 100644 --- a/modules/hmftools/CHANGELOG.md +++ b/modules/hmftools/CHANGELOG.md @@ -5,6 +5,10 @@ All notable changes to the `hmftools` module will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.1] - 2021-12-29 + +- Updates to the GRIDSS-PURPLE-LINX pipeline, incorporating new and better ways of handling unmatched tumours. + ## [1.0] - 2020-07-29 This release was authored by Laura Hilton. diff --git a/modules/ichorcna/1.0/config/default.yaml b/modules/ichorcna/1.0/config/default.yaml new file mode 100644 index 000000000..37a33898f --- /dev/null +++ b/modules/ichorcna/1.0/config/default.yaml @@ -0,0 +1,107 @@ +lcr-modules: + + ichorcna: + + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + sample_bam: "__UPDATE__" + sample_bai: "__UPDATE__" + + + scratch_subdirectories: [] + + options: + readcounter: + qual: 20 # only includes reads with mapping quality greater than 20 + binSize: 1000000 # set window size to compute coverage + # available binSizes are: 1000000, 500000, 50000, 10000 + run: + ichorCNA_libdir: "" + ichorCNA_rscript: "{MODSDIR}/src/runIchorCNA.R" + # use panel matching same bin size (optional) + ichorCNA_normalPanel: + "1000000": "inst/extdata/HD_ULP_PoN_{genome_build}_1Mb_median_normAutosome_median.rds" + "500000": "inst/extdata/HD_ULP_PoN_{genome_build}_500kb_median_normAutosome_median.rds" + # must use gc wig file corresponding to same binSize (required) + ichorCNA_gcWig: + "1000000": "inst/extdata/gc_{genome_build}_1000kb.wig" + "500000": "inst/extdata/gc_{genome_build}_500kb.wig" + "50000": "inst/extdata/gc_{genome_build}_50kb.wig" + "10000": "inst/extdata/gc_{genome_build}_10kb.wig" + # must use map wig file corresponding to same binSize (required) + ichorCNA_mapWig: + "1000000": "inst/extdata/map_{genome_build}_1000kb.wig" + "500000": "inst/extdata/map_{genome_build}_500kb.wig" + "50000": "inst/extdata/map_{genome_build}_50kb.wig" + "10000": "inst/extdata/map_{genome_build}_10kb.wig" + # use bed file if sample has targeted regions, eg. 
exome data (optional) + ichorCNA_exons: NULL + ichorCNA_centromere: + grch37: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + hg19: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + hs37d5: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + grch38: "inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt" + hg38: "inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt" + ichorCNA_minMapScore: 0.75 + ichorCNA_fracReadsInChrYForMale: 0.002 # Threshold for fraction of reads in chrY to assign as male + ichorCNA_genomeStyle: # can set this to UCSC or NCBI + grch37: "NCBI" + hg19: "NCBI" + hs37d5: "NCBI" + grch38: "UCSC" + hg38: "UCSC" + # chrs used for training ichorCNA parameters, e.g. tumor fraction. + ichorCNA_chrTrain: + grch37: "c(1:22)" + hg19: "c(1:22)" + hs37d5: "c(1:22)" + grch38: "paste0('chr', c(1:22))" + hg38: "paste0('chr', c(1:22))" + # non-tumor fraction parameter restart values; higher values should be included for cfDNA + ichorCNA_normal: "c(0.5,0.6,0.7,0.8,0.9,0.95)" + # ploidy parameter restart values + ichorCNA_ploidy: "c(2,3,4)" + ichorCNA_estimateNormal: TRUE + ichorCNA_estimatePloidy: TRUE + ichorCNA_estimateClonality: TRUE + # states to use for subclonal CN + ichorCNA_scStates: "c(1,3)" + # set maximum copy number to use + ichorCNA_maxCN: 5 + # TRUE/FALSE to include homozygous deletion state # FALSE for low coverage libraries (ex. 0.1x) ; can turn on for higher coverage data (ex. >10x) + ichorCNA_includeHOMD: FALSE + # Exclude solutions if total length of subclonal CNAs > this fraction of the genome + ichorCNA_maxFracGenomeSubclone: 0.5 + # Exclude solutions if total length of subclonal CNAs > this fraction of total CNA length + ichorCNA_maxFracCNASubclone: 0.7 + # control segmentation - higher (e.g. 0.9999999) leads to higher specificity and fewer segments + # lower (e.g. 0.99) leads to higher sensitivity and more segments + ichorCNA_txnE: 0.9399999 + # control segmentation - higher (e.g. 
10000000) leads to higher specificity and fewer segments + # lower (e.g. 100) leads to higher sensitivity and more segments + ichorCNA_txnStrength: 10000 + ichorCNA_plotFileType: "pdf" + ichorCNA_plotYlim: "c(-2,2)" + + + conda_envs: + ichorcna: "{MODSDIR}/envs/ichorcna.env.yaml" + hmmcopy_utils: "{MODSDIR}/envs/hmmcopy_utils.env.yaml" + + threads: + readcounter: 4 + run: 4 + + resources: + readcounter: + mem_mb: 2000 + bam: 1 + run: + mem_mb: 2000 + bam: 1 + + pairing_config: + genome: + run_paired_tumours: False + run_unpaired_tumours_with: "no_normal" + run_paired_tumours_as_unpaired: True diff --git a/modules/ichorcna/1.0/envs/ichorcna.env.yaml b/modules/ichorcna/1.0/envs/ichorcna.env.yaml new file mode 100644 index 000000000..208fd501c --- /dev/null +++ b/modules/ichorcna/1.0/envs/ichorcna.env.yaml @@ -0,0 +1,108 @@ +name: null +channels: + - conda-forge + - dranew + - bioconda + - defaults + - r +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bioconductor-biocgenerics=0.36.0 + - bioconductor-genomeinfodb=1.26.0 + - bioconductor-genomeinfodbdata=1.2.4 + - bioconductor-genomicranges=1.42.0 + - bioconductor-hmmcopy=1.32.0 + - bioconductor-iranges=2.24.0 + - bioconductor-s4vectors=0.28.0 + - bioconductor-xvector=0.30.0 + - bioconductor-zlibbioc=1.36.0 + - bwidget=1.9.14 + - bzip2=1.0.8 + - ca-certificates=2020.12.5 + - cairo=1.16.0 + - curl=7.71.1 + - fontconfig=2.13.1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - graphite2=1.3.13 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.0 + - hmmcopy_utils=0.0.1 + - icu=68.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.17.2 + - ld_impl_linux-64=2.35.1 + - libblas=3.8.0 + - libcblas=3.8.0 + - libcurl=7.71.1 + - libedit=3.1.20191231 + - libffi=3.3 + - 
libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.66.7 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.8.0 + - libopenblas=0.3.10 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.2.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.13 + - libxml2=2.9.10 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1j + - pango=1.42.4 + - pcre=8.44 + - pcre2=10.36 + - pixman=0.40.0 + - pthread-stubs=0.4 + - r-base=4.0.3 + - r-bitops=1.0_6 + - r-data.table=1.14.0 + - r-getopt=1.20.3 + - r-ichorcna=0.2.0 + - r-optparse=1.6.6 + - r-plyr=1.8.6 + - r-rcpp=1.0.6 + - r-rcurl=1.98_1.2 + - readline=8.0 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.0 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.4.9 +prefix: /projects/rmorin/projects/tumour_contam/envs/ichorcna diff --git a/modules/ichorcna/1.0/ichorcna.smk b/modules/ichorcna/1.0/ichorcna.smk new file mode 100644 index 000000000..85750346b --- /dev/null +++ b/modules/ichorcna/1.0/ichorcna.smk @@ -0,0 +1,334 @@ +#!/usr/bin/env snakemake + + +# ---------------------------------------------------------------------------- # +##### ATTRIBUTION ##### +# ---------------------------------------------------------------------------- # + +# Original snakemake author: Jasper Wong +# Module author: Jasper Wong +# Additional contributors: N/A + + +# ---------------------------------------------------------------------------- # +##### SETUP ##### +# ---------------------------------------------------------------------------- # + +### Modules ### + +import pandas as pd +import numpy as np +import oncopipe as op +import glob 
+import os + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + +### Directories ### +# Setup module and store module-specific configuration in `CFG`. 
+CFG = op.setup_module( + name = "ichorcna", + version = "1.0", + subdirectories = ["inputs", "readDepth", "seg", "outputs"] +) + +localrules: + _ichorcna_input_bam, + _ichorcna_output, + _ichorcna_all + +# ---------------------------------------------------------------------------- # +##### RULES ##### +# ---------------------------------------------------------------------------- # + +### Set-up dependencies and packages ### +# Clone the ichorCNA GitHub repository to get all of its external files (needed since their extdata is not complete for all genome builds); cloned over HTTPS because GitHub no longer serves the unauthenticated git:// protocol +rule _install_ichorcna: + output: + complete = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success" + params: + outdir = CFG["dirs"]["inputs"] + "ichorCNA/" + conda: + CFG["conda_envs"]["ichorcna"] + shell: + op.as_one_line(""" + git clone https://github.com/broadinstitute/ichorCNA.git {params.outdir} && + touch {output.complete}""") + +# This defines the script/extdata directory used by ichorCNA in the subsequent rules: +ichorDir = CFG["dirs"]["inputs"] + "ichorCNA/inst/extdata/" + +# Symlinks the extdata appropriately +rule _setup_ichorcna_extdata: + input: + complete = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success" + params: + hg19_1Mb_rds = ichorDir + "HD_ULP_PoN_1Mb_median_normAutosome_mapScoreFiltered_median.rds", + hg19_500kb_rds = ichorDir + "HD_ULP_PoN_500kb_median_normAutosome_mapScoreFiltered_median.rds", + hg38_1Mb_rds = ichorDir + "HD_ULP_PoN_hg38_1Mb_median_normAutosome_median.rds", + hg38_500kb_rds = ichorDir + "HD_ULP_PoN_hg38_500kb_median_normAutosome_median.rds", + hg19_1000kb_gc = ichorDir + "gc_hg19_1000kb.wig", + hg19_500kb_gc = ichorDir + "gc_hg19_500kb.wig", + hg19_50kb_gc = ichorDir + "gc_hg19_50kb.wig", + hg19_10kb_gc = ichorDir + "gc_hg19_10kb.wig", + hg38_1000kb_gc = ichorDir + "gc_hg38_1000kb.wig", + hg38_500kb_gc = ichorDir + "gc_hg38_500kb.wig", + hg38_50kb_gc = ichorDir + "gc_hg38_50kb.wig", + hg38_10kb_gc = ichorDir + "gc_hg38_10kb.wig", + hg19_1000kb_map =
ichorDir + "map_hg19_1000kb.wig", + hg19_500kb_map = ichorDir + "map_hg19_500kb.wig", + hg19_50kb_map = ichorDir + "map_hg19_50kb.wig", + hg19_10kb_map = ichorDir + "map_hg19_10kb.wig", + hg38_1000kb_map = ichorDir + "map_hg38_1000kb.wig", + hg38_500kb_map = ichorDir + "map_hg38_500kb.wig", + hg38_50kb_map = ichorDir + "map_hg38_50kb.wig", + hg38_10kb_map = ichorDir + "map_hg38_10kb.wig", + output: + hg19_1Mb_rds = ichorDir + "HD_ULP_PoN_hg19_1Mb_median_normAutosome_median.rds", + hg19_500kb_rds = ichorDir + "HD_ULP_PoN_hg19_500kb_median_normAutosome_median.rds", + grch37_1Mb_rds = ichorDir + "HD_ULP_PoN_grch37_1Mb_median_normAutosome_median.rds", + grch37_500kb_rds = ichorDir + "HD_ULP_PoN_grch37_500kb_normAutosome_median.rds", + hs37d5_1Mb_rds = ichorDir + "HD_ULP_PoN_hs37d5_1Mb_median_normAutosome_median.rds", + hs37d5_500kb_rds = ichorDir + "HD_ULP_PoN_hs37d5_500kb_normAutosome_median.rds", + grch38_1Mb_rds = ichorDir + "HD_ULP_PoN_grch38_1Mb_median_normAutosome_median.rds", + grch38_500kb_rds = ichorDir + "HD_ULP_PoN_grch38_500kb_median_normAutosome_median.rds", + grch37_1000kb_gc = ichorDir + "gc_grch37_1000kb.wig", + grch37_500kb_gc = ichorDir + "gc_grch37_500kb.wig", + grch37_50kb_gc = ichorDir + "gc_grch37_50kb.wig", + grch37_10kb_gc = ichorDir + "gc_grch37_10kb.wig", + hs37d5_1000kb_gc = ichorDir + "gc_hs37d5_1000kb.wig", + hs37d5_500kb_gc = ichorDir + "gc_hs37d5_500kb.wig", + hs37d5_50kb_gc = ichorDir + "gc_hs37d5_50kb.wig", + hs37d5_10kb_gc = ichorDir + "gc_hs37d5_10kb.wig", + grch38_1000kb_gc = ichorDir + "gc_grch38_1000kb.wig", + grch38_500kb_gc = ichorDir + "gc_grch38_500kb.wig", + grch38_50kb_gc = ichorDir + "gc_grch38_50kb.wig", + grch38_10kb_gc = ichorDir + "gc_grch38_10kb.wig", + grch37_1000kb_map = ichorDir + "map_grch37_1000kb.wig", + grch37_500kb_map = ichorDir + "map_grch37_500kb.wig", + grch37_50kb_map = ichorDir + "map_grch37_50kb.wig", + grch37_10kb_map = ichorDir + "map_grch37_10kb.wig", + hs37d5_1000kb_map = ichorDir + 
"map_hs37d5_1000kb.wig", + hs37d5_500kb_map = ichorDir + "map_hs37d5_500kb.wig", + hs37d5_50kb_map = ichorDir + "map_hs37d5_50kb.wig", + hs37d5_10kb_map = ichorDir + "map_hs37d5_10kb.wig", + grch38_1000kb_map = ichorDir + "map_grch38_1000kb.wig", + grch38_500kb_map = ichorDir + "map_grch38_500kb.wig", + grch38_50kb_map = ichorDir + "map_grch38_50kb.wig", + grch38_10kb_map = ichorDir + "map_grch38_10kb.wig", + complete = touch(ichorDir + "symlink.done") + run: + op.relative_symlink(params.hg19_1Mb_rds, output.hg19_1Mb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.hg19_500kb_rds) + op.relative_symlink(params.hg19_1Mb_rds, output.grch37_1Mb_rds) + op.relative_symlink(params.hg19_1Mb_rds, output.hs37d5_1Mb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.grch37_500kb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.hs37d5_500kb_rds) + op.relative_symlink(params.hg38_1Mb_rds, output.grch38_1Mb_rds) + op.relative_symlink(params.hg38_500kb_rds, output.grch38_500kb_rds) + op.relative_symlink(params.hg19_1000kb_gc, output.grch37_1000kb_gc) + op.relative_symlink(params.hg19_500kb_gc, output.grch37_500kb_gc) + op.relative_symlink(params.hg19_50kb_gc, output.grch37_50kb_gc) + op.relative_symlink(params.hg19_10kb_gc, output.grch37_10kb_gc) + op.relative_symlink(params.hg19_1000kb_gc, output.hs37d5_1000kb_gc) + op.relative_symlink(params.hg19_500kb_gc, output.hs37d5_500kb_gc) + op.relative_symlink(params.hg19_50kb_gc, output.hs37d5_50kb_gc) + op.relative_symlink(params.hg19_10kb_gc, output.hs37d5_10kb_gc) + op.relative_symlink(params.hg38_1000kb_gc, output.grch38_1000kb_gc) + op.relative_symlink(params.hg38_500kb_gc, output.grch38_500kb_gc) + op.relative_symlink(params.hg38_50kb_gc, output.grch38_50kb_gc) + op.relative_symlink(params.hg38_10kb_gc, output.grch38_10kb_gc) + op.relative_symlink(params.hg19_1000kb_map, output.grch37_1000kb_map) + op.relative_symlink(params.hg19_500kb_map, output.grch37_500kb_map) + op.relative_symlink(params.hg19_50kb_map, 
output.grch37_50kb_map) + op.relative_symlink(params.hg19_10kb_map, output.grch37_10kb_map) + op.relative_symlink(params.hg19_1000kb_map, output.hs37d5_1000kb_map) + op.relative_symlink(params.hg19_500kb_map, output.hs37d5_500kb_map) + op.relative_symlink(params.hg19_50kb_map, output.hs37d5_50kb_map) + op.relative_symlink(params.hg19_10kb_map, output.hs37d5_10kb_map) + op.relative_symlink(params.hg38_1000kb_map, output.grch38_1000kb_map) + op.relative_symlink(params.hg38_500kb_map, output.grch38_500kb_map) + op.relative_symlink(params.hg38_50kb_map, output.grch38_50kb_map) + op.relative_symlink(params.hg38_10kb_map, output.grch38_10kb_map) + +### Run ichorCNA ### +# Symlinks the input files into the module results directory (under '00-inputs/') +rule _ichorcna_input_bam: + input: + bam = CFG["inputs"]["sample_bam"], + bai = CFG["inputs"]["sample_bai"] + output: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai", # specific to readCounter + run: + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) + +# This function will return a comma-separated list of chromosomes to include in readCounter +def get_chromosomes(wildcards): + chromosomes=[] + for i in range(1,23): + chromosomes.append(str(i)) + chromosomes.append("X") + chromosomes.append("Y") + if "38" in str(wildcards.genome_build): + chromosomes = ["chr" + x for x in chromosomes] + chromosomes= ",".join(chromosomes) + return chromosomes + +rule _ichorcna_read_counter: + input: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai", + ichorcna_package = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success", + symlink_complete = ichorDir + "symlink.done" + output: + CFG["dirs"]["readDepth"] + 
"{seq_type}--{genome_build}/{binSize}/{sample_id}.bin{binSize}.wig" + params: + binSize = CFG["options"]["readcounter"]["binSize"], + qual = CFG["options"]["readcounter"]["qual"], + chrs = get_chromosomes + conda: CFG["conda_envs"]["ichorcna"] + threads: CFG["threads"]["readcounter"] + resources: + **CFG["resources"]["readcounter"] + log: + CFG["logs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/{sample_id}.bin{binSize}.log" + shell: + "readCounter {input.bam} -c {params.chrs} -w {params.binSize} -q {params.qual} > {output} 2> {log}" + + +# This function will return a comma-separated list of chromosomes to include in runIchorCNA +def get_chromosomes_R(wildcards): + chromosomesR=[] + stringStart="c('" + for i in range(1,23): + chromosomesR.append(str(i)) + chromosomesR.append("X") + if "38" in str(wildcards.genome_build): + chromosomesR = ["chr" + x for x in chromosomesR] + chromosomesR= "','".join(chromosomesR) + stringEnd="')" + return stringStart + chromosomesR + stringEnd + +rule _run_ichorcna: + input: + tum = CFG["dirs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}.bin{binSize}.wig", + output: + corrDepth = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.correctedDepth.txt", + param = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.params.txt", + cna = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.cna.seg", + segTxt = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.seg.txt", + seg = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}.seg", + plot = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}/{tumour_id}_genomeWide.pdf", + params: + ichorDir = 
CFG["dirs"]["inputs"] + "ichorCNA/", + outDir = CFG["dirs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}/", + rscript = CFG["options"]["run"]["ichorCNA_rscript"], + name = "{tumour_id}", + ploidy = CFG["options"]["run"]["ichorCNA_ploidy"], + normal = CFG["options"]["run"]["ichorCNA_normal"], + gcwig = op.switch_on_wildcard("binSize", CFG["options"]["run"]["ichorCNA_gcWig"]), + mapwig = op.switch_on_wildcard("binSize", CFG["options"]["run"]["ichorCNA_mapWig"]), + normalpanel = op.switch_on_wildcard("binSize", CFG["options"]["run"]["ichorCNA_normalPanel"]), + estimateNormal = CFG["options"]["run"]["ichorCNA_estimateNormal"], + estimatePloidy = CFG["options"]["run"]["ichorCNA_estimatePloidy"], + estimateClonality = CFG["options"]["run"]["ichorCNA_estimateClonality"], + scStates = CFG["options"]["run"]["ichorCNA_scStates"], + maxCN = CFG["options"]["run"]["ichorCNA_maxCN"], + includeHOMD = CFG["options"]["run"]["ichorCNA_includeHOMD"], + chrs = get_chromosomes_R, + chrTrain = op.switch_on_wildcard("genome_build", CFG["options"]["run"]["ichorCNA_chrTrain"]), + genomeBuild = "{genome_build}", + genomeStyle = op.switch_on_wildcard("genome_build", CFG["options"]["run"]["ichorCNA_genomeStyle"]), + centromere = op.switch_on_wildcard("genome_build", CFG["options"]["run"]["ichorCNA_centromere"]), + fracReadsChrYMale = CFG["options"]["run"]["ichorCNA_fracReadsInChrYForMale"], + minMapScore = CFG["options"]["run"]["ichorCNA_minMapScore"], + maxFracGenomeSubclone = CFG["options"]["run"]["ichorCNA_maxFracGenomeSubclone"], + maxFracCNASubclone = CFG["options"]["run"]["ichorCNA_maxFracCNASubclone"], + exons = CFG["options"]["run"]["ichorCNA_exons"], + txnE = CFG["options"]["run"]["ichorCNA_txnE"], + txnStrength = CFG["options"]["run"]["ichorCNA_txnStrength"], + plotFileType = CFG["options"]["run"]["ichorCNA_plotFileType"], + plotYlim = CFG["options"]["run"]["ichorCNA_plotYlim"], + libdir = CFG["dirs"]["inputs"] + "ichorCNA/" + 
CFG["options"]["run"]["ichorCNA_libdir"] + conda: CFG["conda_envs"]["ichorcna"] + threads: CFG["threads"]["run"] + resources: + **CFG["resources"]["run"] + log: + stdout = CFG["logs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}.stdout.log", + stderr = CFG["logs"]["seg"] + "{seq_type}--{genome_build}/{binSize}/{tumour_id}--{normal_id}--{pair_status}.stderr.log" + shell: + "Rscript {params.rscript} --id {params.name} --libdir {params.libdir} --WIG {input.tum} --gcWig {params.ichorDir}{params.gcwig} --mapWig {params.ichorDir}{params.mapwig} --normalPanel {params.ichorDir}{params.normalpanel} --ploidy \"{params.ploidy}\" --normal \"{params.normal}\" --maxCN {params.maxCN} --includeHOMD {params.includeHOMD} --chrs \"{params.chrs}\" --chrTrain \"{params.chrTrain}\" --genomeStyle {params.genomeStyle} --genomeBuild {params.genomeBuild} --estimateNormal {params.estimateNormal} --estimatePloidy {params.estimatePloidy} --estimateScPrevalence {params.estimateClonality} --scStates \"{params.scStates}\" --centromere {params.ichorDir}{params.centromere} --exons.bed {params.exons} --txnE {params.txnE} --txnStrength {params.txnStrength} --minMapScore {params.minMapScore} --fracReadsInChrYForMale {params.fracReadsChrYMale} --maxFracGenomeSubclone {params.maxFracGenomeSubclone} --maxFracCNASubclone {params.maxFracCNASubclone} --plotFileType {params.plotFileType} --plotYLim \"{params.plotYlim}\" --outDir {params.outDir} > {log.stdout} 2> {log.stderr}" + +# Symlinks the final output files into the module results directory (under '99-outputs/') +rule _ichorcna_output: + input: + corrDepth = str(rules._run_ichorcna.output.corrDepth), + param = str(rules._run_ichorcna.output.param), + cna = str(rules._run_ichorcna.output.cna), + segTxt = str(rules._run_ichorcna.output.segTxt), + seg = str(rules._run_ichorcna.output.seg), + plot = str(rules._run_ichorcna.output.plot) + output: + corrDepth = CFG["dirs"]["outputs"] + 
"{seq_type}--{genome_build}/corrDepth/{binSize}/{tumour_id}--{normal_id}--{pair_status}.corrDepth.txt", + param = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/param/{binSize}/{tumour_id}--{normal_id}--{pair_status}.param.txt", + cna = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/binCNA/{binSize}/{tumour_id}--{normal_id}--{pair_status}.cna.seg", + segTxt = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/seg_txt/{binSize}/{tumour_id}--{normal_id}--{pair_status}.seg.txt", + seg = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/seg/{binSize}/{tumour_id}--{normal_id}--{pair_status}.seg", + plot = CFG["dirs"]["outputs"] + "{seq_type}--{genome_build}/plot/{binSize}/{tumour_id}--{normal_id}--{pair_status}_genomeWide.pdf" + run: + op.relative_symlink(input.corrDepth, output.corrDepth, in_module=True) + op.relative_symlink(input.param, output.param, in_module=True) + op.relative_symlink(input.cna, output.cna, in_module=True) + op.relative_symlink(input.segTxt, output.segTxt, in_module=True) + op.relative_symlink(input.seg, output.seg, in_module=True) + op.relative_symlink(input.plot, output.plot, in_module=True) + +# Generates the target sentinels for each run, which generate the symlinks +rule _ichorcna_all: + input: + expand( + [ + str(rules._ichorcna_output.output.corrDepth), + str(rules._ichorcna_output.output.param), + str(rules._ichorcna_output.output.cna), + str(rules._ichorcna_output.output.segTxt), + str(rules._ichorcna_output.output.seg), + str(rules._ichorcna_output.output.plot) + ], + zip, # Run expand() with zip(), not product() + seq_type=CFG["runs"]["tumour_seq_type"], + genome_build=CFG["runs"]["tumour_genome_build"], + pair_status=CFG["runs"]["pair_status"], + tumour_id=CFG["runs"]["tumour_sample_id"], + normal_id=CFG["runs"]["normal_sample_id"], + binSize=[CFG["options"]["readcounter"]["binSize"]] * len(CFG["runs"]["tumour_sample_id"])) + + + +##### CLEANUP ##### + + +# Perform some clean-up tasks, including storing the 
module-specific +# configuration on disk and deleting the `CFG` variable +op.cleanup_module(CFG) diff --git a/modules/ichorcna/1.0/schemas/base-1.0.yaml b/modules/ichorcna/1.0/schemas/base-1.0.yaml new file mode 120000 index 000000000..0a69d1ceb --- /dev/null +++ b/modules/ichorcna/1.0/schemas/base-1.0.yaml @@ -0,0 +1 @@ +../../../../schemas/base/base-1.0.yaml \ No newline at end of file diff --git a/modules/ichorcna/1.0/src/runIchorCNA.R b/modules/ichorcna/1.0/src/runIchorCNA.R new file mode 100755 index 000000000..ec1d7384f --- /dev/null +++ b/modules/ichorcna/1.0/src/runIchorCNA.R @@ -0,0 +1,423 @@ +# file: ichorCNA.R +# authors: Gavin Ha, Ph.D. +# Fred Hutch +# contact: +# +# Justin Rhoades +# Broad Institute +# contact: + +# ichorCNA: https://github.com/broadinstitute/ichorCNA +# date: July 24, 2019 +# description: Hidden Markov model (HMM) to analyze Ultra-low pass whole genome sequencing (ULP-WGS) data. +# This script is the main script to run the HMM. + +library(optparse) + +option_list <- list( + make_option(c("--WIG"), type = "character", help = "Path to tumor WIG file. Required."), + make_option(c("--NORMWIG"), type = "character", default=NULL, help = "Path to normal WIG file. Default: [%default]"), + make_option(c("--gcWig"), type = "character", help = "Path to GC-content WIG file; Required"), + make_option(c("--mapWig"), type = "character", default=NULL, help = "Path to mappability score WIG file. Default: [%default]"), + make_option(c("--normalPanel"), type="character", default=NULL, help="Median corrected depth from panel of normals. Default: [%default]"), + make_option(c("--exons.bed"), type = "character", default=NULL, help = "Path to bed file containing exon regions. Default: [%default]"), + make_option(c("--id"), type = "character", default="test", help = "Patient ID. 
Default: [%default]"), + make_option(c("--centromere"), type="character", default=NULL, help = "File containing Centromere locations; if not provided then will use hg19 version from ichorCNA package. Default: [%default]"), + make_option(c("--minMapScore"), type = "numeric", default=0.9, help="Include bins with a minimum mappability score of this value. Default: [%default]."), + make_option(c("--rmCentromereFlankLength"), type="numeric", default=1e5, help="Length of region flanking centromere to remove. Default: [%default]"), + make_option(c("--normal"), type="character", default="0.5", help = "Initial normal contamination; can be more than one value if additional normal initializations are desired. Default: [%default]"), + make_option(c("--scStates"), type="character", default="NULL", help = "Subclonal states to consider. Default: [%default]"), + make_option(c("--coverage"), type="numeric", default=NULL, help = "PICARD sequencing coverage. Default: [%default]"), + make_option(c("--lambda"), type="character", default="NULL", help="Initial Student's t precision; must contain 4 values (e.g. c(1500,1500,1500,1500)); if not provided then will automatically use based on variance of data. Default: [%default]"), + make_option(c("--lambdaScaleHyperParam"), type="numeric", default=3, help="Hyperparameter (scale) for Gamma prior on Student's-t precision. Default: [%default]"), + # make_option(c("--kappa"), type="character", default=50, help="Initial state distribution"), + make_option(c("--ploidy"), type="character", default="2", help = "Initial tumour ploidy; can be more than one value if additional ploidy initializations are desired. Default: [%default]"), + make_option(c("--maxCN"), type="numeric", default=7, help = "Total clonal CN states. Default: [%default]"), + make_option(c("--estimateNormal"), type="logical", default=TRUE, help = "Estimate normal. 
Default: [%default]"), + make_option(c("--estimateScPrevalence"), type="logical", default=TRUE, help = "Estimate subclonal prevalence. Default: [%default]"), + make_option(c("--estimatePloidy"), type="logical", default=TRUE, help = "Estimate tumour ploidy. Default: [%default]"), + make_option(c("--maxFracCNASubclone"), type="numeric", default=0.7, help="Exclude solutions with fraction of subclonal events greater than this value. Default: [%default]"), + make_option(c("--maxFracGenomeSubclone"), type="numeric", default=0.5, help="Exclude solutions with subclonal genome fraction greater than this value. Default: [%default]"), + make_option(c("--minSegmentBins"), type="numeric", default=50, help="Minimum number of bins for largest segment threshold required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction."), + make_option(c("--altFracThreshold"), type="numeric", default=0.05, help="Minimum proportion of bins altered required to estimate tumor fraction; if below this threshold, then will be assigned zero tumor fraction. Default: [%default]"), + make_option(c("--chrNormalize"), type="character", default="c(1:22)", help = "Specify chromosomes to normalize GC/mappability biases. Default: [%default]"), + make_option(c("--chrTrain"), type="character", default="c(1:22)", help = "Specify chromosomes to estimate params. Default: [%default]"), + make_option(c("--chrs"), type="character", default="c(1:22,\"X\")", help = "Specify chromosomes to analyze. Default: [%default]"), + make_option(c("--genomeBuild"), type="character", default="hg19", help="Geome build. Default: [%default]"), + make_option(c("--genomeStyle"), type = "character", default = "NCBI", help = "NCBI or UCSC chromosome naming convention; use UCSC if desired output is to have \"chr\" string. [Default: %default]"), + make_option(c("--normalizeMaleX"), type="logical", default=TRUE, help = "If male, then normalize chrX by median. 
Default: [%default]"), + make_option(c("--minTumFracToCorrect"), type="numeric", default=0.1, help = "Tumor-fraction correction of bin and segment-level CNA if sample has minimum estimated tumor fraction. [Default: %default]"), + make_option(c("--fracReadsInChrYForMale"), type="numeric", default=0.001, help = "Threshold for fraction of reads in chrY to assign as male. Default: [%default]"), + make_option(c("--includeHOMD"), type="logical", default=FALSE, help="If FALSE, then exclude HOMD state. Useful when using large bins (e.g. 1Mb). Default: [%default]"), + make_option(c("--txnE"), type="numeric", default=0.9999999, help = "Self-transition probability. Increase to decrease number of segments. Default: [%default]"), + make_option(c("--txnStrength"), type="numeric", default=1e7, help = "Transition pseudo-counts. Exponent should be the same as the number of decimal places of --txnE. Default: [%default]"), + make_option(c("--plotFileType"), type="character", default="pdf", help = "File format for output plots. Default: [%default]"), + make_option(c("--plotYLim"), type="character", default="c(-2,2)", help = "ylim to use for chromosome plots. Default: [%default]"), + make_option(c("--outDir"), type="character", default="./", help = "Output Directory. Default: [%default]"), + make_option(c("--libdir"), type = "character", default=NULL, help = "Script library path. Usually exclude this argument unless custom modifications have been made to the ichorCNA R package code and the user would like to source those R files. 
Default: [%default]") +) +parseobj <- OptionParser(option_list=option_list) +opt <- parse_args(parseobj) +print(opt) +options(scipen=0, stringsAsFactors=F) + +library(HMMcopy) +library(GenomicRanges) +library(GenomeInfoDb) +options(stringsAsFactors=FALSE) +options(bitmapType='cairo') + +patientID <- opt$id +tumour_file <- opt$WIG +normal_file <- opt$NORMWIG +gcWig <- opt$gcWig +mapWig <- opt$mapWig +normal_panel <- opt$normalPanel +exons.bed <- opt$exons.bed # "0" if none specified +centromere <- opt$centromere +minMapScore <- opt$minMapScore +flankLength <- opt$rmCentromereFlankLength +normal <- eval(parse(text = opt$normal)) +scStates <- eval(parse(text = opt$scStates)) +lambda <- eval(parse(text = opt$lambda)) +lambdaScaleHyperParam <- opt$lambdaScaleHyperParam +estimateNormal <- opt$estimateNormal +estimatePloidy <- opt$estimatePloidy +estimateScPrevalence <- opt$estimateScPrevalence +maxFracCNASubclone <- opt$maxFracCNASubclone +maxFracGenomeSubclone <- opt$maxFracGenomeSubclone +minSegmentBins <- opt$minSegmentBins +altFracThreshold <- opt$altFracThreshold +ploidy <- eval(parse(text = opt$ploidy)) +coverage <- opt$coverage +maxCN <- opt$maxCN +txnE <- opt$txnE +txnStrength <- opt$txnStrength +normalizeMaleX <- as.logical(opt$normalizeMaleX) +includeHOMD <- as.logical(opt$includeHOMD) +minTumFracToCorrect <- opt$minTumFracToCorrect +fracReadsInChrYForMale <- opt$fracReadsInChrYForMale +chrXMedianForMale <- -0.1 +outDir <- opt$outDir +libdir <- opt$libdir +plotFileType <- opt$plotFileType +plotYLim <- eval(parse(text=opt$plotYLim)) +gender <- NULL +outImage <- paste0(outDir,"/", patientID,".RData") +genomeBuild <- opt$genomeBuild +genomeStyle <- opt$genomeStyle +chrs <- as.character(eval(parse(text = opt$chrs))) +chrTrain <- as.character(eval(parse(text=opt$chrTrain))); +chrNormalize <- as.character(eval(parse(text=opt$chrNormalize))); +seqlevelsStyle(chrs) <- genomeStyle +seqlevelsStyle(chrNormalize) <- genomeStyle +seqlevelsStyle(chrTrain) <- genomeStyle + 
+## load ichorCNA library or source R scripts +if (!is.null(libdir) && libdir != "None"){ + source(paste0(libdir,"/R/utils.R")) + source(paste0(libdir,"/R/segmentation.R")) + source(paste0(libdir,"/R/EM.R")) + source(paste0(libdir,"/R/output.R")) + source(paste0(libdir,"/R/plotting.R")) +} else { + library(ichorCNA) +} + +## load seqinfo +# seqinfo <- getSeqInfo(genomeBuild, genomeStyle) +seqinfo <- NULL + +if (substr(tumour_file,nchar(tumour_file)-2,nchar(tumour_file)) == "wig") { + wigFiles <- data.frame(cbind(patientID, tumour_file)) +} else { + wigFiles <- read.delim(tumour_file, header=F, as.is=T) +} + +## FILTER BY EXONS IF PROVIDED ## +## add gc and map to GRanges object ## +if (is.null(exons.bed) || exons.bed == "None" || exons.bed == "NULL"){ + targetedSequences <- NULL +}else{ + targetedSequences <- read.delim(exons.bed, header=T, sep="\t") +} + +## load PoN +if (is.null(normal_panel) || normal_panel == "None" || normal_panel == "NULL"){ + normal_panel <- NULL +} + +if (is.null(centromere) || centromere == "None" || centromere == "NULL"){ # no centromere file provided + centromere <- system.file("extdata", "GRCh37.p13_centromere_UCSC-gapTable.txt", + package = "ichorCNA") +} +centromere <- read.delim(centromere,header=T,stringsAsFactors=F,sep="\t") +save.image(outImage) +## LOAD IN WIG FILES ## +numSamples <- nrow(wigFiles) + +tumour_copy <- list() +for (i in 1:numSamples) { + id <- wigFiles[i,1] + ## create output directories for each sample ## + dir.create(paste0(outDir, "/", id, "/"), recursive = TRUE) + ### LOAD TUMOUR AND NORMAL FILES ### + message("Loading tumour file:", wigFiles[i,1]) + tumour_reads <- wigToGRanges(wigFiles[i,2]) + + ## LOAD GC/MAP WIG FILES ### + # find the bin size and load corresponding wig files # + binSize <- as.data.frame(tumour_reads[1,])$width + message("Reading GC and mappability files") + if (is.null(gcWig) || gcWig == "None" || gcWig == "NULL"){ + stop("GC wig file is required") + } + gc <- wigToGRanges(gcWig) + if 
(is.null(mapWig) || mapWig == "None" || mapWig == "NULL"){ + message("No mappability wig file input, excluding from correction") + map <- NULL + } else { + map <- wigToGRanges(mapWig) + } + message("Correcting Tumour") + + counts <- loadReadCountsFromWig(tumour_reads, chrs = chrs, gc = gc, map = map, + centromere = centromere, flankLength = flankLength, + targetedSequences = targetedSequences, chrXMedianForMale = chrXMedianForMale, + genomeStyle = genomeStyle, fracReadsInChrYForMale = fracReadsInChrYForMale, + chrNormalize = chrNormalize, mapScoreThres = minMapScore) + tumour_copy[[id]] <- counts$counts #as(counts$counts, "GRanges") + gender <- counts$gender + ## load in normal file if provided + if (!is.null(normal_file) && normal_file != "None" && normal_file != "NULL"){ + message("Loading normal file:", normal_file) + normal_reads <- wigToGRanges(normal_file) + message("Correcting Normal") + counts <- loadReadCountsFromWig(normal_reads, chrs=chrs, gc=gc, map=map, + centromere=centromere, flankLength = flankLength, targetedSequences=targetedSequences, + genomeStyle = genomeStyle, chrNormalize = chrNormalize, mapScoreThres = minMapScore) + normal_copy <- counts$counts #as(counts$counts, "GRanges") + gender.normal <- counts$gender + }else{ + normal_copy <- NULL + } + + ### DETERMINE GENDER ### + ## if normal file not given, use chrY, else use chrX + message("Determining gender...", appendLF = FALSE) + gender.mismatch <- FALSE + if (!is.null(normal_copy)){ + if (gender$gender != gender.normal$gender){ #use tumour # use normal if given + # check if normal is same gender as tumour + gender.mismatch <- TRUE + } + } + message("Gender ", gender$gender) + + ## NORMALIZE GENOME-WIDE BY MATCHED NORMAL OR NORMAL PANEL (MEDIAN) ## + tumour_copy[[id]] <- normalizeByPanelOrMatchedNormal(tumour_copy[[id]], chrs = chrs, + normal_panel = normal_panel, normal_copy = normal_copy, + gender = gender$gender, normalizeMaleX = normalizeMaleX) + + ### OUTPUT FILE ### + ### PUTTING 
TOGETHER THE COLUMNS IN THE OUTPUT ### + outMat <- as.data.frame(tumour_copy[[id]]) + #outMat <- outMat[,c(1,2,3,12)] + outMat <- outMat[,c("seqnames","start","end","copy")] + colnames(outMat) <- c("chr","start","end","log2_TNratio_corrected") + outFile <- paste0(outDir,"/",id,".correctedDepth.txt") + message(paste("Outputting to:", outFile)) + write.table(outMat, file=outFile, row.names=F, col.names=T, quote=F, sep="\t") + +} ## end of for each sample + +chrInd <- as.character(seqnames(tumour_copy[[1]])) %in% chrTrain +## get positions that are valid +valid <- tumour_copy[[1]]$valid +if (length(tumour_copy) >= 2) { + for (i in 2:length(tumour_copy)){ + valid <- valid & tumour_copy[[i]]$valid + } +} +save.image(outImage) + +### RUN HMM ### +## store the results for different normal and ploidy solutions ## +ptmTotalSolutions <- proc.time() # start total timer +results <- list() +loglik <- as.data.frame(matrix(NA, nrow = length(normal) * length(ploidy), ncol = 7, + dimnames = list(c(), c("init", "n_est", "phi_est", "BIC", + "Frac_genome_subclonal", "Frac_CNA_subclonal", "loglik")))) +counter <- 1 +compNames <- rep(NA, nrow(loglik)) +mainName <- rep(NA, length(normal) * length(ploidy)) +#### restart for purity and ploidy values #### +for (n in normal){ + for (p in ploidy){ + if (n == 0.95 & p != 2) { + next + } + logR <- as.data.frame(lapply(tumour_copy, function(x) { x$copy })) # NEED TO EXCLUDE CHR X # + param <- getDefaultParameters(logR[valid & chrInd, , drop=F], maxCN = maxCN, includeHOMD = includeHOMD, + ct.sc=scStates, ploidy = floor(p), e=txnE, e.same = 50, strength=txnStrength) + param$phi_0 <- rep(p, numSamples) + param$n_0 <- rep(n, numSamples) + + ############################################ + ######## CUSTOM PARAMETER SETTINGS ######### + ############################################ + # 0.1x cfDNA # + if (is.null(lambda)){ + logR.var <- 1 / ((apply(logR, 2, sd, na.rm = TRUE) / sqrt(length(param$ct))) ^ 2) + param$lambda <- rep(logR.var, length(param$ct)) 
+ param$lambda[param$ct %in% c(2)] <- logR.var + param$lambda[param$ct %in% c(1,3)] <- logR.var + param$lambda[param$ct >= 4] <- logR.var / 5 + param$lambda[param$ct == max(param$ct)] <- logR.var / 15 + param$lambda[param$ct.sc.status] <- logR.var / 10 + }else{ + param$lambda[param$ct %in% c(2)] <- lambda[2] + param$lambda[param$ct %in% c(1)] <- lambda[1] + param$lambda[param$ct %in% c(3)] <- lambda[3] + param$lambda[param$ct >= 4] <- lambda[4] + param$lambda[param$ct == max(param$ct)] <- lambda[2] / 15 + param$lambda[param$ct.sc.status] <- lambda[2] / 10 + } + param$alphaLambda <- rep(lambdaScaleHyperParam, length(param$ct)) + # 1x bulk tumors # + #param$lambda[param$ct %in% c(2)] <- 2000 + #param$lambda[param$ct %in% c(1)] <- 1750 + #param$lambda[param$ct %in% c(3)] <- 1750 + #param$lambda[param$ct >= 4] <- 1500 + #param$lambda[param$ct == max(param$ct)] <- 1000 / 25 + #param$lambda[param$ct.sc.status] <- 1000 / 75 + #param$alphaLambda[param$ct.sc.status] <- 4 + #param$alphaLambda[param$ct %in% c(1,3)] <- 5 + #param$alphaLambda[param$ct %in% c(2)] <- 5 + #param$alphaLambda[param$ct == max(param$ct)] <- 4 + + ############################################# + ################ RUN HMM #################### + ############################################# + hmmResults.cor <- HMMsegment(tumour_copy, valid, dataType = "copy", + param = param, chrTrain = chrTrain, maxiter = 50, + estimateNormal = estimateNormal, estimatePloidy = estimatePloidy, + estimateSubclone = estimateScPrevalence, verbose = TRUE) + + for (s in 1:numSamples){ + iter <- hmmResults.cor$results$iter + id <- names(hmmResults.cor$cna)[s] + + ## convert full diploid solution (of chrs to train) to have 1.0 normal or 0.0 purity + ## check if there is an altered segment that has at least a minimum # of bins + segsS <- hmmResults.cor$results$segs[[s]] + segsS <- segsS[segsS$chr %in% chrTrain, ] + segAltInd <- which(segsS$event != "NEUT") + maxBinLength = -Inf + if (sum(segAltInd) > 0){ + maxInd <- 
which.max(segsS$end[segAltInd] - segsS$start[segAltInd] + 1) + maxSegRD <- GRanges(seqnames=segsS$chr[segAltInd[maxInd]], + ranges=IRanges(start=segsS$start[segAltInd[maxInd]], end=segsS$end[segAltInd[maxInd]])) + hits <- findOverlaps(query=maxSegRD, subject=tumour_copy[[s]][valid, ]) + maxBinLength <- length(subjectHits(hits)) + } + ## check if there are proportion of total bins altered + # if segment size smaller than minSegmentBins, but altFrac > altFracThreshold, then still estimate TF + cnaS <- hmmResults.cor$cna[[s]] + altInd <- cnaS[cnaS$chr %in% chrTrain, "event"] == "NEUT" + altFrac <- sum(!altInd, na.rm=TRUE) / length(altInd) + if ((maxBinLength <= minSegmentBins) & (altFrac <= altFracThreshold)){ + hmmResults.cor$results$n[s, iter] <- 1.0 + } + + # correct integer copy number based on estimated purity and ploidy + correctedResults <- correctIntegerCN(cn = hmmResults.cor$cna[[s]], + segs = hmmResults.cor$results$segs[[s]], + purity = 1 - hmmResults.cor$results$n[s, iter], ploidy = hmmResults.cor$results$phi[s, iter], + cellPrev = 1 - hmmResults.cor$results$sp[s, iter], + maxCNtoCorrect.autosomes = maxCN, maxCNtoCorrect.X = maxCN, minPurityToCorrect = minTumFracToCorrect, + gender = gender$gender, chrs = chrs, correctHOMD = includeHOMD) + hmmResults.cor$results$segs[[s]] <- correctedResults$segs + hmmResults.cor$cna[[s]] <- correctedResults$cn + + ## plot solution ## + outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_", "n", n, "-p", p) + mainName[counter] <- paste0(id, ", n: ", n, ", p: ", p, ", log likelihood: ", signif(hmmResults.cor$results$loglik[hmmResults.cor$results$iter], digits = 4)) + plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType=plotFileType, + logR.column = "logR", call.column = "Corrected_Call", + plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, seqinfo=seqinfo, main=mainName[counter]) + } + iter <- hmmResults.cor$results$iter + results[[counter]] <- hmmResults.cor + loglik[counter, 
"loglik"] <- signif(hmmResults.cor$results$loglik[iter], digits = 4) + subClonalBinCount <- unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$subclone.status) })) + fracGenomeSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ nrow(x) })) + fracAltSub <- subClonalBinCount / unlist(lapply(hmmResults.cor$cna, function(x){ sum(x$copy.number != 2) })) + fracAltSub <- lapply(fracAltSub, function(x){if (is.na(x)){0}else{x}}) + loglik[counter, "Frac_genome_subclonal"] <- paste0(signif(fracGenomeSub, digits=2), collapse=",") + loglik[counter, "Frac_CNA_subclonal"] <- paste0(signif(as.numeric(fracAltSub), digits=2), collapse=",") + loglik[counter, "init"] <- paste0("n", n, "-p", p) + loglik[counter, "n_est"] <- paste(signif(hmmResults.cor$results$n[, iter], digits = 2), collapse = ",") + loglik[counter, "phi_est"] <- paste(signif(hmmResults.cor$results$phi[, iter], digits = 4), collapse = ",") + + counter <- counter + 1 + } +} +## get total time for all solutions ## +elapsedTimeSolutions <- proc.time() - ptmTotalSolutions +message("Total ULP-WGS HMM Runtime: ", format(elapsedTimeSolutions[3] / 60, digits = 2), " min.") + +### SAVE R IMAGE ### +save.image(outImage) +#save(tumour_copy, results, loglik, file=paste0(outDir,"/",id,".RData")) + +### SELECT SOLUTION WITH LARGEST LIKELIHOOD ### +loglik <- loglik[!is.na(loglik$init), ] +if (estimateScPrevalence){ ## sort but excluding solutions with too large % subclonal + fracInd <- which(loglik[, "Frac_CNA_subclonal"] <= maxFracCNASubclone & + loglik[, "Frac_genome_subclonal"] <= maxFracGenomeSubclone) + if (length(fracInd) > 0){ ## if there is a solution satisfying % subclonal + ind <- fracInd[order(loglik[fracInd, "loglik"], decreasing=TRUE)] + }else{ # otherwise just take largest likelihood + ind <- order(as.numeric(loglik[, "loglik"]), decreasing=TRUE) + } +}else{#sort by likelihood only + ind <- order(as.numeric(loglik[, "loglik"]), decreasing=TRUE) +} + +#new loop by order of solutions (ind) 
+outPlotFile <- paste0(outDir, "/", id, "/", id, "_genomeWide_all_sols") +for(i in 1:length(ind)) { + hmmResults.cor <- results[[ind[i]]] + turnDevOff <- FALSE + turnDevOn <- FALSE + if (i == 1){ + turnDevOn <- TRUE + } + if (i == length(ind)){ + turnDevOff <- TRUE + } + plotGWSolution(hmmResults.cor, s=s, outPlotFile=outPlotFile, plotFileType="pdf", + logR.column = "logR", call.column = "Corrected_Call", + plotYLim=plotYLim, estimateScPrevalence=estimateScPrevalence, + seqinfo = seqinfo, + turnDevOn = turnDevOn, turnDevOff = turnDevOff, main=mainName[ind[i]]) +} + +hmmResults.cor <- results[[ind[1]]] +hmmResults.cor$results$loglik <- as.data.frame(loglik) +hmmResults.cor$results$gender <- gender$gender +hmmResults.cor$results$chrYCov <- gender$chrYCovRatio +hmmResults.cor$results$chrXMedian <- gender$chrXMedian +hmmResults.cor$results$coverage <- coverage + +outputHMM(cna = hmmResults.cor$cna, segs = hmmResults.cor$results$segs, + results = hmmResults.cor$results, patientID = patientID, outDir=outDir) +outFile <- paste0(outDir, "/", patientID, ".params.txt") +outputParametersToFile(hmmResults.cor, file = outFile) + +## plot solutions for all samples +plotSolutions(hmmResults.cor, tumour_copy, chrs, outDir, numSamples=numSamples, + logR.column = "logR", call.column = "Corrected_Call", + plotFileType=plotFileType, plotYLim=plotYLim, seqinfo = seqinfo, + estimateScPrevalence=estimateScPrevalence, maxCN=maxCN) \ No newline at end of file diff --git a/modules/ichorcna/1.1/config/default.yaml b/modules/ichorcna/1.1/config/default.yaml new file mode 100644 index 000000000..4844ba015 --- /dev/null +++ b/modules/ichorcna/1.1/config/default.yaml @@ -0,0 +1,115 @@ +lcr-modules: + + ichorcna: + + inputs: + # Available wildcards: {seq_type} {genome_build} {sample_id} + sample_bam: "__UPDATE__" + sample_bai: "__UPDATE__" + + + scratch_subdirectories: [] + + options: + deeptools: + qual: 20 # only includes reads with mapping quality greater than 20 + binSize: 1000000 # set 
window size to compute coverage + # available binSizes are: 1000000, 500000, 50000, 10000 + flagExclude: 1028 + opt: " --ignoreDuplicates --extendReads " + run: + ichorCNA_libdir: "" + ichorCNA_rscript: "{MODSDIR}/src/runIchorCNA.R" + # use panel matching same bin size (optional) + ichorCNA_normalPanel: + "1000000": "inst/extdata/HD_ULP_PoN_{genome_build}_1Mb_median_normAutosome_median.rds" + "500000": "inst/extdata/HD_ULP_PoN_{genome_build}_500kb_median_normAutosome_median.rds" + # must use gc wig file corresponding to same binSize (required) + ichorCNA_gcWig: + "1000000": "inst/extdata/gc_{genome_build}_1000kb.wig" + "500000": "inst/extdata/gc_{genome_build}_500kb.wig" + "50000": "inst/extdata/gc_{genome_build}_50kb.wig" + "10000": "inst/extdata/gc_{genome_build}_10kb.wig" + # must use map wig file corresponding to same binSize (required) + ichorCNA_mapWig: + "1000000": "inst/extdata/map_{genome_build}_1000kb.wig" + "500000": "inst/extdata/map_{genome_build}_500kb.wig" + "50000": "inst/extdata/map_{genome_build}_50kb.wig" + "10000": "inst/extdata/map_{genome_build}_10kb.wig" + # use bed file if sample has targeted regions, eg. exome data (optional) + ichorCNA_exons: NULL + ichorCNA_centromere: + grch37: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + hg19: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + hs37d5: "inst/extdata/GRCh37.p13_centromere_UCSC-gapTable.txt" + grch38: "inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt" + hg38: "inst/extdata/GRCh38.GCA_000001405.2_centromere_acen.txt" + ichorCNA_minMapScore: 0.75 + ichorCNA_fracReadsInChrYForMale: 0.002 # Threshold for fraction of reads in chrY to assign as male + ichorCNA_genomeStyle: # can set this to UCSC or NCBI + grch37: "NCBI" + hg19: "NCBI" + hs37d5: "NCBI" + grch38: "UCSC" + hg38: "UCSC" + # chrs used for training ichorCNA parameters, e.g. tumor fraction. 
+ ichorCNA_chrTrain: + grch37: "c(1:22)" + hg19: "c(1:22)" + hs37d5: "c(1:22)" + grch38: "paste0('chr', c(1:22))" + hg38: "paste0('chr', c(1:22))" + # non-tumor fraction parameter restart values; higher values should be included for cfDNA + ichorCNA_normal: "c(0.5,0.6,0.7,0.8,0.9,0.95)" + # ploidy parameter restart values + ichorCNA_ploidy: "c(2,3,4)" + ichorCNA_estimateNormal: TRUE + ichorCNA_estimatePloidy: TRUE + ichorCNA_estimateClonality: TRUE + # states to use for subclonal CN + ichorCNA_scStates: "c(1,3)" + # set maximum copy number to use + ichorCNA_maxCN: 5 + # TRUE/FALSE to include homozygous deletion state # FALSE for low coverage libraries (ex. 0.1x) ; can turn on for higher coverage data (ex. >10x) + ichorCNA_includeHOMD: FALSE + # Exclude solutions if total length of subclonal CNAs > this fraction of the genome + ichorCNA_maxFracGenomeSubclone: 0.5 + # Exclude solutions if total length of subclonal CNAs > this fraction of total CNA length + ichorCNA_maxFracCNASubclone: 0.7 + # control segmentation - higher (e.g. 0.9999999) leads to higher specificity and fewer segments + # lower (e.g. 0.99) leads to higher sensitivity and more segments + ichorCNA_txnE: 0.9399999 + # control segmentation - higher (e.g. 10000000) leads to higher specificity and fewer segments + # lower (e.g. 
100) leads to higher sensitivity and more segments + ichorCNA_txnStrength: 10000 + ichorCNA_plotFileType: "pdf" + ichorCNA_plotYlim: "c(-2,2)" + + + conda_envs: + ichorcna: "{MODSDIR}/envs/ichorcna.env.yaml" + deeptools: "{MODSDIR}/envs/deeptools.env.yaml" + bedops_tools: "{MODSDIR}/envs/bedops_tools.env.yaml" + ucsc-bigwigtowig: "{MODSDIR}/envs/ucsc-bigwigtowig.env.yaml" + + threads: + deeptools: 20 + ucsc: 4 + run: 4 + + resources: + deeptools: + mem_mb: 40000 + bam: 1 + ucsc: + mem_mb: 6000 + bam: 1 + run: + mem_mb: 6000 + bam: 1 + + pairing_config: + genome: + run_paired_tumours: False + run_unpaired_tumours_with: "no_normal" + run_paired_tumours_as_unpaired: True diff --git a/modules/ichorcna/1.1/envs/bedops_tools.env.yaml b/modules/ichorcna/1.1/envs/bedops_tools.env.yaml new file mode 100644 index 000000000..2052a2e05 --- /dev/null +++ b/modules/ichorcna/1.1/envs/bedops_tools.env.yaml @@ -0,0 +1,29 @@ +name: null +channels: + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - bedops=2.4.39 + - bedtools=2.30.0 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.10.26 + - curl=7.80.0 + - krb5=1.19.2 + - libcurl=7.80.0 + - libedit=3.1.20210910 + - libev=4.33 + - libgcc=7.2.0 + - libgcc-ng=9.3.0 + - libgomp=9.3.0 + - libnghttp2=1.46.0 + - libssh2=1.9.0 + - libstdcxx-ng=9.3.0 + - ncurses=6.3 + - openssl=1.1.1l + - samtools=1.7 + - xz=5.2.5 + - zlib=1.2.11 +prefix: /projects/rmorin/projects/tumour_contam/envs/bedops_tools diff --git a/modules/ichorcna/1.1/envs/deeptools.env.yaml b/modules/ichorcna/1.1/envs/deeptools.env.yaml new file mode 100644 index 000000000..fab883742 --- /dev/null +++ b/modules/ichorcna/1.1/envs/deeptools.env.yaml @@ -0,0 +1,76 @@ +name: null +channels: + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - blas=1.0 + - brotli=1.0.9 + - bzip2=1.0.8 + - c-ares=1.17.1 + - ca-certificates=2021.10.26 + - certifi=2021.10.8 + - curl=7.80.0 + - cycler=0.11.0 + - 
deeptools=3.5.1 + - deeptoolsintervals=0.1.9 + - fonttools=4.25.0 + - freetype=2.11.0 + - giflib=5.2.1 + - intel-openmp=2021.4.0 + - jpeg=9d + - kiwisolver=1.3.1 + - krb5=1.19.2 + - lcms2=2.12 + - ld_impl_linux-64=2.35.1 + - libcurl=7.80.0 + - libdeflate=1.0 + - libedit=3.1.20210910 + - libev=4.33 + - libffi=3.3 + - libgcc-ng=9.3.0 + - libgfortran-ng=7.5.0 + - libgfortran4=7.5.0 + - libgomp=9.3.0 + - libnghttp2=1.46.0 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.2.0 + - libwebp=1.2.0 + - libwebp-base=1.2.0 + - lz4-c=1.9.3 + - matplotlib-base=3.5.0 + - mkl=2021.4.0 + - mkl-service=2.4.0 + - mkl_fft=1.3.1 + - mkl_random=1.2.2 + - munkres=1.0.7 + - ncurses=6.3 + - numpy=1.21.2 + - numpy-base=1.21.2 + - olefile=0.46 + - openssl=1.1.1l + - packaging=21.3 + - pillow=8.4.0 + - pip=21.2.2 + - plotly=4.14.3 + - py2bit=0.3.0 + - pybigwig=0.3.17 + - pyparsing=3.0.4 + - pysam=0.15.3 + - python=3.7.11 + - python-dateutil=2.8.2 + - readline=8.1 + - retrying=1.3.3 + - scipy=1.7.1 + - setuptools=58.0.4 + - six=1.16.0 + - sqlite=3.37.0 + - tk=8.6.11 + - wheel=0.37.0 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.4.9 +prefix: /projects/rmorin/projects/tumour_contam/envs/deeptools diff --git a/modules/ichorcna/1.1/envs/ichorcna.env.yaml b/modules/ichorcna/1.1/envs/ichorcna.env.yaml new file mode 100644 index 000000000..208fd501c --- /dev/null +++ b/modules/ichorcna/1.1/envs/ichorcna.env.yaml @@ -0,0 +1,108 @@ +name: null +channels: + - conda-forge + - dranew + - bioconda + - defaults + - r +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - _r-mutex=1.0.1 + - binutils_impl_linux-64=2.35.1 + - binutils_linux-64=2.35 + - bioconductor-biocgenerics=0.36.0 + - bioconductor-genomeinfodb=1.26.0 + - bioconductor-genomeinfodbdata=1.2.4 + - bioconductor-genomicranges=1.42.0 + - bioconductor-hmmcopy=1.32.0 + - bioconductor-iranges=2.24.0 + - bioconductor-s4vectors=0.28.0 + - bioconductor-xvector=0.30.0 + - bioconductor-zlibbioc=1.36.0 + - bwidget=1.9.14 + - 
bzip2=1.0.8 + - ca-certificates=2020.12.5 + - cairo=1.16.0 + - curl=7.71.1 + - fontconfig=2.13.1 + - freetype=2.10.4 + - fribidi=1.0.10 + - gcc_impl_linux-64=9.3.0 + - gcc_linux-64=9.3.0 + - gettext=0.19.8.1 + - gfortran_impl_linux-64=9.3.0 + - gfortran_linux-64=9.3.0 + - graphite2=1.3.13 + - gsl=2.6 + - gxx_impl_linux-64=9.3.0 + - gxx_linux-64=9.3.0 + - harfbuzz=2.8.0 + - hmmcopy_utils=0.0.1 + - icu=68.1 + - jpeg=9d + - kernel-headers_linux-64=2.6.32 + - krb5=1.17.2 + - ld_impl_linux-64=2.35.1 + - libblas=3.8.0 + - libcblas=3.8.0 + - libcurl=7.71.1 + - libedit=3.1.20191231 + - libffi=3.3 + - libgcc-devel_linux-64=9.3.0 + - libgcc-ng=9.3.0 + - libgfortran-ng=9.3.0 + - libgfortran5=9.3.0 + - libglib=2.66.7 + - libgomp=9.3.0 + - libiconv=1.16 + - liblapack=3.8.0 + - libopenblas=0.3.10 + - libpng=1.6.37 + - libssh2=1.9.0 + - libstdcxx-devel_linux-64=9.3.0 + - libstdcxx-ng=9.3.0 + - libtiff=4.2.0 + - libuuid=2.32.1 + - libwebp-base=1.2.0 + - libxcb=1.13 + - libxml2=2.9.10 + - lz4-c=1.9.3 + - make=4.3 + - ncurses=6.2 + - openssl=1.1.1j + - pango=1.42.4 + - pcre=8.44 + - pcre2=10.36 + - pixman=0.40.0 + - pthread-stubs=0.4 + - r-base=4.0.3 + - r-bitops=1.0_6 + - r-data.table=1.14.0 + - r-getopt=1.20.3 + - r-ichorcna=0.2.0 + - r-optparse=1.6.6 + - r-plyr=1.8.6 + - r-rcpp=1.0.6 + - r-rcurl=1.98_1.2 + - readline=8.0 + - sed=4.8 + - sysroot_linux-64=2.12 + - tk=8.6.10 + - tktable=2.10 + - xorg-kbproto=1.0.7 + - xorg-libice=1.0.10 + - xorg-libsm=1.2.3 + - xorg-libx11=1.7.0 + - xorg-libxau=1.0.9 + - xorg-libxdmcp=1.1.3 + - xorg-libxext=1.3.4 + - xorg-libxrender=0.9.10 + - xorg-libxt=1.2.1 + - xorg-renderproto=0.11.1 + - xorg-xextproto=7.3.0 + - xorg-xproto=7.0.31 + - xz=5.2.5 + - zlib=1.2.11 + - zstd=1.4.9 +prefix: /projects/rmorin/projects/tumour_contam/envs/ichorcna diff --git a/modules/ichorcna/1.1/envs/ucsc-bigwigtowig.env.yaml b/modules/ichorcna/1.1/envs/ucsc-bigwigtowig.env.yaml new file mode 100644 index 000000000..4d035cb87 --- /dev/null +++ 
b/modules/ichorcna/1.1/envs/ucsc-bigwigtowig.env.yaml @@ -0,0 +1,19 @@ +name: null +channels: + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - ca-certificates=2021.10.26 + - libgcc=7.2.0 + - libgcc-ng=9.3.0 + - libgomp=9.3.0 + - libpng=1.6.37 + - libstdcxx-ng=9.3.0 + - libuuid=1.0.3 + - mysql-connector-c=6.1.6 + - openssl=1.0.2u + - ucsc-bigwigtowig=366 + - zlib=1.2.11 +prefix: /projects/rmorin/projects/tumour_contam/envs/ucsc-bigwigtowig diff --git a/modules/ichorcna/1.1/ichorcna.smk b/modules/ichorcna/1.1/ichorcna.smk new file mode 100644 index 000000000..8391dee52 --- /dev/null +++ b/modules/ichorcna/1.1/ichorcna.smk @@ -0,0 +1,431 @@ +#!/usr/bin/env snakemake + + +# ---------------------------------------------------------------------------- # +##### ATTRIBUTION ##### +# ---------------------------------------------------------------------------- # + +# Original snakemake author: Jasper Wong +# Module author: Jasper Wong +# Additional contributors: N/A + + +# ---------------------------------------------------------------------------- # +##### SETUP ##### +# ---------------------------------------------------------------------------- # + +### Modules ### + +import pandas as pd +import numpy as np +import oncopipe as op +import glob +import os + +# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe +min_oncopipe_version="1.0.11" +import pkg_resources +try: + from packaging import version +except ModuleNotFoundError: + sys.exit("The packaging module dependency is missing. 
Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") + +# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe + +current_version = pkg_resources.get_distribution("oncopipe").version +if version.parse(current_version) < version.parse(min_oncopipe_version): + logger.warning( + '\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' + "\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' + ) + sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") + +# End of dependency checking section + +### Directories ### +# Setup module and store module-specific configuration in `CFG`. +# Keep `version` in sync with this module's directory (modules/ichorcna/1.1/). +CFG = op.setup_module( + name = "ichorcna", + version = "1.1", + subdirectories = ["inputs", "readDepth", "seg", "outputs"] +) + +localrules: + _ichorcna_input_bam, + _ichorcna_output, + _ichorcna_all + +# ---------------------------------------------------------------------------- # +##### RULES ##### +# ---------------------------------------------------------------------------- # + +### Set-up dependencies and packages ### +# Download github and all external files for ichorCNA: (needed since their extdata is not complete for all genome builds) +# The repository is cloned over https:// because GitHub no longer serves the unauthenticated git:// protocol. +rule _install_ichorcna: + output: + complete = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success" + params: + outdir = CFG["dirs"]["inputs"] + "ichorCNA/" + conda: + CFG["conda_envs"]["ichorcna"] + shell: + op.as_one_line(""" + git clone https://github.com/broadinstitute/ichorCNA.git {params.outdir} && + touch {output.complete}""") + +# This defines the script/extdata directory used by ichorCNA in the subsequent rules: +ichorDir = CFG["dirs"]["inputs"] + "ichorCNA/inst/extdata/" + +# Symlinks the extdata appropriately +rule _setup_ichorcna_extdata: +
input: + complete = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success" + params: + hg19_1Mb_rds = ichorDir + "HD_ULP_PoN_1Mb_median_normAutosome_mapScoreFiltered_median.rds", + hg19_500kb_rds = ichorDir + "HD_ULP_PoN_500kb_median_normAutosome_mapScoreFiltered_median.rds", + hg38_1Mb_rds = ichorDir + "HD_ULP_PoN_hg38_1Mb_median_normAutosome_median.rds", + hg38_500kb_rds = ichorDir + "HD_ULP_PoN_hg38_500kb_median_normAutosome_median.rds", + hg19_1000kb_gc = ichorDir + "gc_hg19_1000kb.wig", + hg19_500kb_gc = ichorDir + "gc_hg19_500kb.wig", + hg19_50kb_gc = ichorDir + "gc_hg19_50kb.wig", + hg19_10kb_gc = ichorDir + "gc_hg19_10kb.wig", + hg38_1000kb_gc = ichorDir + "gc_hg38_1000kb.wig", + hg38_500kb_gc = ichorDir + "gc_hg38_500kb.wig", + hg38_50kb_gc = ichorDir + "gc_hg38_50kb.wig", + hg38_10kb_gc = ichorDir + "gc_hg38_10kb.wig", + hg19_1000kb_map = ichorDir + "map_hg19_1000kb.wig", + hg19_500kb_map = ichorDir + "map_hg19_500kb.wig", + hg19_50kb_map = ichorDir + "map_hg19_50kb.wig", + hg19_10kb_map = ichorDir + "map_hg19_10kb.wig", + hg38_1000kb_map = ichorDir + "map_hg38_1000kb.wig", + hg38_500kb_map = ichorDir + "map_hg38_500kb.wig", + hg38_50kb_map = ichorDir + "map_hg38_50kb.wig", + hg38_10kb_map = ichorDir + "map_hg38_10kb.wig", + output: + # Panel-of-normals names follow HD_ULP_PoN_<build>_<bin>_median_normAutosome_median.rds for every build and bin size, matching the ichorCNA_normalPanel pattern in config/default.yaml. + hg19_1Mb_rds = ichorDir + "HD_ULP_PoN_hg19_1Mb_median_normAutosome_median.rds", + hg19_500kb_rds = ichorDir + "HD_ULP_PoN_hg19_500kb_median_normAutosome_median.rds", + grch37_1Mb_rds = ichorDir + "HD_ULP_PoN_grch37_1Mb_median_normAutosome_median.rds", + grch37_500kb_rds = ichorDir + "HD_ULP_PoN_grch37_500kb_median_normAutosome_median.rds", + hs37d5_1Mb_rds = ichorDir + "HD_ULP_PoN_hs37d5_1Mb_median_normAutosome_median.rds", + hs37d5_500kb_rds = ichorDir + "HD_ULP_PoN_hs37d5_500kb_median_normAutosome_median.rds", + grch38_1Mb_rds = ichorDir + "HD_ULP_PoN_grch38_1Mb_median_normAutosome_median.rds", + grch38_500kb_rds = ichorDir + "HD_ULP_PoN_grch38_500kb_median_normAutosome_median.rds", + grch37_1000kb_gc = ichorDir +
"gc_grch37_1000kb.wig", + grch37_500kb_gc = ichorDir + "gc_grch37_500kb.wig", + grch37_50kb_gc = ichorDir + "gc_grch37_50kb.wig", + grch37_10kb_gc = ichorDir + "gc_grch37_10kb.wig", + hs37d5_1000kb_gc = ichorDir + "gc_hs37d5_1000kb.wig", + hs37d5_500kb_gc = ichorDir + "gc_hs37d5_500kb.wig", + hs37d5_50kb_gc = ichorDir + "gc_hs37d5_50kb.wig", + hs37d5_10kb_gc = ichorDir + "gc_hs37d5_10kb.wig", + grch38_1000kb_gc = ichorDir + "gc_grch38_1000kb.wig", + grch38_500kb_gc = ichorDir + "gc_grch38_500kb.wig", + grch38_50kb_gc = ichorDir + "gc_grch38_50kb.wig", + grch38_10kb_gc = ichorDir + "gc_grch38_10kb.wig", + grch37_1000kb_map = ichorDir + "map_grch37_1000kb.wig", + grch37_500kb_map = ichorDir + "map_grch37_500kb.wig", + grch37_50kb_map = ichorDir + "map_grch37_50kb.wig", + grch37_10kb_map = ichorDir + "map_grch37_10kb.wig", + hs37d5_1000kb_map = ichorDir + "map_hs37d5_1000kb.wig", + hs37d5_500kb_map = ichorDir + "map_hs37d5_500kb.wig", + hs37d5_50kb_map = ichorDir + "map_hs37d5_50kb.wig", + hs37d5_10kb_map = ichorDir + "map_hs37d5_10kb.wig", + grch38_1000kb_map = ichorDir + "map_grch38_1000kb.wig", + grch38_500kb_map = ichorDir + "map_grch38_500kb.wig", + grch38_50kb_map = ichorDir + "map_grch38_50kb.wig", + grch38_10kb_map = ichorDir + "map_grch38_10kb.wig", + complete = touch(ichorDir + "symlink.done") + run: + op.relative_symlink(params.hg19_1Mb_rds, output.hg19_1Mb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.hg19_500kb_rds) + op.relative_symlink(params.hg19_1Mb_rds, output.grch37_1Mb_rds) + op.relative_symlink(params.hg19_1Mb_rds, output.hs37d5_1Mb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.grch37_500kb_rds) + op.relative_symlink(params.hg19_500kb_rds, output.hs37d5_500kb_rds) + op.relative_symlink(params.hg38_1Mb_rds, output.grch38_1Mb_rds) + op.relative_symlink(params.hg38_500kb_rds, output.grch38_500kb_rds) + op.relative_symlink(params.hg19_1000kb_gc, output.grch37_1000kb_gc) + op.relative_symlink(params.hg19_500kb_gc, 
output.grch37_500kb_gc) + op.relative_symlink(params.hg19_50kb_gc, output.grch37_50kb_gc) + op.relative_symlink(params.hg19_10kb_gc, output.grch37_10kb_gc) + op.relative_symlink(params.hg19_1000kb_gc, output.hs37d5_1000kb_gc) + op.relative_symlink(params.hg19_500kb_gc, output.hs37d5_500kb_gc) + op.relative_symlink(params.hg19_50kb_gc, output.hs37d5_50kb_gc) + op.relative_symlink(params.hg19_10kb_gc, output.hs37d5_10kb_gc) + op.relative_symlink(params.hg38_1000kb_gc, output.grch38_1000kb_gc) + op.relative_symlink(params.hg38_500kb_gc, output.grch38_500kb_gc) + op.relative_symlink(params.hg38_50kb_gc, output.grch38_50kb_gc) + op.relative_symlink(params.hg38_10kb_gc, output.grch38_10kb_gc) + op.relative_symlink(params.hg19_1000kb_map, output.grch37_1000kb_map) + op.relative_symlink(params.hg19_500kb_map, output.grch37_500kb_map) + op.relative_symlink(params.hg19_50kb_map, output.grch37_50kb_map) + op.relative_symlink(params.hg19_10kb_map, output.grch37_10kb_map) + op.relative_symlink(params.hg19_1000kb_map, output.hs37d5_1000kb_map) + op.relative_symlink(params.hg19_500kb_map, output.hs37d5_500kb_map) + op.relative_symlink(params.hg19_50kb_map, output.hs37d5_50kb_map) + op.relative_symlink(params.hg19_10kb_map, output.hs37d5_10kb_map) + op.relative_symlink(params.hg38_1000kb_map, output.grch38_1000kb_map) + op.relative_symlink(params.hg38_500kb_map, output.grch38_500kb_map) + op.relative_symlink(params.hg38_50kb_map, output.grch38_50kb_map) + op.relative_symlink(params.hg38_10kb_map, output.grch38_10kb_map) + +### Run ichorCNA ### +# Symlinks the input files into the module results directory (under '00-inputs/') +rule _ichorcna_input_bam: + input: + bam = CFG["inputs"]["sample_bam"], + bai = CFG["inputs"]["sample_bai"] + output: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai", + crai = CFG["dirs"]["inputs"] + 
"bam/{seq_type}--{genome_build}/{sample_id}.bam.crai" + run: + op.absolute_symlink(input.bam, output.bam) + op.absolute_symlink(input.bai, output.bai) + op.absolute_symlink(input.bai, output.crai) + + +# set-up for CRAM files (readCounter does not work with CRAM) +# deeptools to get .bw from .bam and .cram +rule _ichorcna_bamCoverage: + input: + bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", + bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai", + crai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.crai", + ichorcna_package = CFG["dirs"]["inputs"] + "ichorcna_dependencies_installed.success", + symlink_complete = ichorDir + "symlink.done" + output: + bw = CFG["dirs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/bw/{sample_id}.bin{binSize}.bw" + params: + binSize = CFG["options"]["deeptools"]["binSize"], + qual = CFG["options"]["deeptools"]["qual"], + excludeFlag = CFG["options"]["deeptools"]["flagExclude"], + opt = CFG["options"]["deeptools"]["opt"], + dirOut = CFG["dirs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/bw/" + conda: CFG["conda_envs"]["deeptools"] + threads: CFG["threads"]["deeptools"] + resources: + **CFG["resources"]["deeptools"] + log: + CFG["logs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/bw/{sample_id}.bin{binSize}.log" + shell: + """ + mkdir -p {params.dirOut}; + bamCoverage -b {input.bam} --binSize {params.binSize} --minMappingQuality {params.qual} --samFlagExclude {params.excludeFlag} {params.opt} -o {output.bw} -p {threads} + """ + + +# Converts bigWig to Wig +rule _ichorcna_bigwigToWig: + input: + bw = CFG["dirs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/bw/{sample_id}.bin{binSize}.bw" + output: + wig_int = temp(CFG["dirs"]["readDepth"] + "{seq_type}--{genome_build}/{binSize}/wig/{sample_id}.bin{binSize}{chrom}.wig"), + conda: CFG["conda_envs"]["ucsc-bigwigtowig"] + threads: CFG["threads"]["ucsc"] + resources: + 
**CFG["resources"]["ucsc"] + wildcard_constraints: + chrom = ".+(?