diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk index 5e60ab3f..4a8e57c2 100644 --- a/modules/lofreq/1.1/lofreq.smk +++ b/modules/lofreq/1.1/lofreq.smk @@ -15,7 +15,7 @@ import oncopipe as op # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe -min_oncopipe_version="1.0.11" +min_oncopipe_version="1.0.12" import pkg_resources try: from packaging import version @@ -58,6 +58,27 @@ localrules: _lofreq_all, _lofreq_link_to_preprocessed +def _lofreq_get_capspace(wildcards): + CFG=config["lcr-modules"]["lofreq"] + default_bed = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed") + if str(wildcards.seq_type) in CFG['switches']['regions_bed'].keys(): + custom_bed = CFG['switches']['regions_bed'][wildcards.seq_type] + else: + custom_bed = default_bed + try: + if "tumour_id" in wildcards.keys(): + # Get the appropriate capture space for this sample + this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed") + else: + this_bed = op.get_capture_space(CFG, wildcards.normal_id, wildcards.genome_build, wildcards.seq_type, "bed") + this_bed = reference_files(this_bed) + except NameError: + # If we are using an older version of the reference workflow, use the same region file as the genome sample + this_bed = custom_bed if custom_bed else default_bed + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + return custom_bed if custom_bed else default_bed + return this_bed ##### RULES ##### @@ -84,7 +105,7 @@ rule _lofreq_preprocess_normal: normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect - bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"]) + bed = _lofreq_get_capspace output: preprocessing_start = CFG["dirs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/preprocessing.started", vcf_relaxed = CFG["dirs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/normal_relaxed.vcf.gz", @@ -164,7 +185,7 @@ rule _lofreq_run_tumour_unmatched: vcf_relaxed = str(rules._lofreq_link_to_preprocessed.output.vcf_relaxed), fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect - bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"]) + bed = _lofreq_get_capspace output: vcf_snvs_filtered = CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.snvs.vcf.gz", vcf_indels_filtered = CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.indels.vcf.gz", @@ -202,7 +223,7 @@ rule _lofreq_run_tumour_matched: normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect - bed = 
op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"]) + bed = _lofreq_get_capspace output: vcf_relaxed = temp(CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/normal_relaxed.vcf.gz"), vcf_snvs_filtered = temp(CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.snvs.vcf.gz"), diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk index 41f33b1a..3a84e077 100644 --- a/modules/manta/2.3/manta.smk +++ b/modules/manta/2.3/manta.smk @@ -6,7 +6,7 @@ # Original Snakefile Author: Bruno Grande # Module Author: Bruno Grande -# Additional Contributors: N/A +# Additional Contributors: Chris Rushton ##### SETUP ##### @@ -15,7 +15,7 @@ import oncopipe as op # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe -min_oncopipe_version="1.0.11" +min_oncopipe_version="1.0.12" import pkg_resources try: from packaging import version @@ -86,6 +86,20 @@ rule _manta_index_bed: tabix {output.bedz} """) +def _manta_get_capspace(wildcards): + CFG = config["lcr-modules"]["manta"] + try: + # Get the appropriate capture space for this sample + this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + this_bed = reference_files(this_bed) + except NameError: + # If we are using an older version of the reference workflow, use the same region file as the genome sample + this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + return this_bed + # Configures the manta workflow with the input BAM files and reference FASTA file. 
rule _manta_configure_paired: input: @@ -93,7 +107,7 @@ rule _manta_configure_paired: normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), config = op.switch_on_wildcard("seq_type", CFG["switches"]["manta_config"]), - bedz = str(rules._manta_index_bed.output.bedz) + bedz = _manta_get_capspace output: runwf = CFG["dirs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py" log: @@ -120,7 +134,7 @@ rule _manta_configure_unpaired: tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), config = op.switch_on_wildcard("seq_type", CFG["switches"]["manta_config"]), - bedz = str(rules._manta_index_bed.output.bedz) + bedz = _manta_get_capspace output: runwf = CFG["dirs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py" log: @@ -135,7 +149,7 @@ rule _manta_configure_unpaired: CFG["conda_envs"]["manta"] shell: op.as_one_line(""" - configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {input.bedz} + configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {params.bedz} --runDir "$(dirname {output.runwf})" {params.tumour_bam_arg_name} {input.tumour_bam} --config {input.config} > {log.stdout} 2> {log.stderr} """) @@ -291,6 +305,7 @@ def _manta_predict_output(wildcards): return outputs_with_bedpe + outputs_without_bedpe + # Generates the target symlinks for each run depending on the Manta output VCF files rule _manta_dispatch: input: diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index 7b88b919..cdf0c5af 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -15,9 +15,10 @@ # Import package with useful functions for developing analysis modules import oncopipe as op import inspect +import pandas as pd # Check that the oncopipe dependency is up-to-date. 
Add all the following lines to any module that uses new features in oncopipe -min_oncopipe_version="1.0.11" +min_oncopipe_version="1.0.12" import pkg_resources try: from packaging import version @@ -74,7 +75,20 @@ localrules: ##### RULES ##### - +def _mutect_get_capspace(wildcards): + CFG = config["lcr-modules"]["mutect2"] + try: + # Get the appropriate capture space for this sample + cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list") + cap_space = reference_files(cap_space) + this_space = cap_space + # If we are using an older version of the reference workflow, we don't need to do anything + except NameError: + this_space = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed") + # If this isn't a capture sample, there is no capture space, so use the main chromosomes instead + if wildcards.seq_type != "capture": + this_space = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed") + return this_space # Symlinks the input files into the module results directory (under '00-inputs/') rule _mutect2_input_bam: @@ -97,11 +111,27 @@ rule _mutect2_dummy_positions: # Symlink chromosomes used for parallelization checkpoint _mutect2_input_chrs: input: - chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt") + candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output), + chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt"), + capture_arg = _mutect_get_capspace output: - chrs = CFG["dirs"]["inputs"] + "chroms/{genome_build}/main_chromosomes.txt" + chrs = CFG["dirs"]["inputs"] + "chroms/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/mutated_chromosomes.txt" run: - op.absolute_symlink(input.chrs, output.chrs) + # obtain list of main chromosomes + main_chrs = pd.read_csv(input.chrs, comment='#', sep='\t', header=None) + main_chrs = main_chrs.iloc[:, 0].astype(str).unique().tolist() + # obtain list of chromosomes in candidate positions + candidate_chrs = pd.read_csv(input.candidate_positions, comment='#', sep='\t', header=None) + candidate_chrs = candidate_chrs.iloc[:, 0].astype(str).unique().tolist() + # obtain list of chromosomes in the capture space + interval_chrs = pd.read_csv(input.capture_arg, comment='@', sep='\t', header=None) + interval_chrs = interval_chrs.iloc[:, 0].astype(str).unique().tolist() + # intersect the three lists to obtain chromosomes present in all + intersect_chrs = list(set(main_chrs) & set(candidate_chrs) & set(interval_chrs)) + # convert list to single-column df + intersect_chrs = pd.DataFrame(intersect_chrs).sort_values(0) + # write out the file with mutated chromosomes + intersect_chrs.to_csv(output.chrs, index=False, header=False) # Retrieves from SM tag from BAM and writes to file @@ -123,13 +153,13 @@ rule _mutect2_get_sm: # The first function loads the wildcard-containing file path and additional args from the config. # The second replaces wildcards with those used in the rule.
def _mutect2_get_interval_cli_arg( - vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"], + vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"], interval_arg_in = config["lcr-modules"]["mutect2"]["options"]["mutect2_interval_rules"] ): def _mutect2_get_interval_cli_custom(wildcards, input): if vcf_in: param = f"-L {input.candidate_positions} {interval_arg_in}" - else: + else: param = "" return param return _mutect2_get_interval_cli_custom @@ -145,7 +175,8 @@ rule _mutect2_run_matched_unmatched: gnomad = reference_files("genomes/{genome_build}/variation/af-only-gnomad.{genome_build}.vcf.gz"), normal_sm = CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{normal_id}_sm.txt", pon = reference_files("genomes/{genome_build}/gatk/mutect2_pon.{genome_build}.vcf.gz"), - candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output) + candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output), + capture_arg = _mutect_get_capspace output: vcf = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz"), tbi = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz.tbi"), @@ -172,7 +203,7 @@ rule _mutect2_run_matched_unmatched: -I {input.tumour_bam} -I {input.normal_bam} -R {input.fasta} -normal "$(cat {input.normal_sm})" -O {output.vcf} --germline-resource {input.gnomad} - -L {wildcards.chrom} {params.interval_arg} + -L {wildcards.chrom} {params.interval_arg} -L {input.capture_arg} -pon {input.pon} --f1r2-tar-gz {output.f1r2} > {log.stdout} 2> {log.stderr} """) @@ -186,7 +217,8 @@ rule _mutect2_run_no_normal: dict = reference_files("genomes/{genome_build}/genome_fasta/genome.dict"), gnomad = reference_files("genomes/{genome_build}/variation/af-only-gnomad.{genome_build}.vcf.gz"), pon = reference_files("genomes/{genome_build}/gatk/mutect2_pon.{genome_build}.vcf.gz"), - candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output) + candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output), + capture_arg = _mutect_get_capspace output: vcf = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz"), tbi = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz.tbi"), @@ -200,7 +232,7 @@ rule _mutect2_run_no_normal: params: mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), opts = CFG["options"]["mutect2_run"], - interval_arg = _mutect2_get_interval_cli_arg() + interval_arg = _mutect2_get_interval_cli_arg() conda: CFG["conda_envs"]["gatk"] threads: @@ -212,7 +244,7 @@ rule _mutect2_run_no_normal: gatk Mutect2 --java-options "-Xmx{params.mem_mb}m" {params.opts} -I {input.tumour_bam} -R {input.fasta} -O {output.vcf} --germline-resource {input.gnomad} - -L {wildcards.chrom} {params.interval_arg} + -L {wildcards.chrom} {params.interval_arg} -L {input.capture_arg} -pon {input.pon} --f1r2-tar-gz {output.f1r2} > {log.stdout} 2> {log.stderr} """) diff --git 
a/modules/sage/1.0/sage.smk b/modules/sage/1.0/sage.smk index 2fee054e..d48343b7 100644 --- a/modules/sage/1.0/sage.smk +++ b/modules/sage/1.0/sage.smk @@ -16,7 +16,7 @@ import oncopipe as op # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe -min_oncopipe_version="1.0.11" +min_oncopipe_version="1.0.12" import pkg_resources try: from packaging import version @@ -121,15 +121,29 @@ def get_chromosomes(wildcards): chromosomes= ",".join(chromosomes) return chromosomes +def _sage_get_capspace(wildcards): + CFG = config["lcr-modules"]["sage"] + try: + # Get the appropriate capture space for this sample + this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + this_bed = reference_files(this_bed) + except NameError: + # If we are using an older version of the reference workflow, use the same region file as the genome sample + this_bed = rules._download_sage_references.output.panel_bed + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + this_bed = rules._download_sage_references.output.panel_bed + return this_bed + # Variant calling rule rule _run_sage: input: tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", fasta = str(rules._input_references.output.genome_fa), - hotspots = str(rules._download_sage_references.output.hotspots), - panel_bed = str(rules._download_sage_references.output.panel_bed), - high_conf_bed = str(rules._download_sage_references.output.high_conf_bed) + hotspots = rules._download_sage_references.output.hotspots, + high_conf_bed = str(rules._download_sage_references.output.high_conf_bed), + panel_bed = _sage_get_capspace output: vcf = temp(CFG["dirs"]["sage"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.vcf"), vcf_gz = temp(CFG["dirs"]["sage"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.vcf.gz") diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk index 5bebe48c..089fa085 100755 --- a/modules/strelka/1.1/strelka.smk +++ b/modules/strelka/1.1/strelka.smk @@ -16,7 +16,7 @@ import oncopipe as op # Check that the oncopipe dependency is up-to-date. 
Add all the following lines to any module that uses new features in oncopipe -min_oncopipe_version="1.0.11" +min_oncopipe_version="1.0.12" import pkg_resources try: from packaging import version @@ -53,10 +53,7 @@ localrules: _strelka_configure_unpaired, _strelka_filter_combine, _strelka_output_filtered_vcf, - _strelka_all, - -wildcard_constraints: - var_type = "somatic.snvs|somatic.indels|variants" + _strelka_all ##### RULES ##### @@ -122,14 +119,29 @@ def _strelka_get_indel_cli_arg(vcf_in = config["lcr-modules"]["strelka"]["inputs return param return _strelka_get_indel_cli_custom +def _strelka_get_capspace(wildcards): + CFG = config["lcr-modules"]["strelka"] + try: + # Get the appropriate capture space for this sample + this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + this_bed = reference_files(this_bed) + except NameError: + # If we are using an older version of the reference workflow, use the same region file as the genome sample + this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + + return this_bed + rule _strelka_configure_paired: # Somatic input: tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), - bedz = str(rules._strelka_index_bed.output.bedz), - indels = str(rules._strelka_input_vcf.output.vcf) if CFG["inputs"]["candidate_small_indels"] else str(rules._strelka_dummy_vcf.output) + indels = str(rules._strelka_input_vcf.output.vcf) if CFG["inputs"]["candidate_small_indels"] else str(rules._strelka_dummy_vcf.output), + bedz = _strelka_get_capspace output: runwf = CFG["dirs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py" log: @@ -137,7 +149,7 @@ rule _strelka_configure_paired: # Somatic stderr = CFG["logs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/strelka_configure.stderr.log" params: indel_arg = _strelka_get_indel_cli_arg(), - opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"]), + opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"]) wildcard_constraints: pair_status = "matched|unmatched" conda: @@ -160,7 +172,7 @@ rule _strelka_configure_unpaired: # germline input: tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), - bedz = str(rules._strelka_index_bed.output.bedz) + bedz = _strelka_get_capspace output: runwf = CFG["dirs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py" log: @@ -254,6 +266,8 @@ rule _strelka_filter_combine: log: stdout = CFG["logs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/strelka_filter_combine.stdout.log", stderr = CFG["logs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/strelka_filter_combine.stderr.log" + wildcard_constraints: + var_type = "somatic.snvs|somatic.indels|variants" shell: op.as_one_line(""" bcftools concat -a {input.vcf} | @@ -276,6 +290,7 @@ def 
_strelka_get_output(wildcards): vcf = str(rules._strelka_filter_combine.output.vcf) return vcf + # Symlinks the final output files into the module results directory (under '99-outputs/'). Links will always use "combined" in the name (dropping odd naming convention used by Strelka in unpaired mode) rule _strelka_output_filtered_vcf: input: diff --git a/modules/vcf2maf/1.2/vcf2maf.smk b/modules/vcf2maf/1.2/vcf2maf.smk index a53eee39..6d5e48b0 100644 --- a/modules/vcf2maf/1.2/vcf2maf.smk +++ b/modules/vcf2maf/1.2/vcf2maf.smk @@ -51,7 +51,7 @@ localrules: _vcf2maf_crossmap, _vcf2maf_all -VERSION_MAP = { +VCF2MAF_GENOME_VERSION_MAP = { "grch37": "GRCh37", "hg38": "GRCh38", "hs37d5": "GRCh37" @@ -89,7 +89,7 @@ rule _vcf2maf_run: stderr = CFG["logs"]["vcf2maf"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{base_name}_vcf2maf.stderr.log", params: opts = CFG["options"]["vcf2maf"], - build = lambda w: VERSION_MAP[w.genome_build], + build = lambda w: VCF2MAF_GENOME_VERSION_MAP[w.genome_build], custom_enst = op.switch_on_wildcard("genome_build", CFG["switches"]["custom_enst"]) conda: CFG["conda_envs"]["vcf2maf"] diff --git a/modules/vcf2maf/1.3/vcf2maf.smk b/modules/vcf2maf/1.3/vcf2maf.smk index 17ba2f69..1cf2a63b 100644 --- a/modules/vcf2maf/1.3/vcf2maf.smk +++ b/modules/vcf2maf/1.3/vcf2maf.smk @@ -31,7 +31,7 @@ localrules: _vcf2maf_crossmap, _vcf2maf_all -VERSION_MAP = { +VCF2MAF_GENOME_VERSION_MAP = { "grch37": "GRCh37", "hg38": "GRCh38", "hs37d5": "GRCh37" @@ -83,7 +83,7 @@ rule _vcf2maf_run: stderr = CFG["logs"]["vcf2maf"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{base_name}_vcf2maf.stderr.log", params: opts = CFG["options"]["vcf2maf"], - build = lambda w: VERSION_MAP[w.genome_build], + build = lambda w: VCF2MAF_GENOME_VERSION_MAP[w.genome_build], custom_enst = op.switch_on_wildcard("genome_build", CFG["switches"]["custom_enst"]) conda: CFG["conda_envs"]["vcf2maf"] @@ -115,7 +115,8 @@ rule _vcf2maf_run: --custom-enst {params.custom_enst} --retain-info gnomADg_AF >> {log.stdout} 2>> {log.stderr}; - else echo "WARNING: PATH is not set properly, using $(which vcf2maf.pl) will result in error during execution. Please ensure $VCF2MAF_SCRIPT exists." > {log.stderr};fi + else echo "WARNING: PATH is not set properly, using $(which vcf2maf.pl) will result in error during execution. Please ensure $VCF2MAF_SCRIPT exists." > {log.stderr};fi && + touch {output.vep} """) rule _vcf2maf_gnomad_filter_maf: diff --git a/oncopipe/oncopipe/__init__.py b/oncopipe/oncopipe/__init__.py index 3a2b31ef..f4afd253 100755 --- a/oncopipe/oncopipe/__init__.py +++ b/oncopipe/oncopipe/__init__.py @@ -9,6 +9,7 @@ import collections.abc from datetime import datetime from collections import defaultdict, namedtuple +from .__version__ import __version__ import yaml import pandas as pd @@ -1800,3 +1801,56 @@ def cleanup_module(module_config): # Add back the TSV fields for field in tsv_fields.keys(): module_config[field] = tsv_fields[field] + + +# Kostia functions +def get_capture_space(module_config, sample_id, genome_build, seq_type, return_ext): + """Returns path to the file with capspace to be used for genome build. + + Parameters + ---------- + module_config : dict + The module-specific configuration. + sample_id : str + The id for a specific sample for which capture space should be returned. + Allows for both normal and tumour id. + genome_build : str + The specific genome build for which to return capture space. 
+ Allows samples aligned to different genome versions to be handled unambiguously. + seq_type : str + The specific seq type for which to return the capture space. + return_ext : str + The extension of the capture space file to be returned (e.g. bed, bed.gz, interval_list). + + Returns + ------- + str + The path to a file relative to the reference files parent directory. + """ + + # Convenient variable to access sample table + module_samples = module_config["samples"] + + this_sample = module_samples.loc[(module_samples['sample_id'] == sample_id) & + (module_samples['genome_build'] == genome_build) & + (module_samples['seq_type'] == seq_type)] + + if len(this_sample) != 1: + raise AssertionError("Found %s matches when examining the sample table for sample \'%s\' \'%s\' \'%s\'" % (len(this_sample), sample_id, genome_build, seq_type)) + + if "capture_space" in this_sample.columns: + panel = this_sample.iloc[0]['capture_space'] + else: + panel = "none" + + # If this panel is "none" (aka not specified) use the default for this reference genome + if panel.upper() in (name.upper() for name in ['none', "na", "n/a", ""]): + try: + if "38" in genome_build: + panel = "exome-utr-grch38" + else: + panel = "exome-utr-grch37" + except KeyError as e: + raise AttributeError("No default capture space was specified for genome version \'%s\'. You can specify a default by setting \'default: \"true\"\' in a \'%s\'-based capture space in the reference config" % (genome_build, genome_build)) from e + # Now that we have found the corresponding capture region for this sample, obtain the requested file + return "genomes/" + genome_build + "/capture_space/" + panel + ".padded." + return_ext diff --git a/oncopipe/oncopipe/__version__.py b/oncopipe/oncopipe/__version__.py index a5673be8..8c861b0f 100644 --- a/oncopipe/oncopipe/__version__.py +++ b/oncopipe/oncopipe/__version__.py @@ -6,7 +6,7 @@ __title__ = "oncopipe" __description__ = "Functions for running Snakemake modules" -__version__ = "1.0.11" +__version__ = "1.0.12" __author__ = "Bruno Grande" __author_email__ = "bgrande@sfu.ca" __license__ = "MIT" diff --git a/workflows/reference_files/2.4/config/default.yaml b/workflows/reference_files/2.4/config/default.yaml index 9c4d0a02..1ad75c50 100644 --- a/workflows/reference_files/2.4/config/default.yaml +++ b/workflows/reference_files/2.4/config/default.yaml @@ -47,6 +47,42 @@ genome_builds: version: "grch38" provider: "ucsc" genome_fasta_url: "https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.fa.masked.gz" + hg19-reddy: + # Version of hg19 with chrM at the start, and chrM with a length of 16569bp. Used for the Reddy dataset + version: "grch37" + provider: "ucsc" + genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/hg19-reddy.fa" + hg38-nci: + # NCI's version of GRCh38, with a costco-sized flat pack of decoys + version: "grch38" + provider: "ucsc" + genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/grch38-nci.fa" + hg38-panea: + # Version of hg38 used by Panea et al.
+ version: "grch38" + provider: "ucsc" + genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/hg38-panea.fa" + +capture_space: + exome-utr-grch38: + genome: "grch38" + provider: "ucsc" + default: "true" + capture_bed_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/capture_bed/grch38_all_genes.canonical.sort.bed4" + exome-utr-grch37: + genome: "grch37" + provider: "ensembl" + default: "true" + capture_bed_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/capture_bed/grch37_all_genes.canonical.sort.bed4" + # to add a custom capture space, follow this example and fill in the dictionary keys + #name_of_capture_panel: + #genome: "grch37" or "grch38", for example + #provider: "ensembl" or "ucsc" + #capture_bed_url: a link (enclosed in "") used to download the BED file for the capture panel if it is available online + #capture_bed_file: optional; the path (enclosed in "") to a local BED file for the capture panel + +capture_params: + padding_size: "200" wildcard_values: gencode_release: ["33"] @@ -57,6 +93,9 @@ wildcard_values: rm_version: ["hg19", "hg38"] tools: + bedtools: + conda_env: "envs/bedtools-2.29.2.yaml" + version: "2.29.2" coreutils: conda_env: "envs/coreutils-8.31.yaml" version: "8.31" @@ -66,6 +105,9 @@ tools: samtools: conda_env: "envs/samtools-1.9.yaml" version: "1.9" + tabix: + conda_env: "envs/tabix-0.2.6.yaml" + version: "0.2.6" bwa: conda_env: "envs/bwa-0.7.17.yaml" version: "0.7.17" diff --git a/workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml b/workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml new file mode 120000 index 00000000..c185b12a --- /dev/null +++ b/workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml @@ -0,0 +1 @@ +../../../../envs/bedtools/bedtools-2.29.2.yaml \ No newline at end of file diff --git a/workflows/reference_files/2.4/envs/tabix-0.2.6.yaml b/workflows/reference_files/2.4/envs/tabix-0.2.6.yaml new file mode 120000 index 00000000..bcf351f2 --- /dev/null +++ b/workflows/reference_files/2.4/envs/tabix-0.2.6.yaml @@ -0,0 +1 @@ +../../../../envs/tabix/tabix-0.2.6.yaml \ No newline at end of file diff --git a/workflows/reference_files/2.4/reference_files.smk b/workflows/reference_files/2.4/reference_files.smk index 59d22ba8..f3e3a831 100644 --- a/workflows/reference_files/2.4/reference_files.smk +++ b/workflows/reference_files/2.4/reference_files.smk @@ -101,7 +101,7 @@ rule get_sdf_refs: output: sdf = directory("genomes/{genome_build}/sdf") wildcard_constraints: - genome_build = "hg38|hg19|grch37|hs37d5" + genome_build = "|".join(SDF_GENOME_BUILDS) shell: "ln -srfT {input.sdf} {output.sdf}" @@ -429,6 +429,130 @@ rule get_mutect2_small_exac: """) +### FOR HANDLING CAPTURE SPACE/TARGETED SEQUENCING DATA ### +# Added by Chris +# What this *should* do (if I have written these rules correctly) is obtain a capture space BED file, +# check the contig names against the reference (and replace them as necessary), pad, sort, merge, and generate a bgzip/tabix +# indexed version of the BED as well as an interval list.
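As a point of reference for reviewers, the rules below all write into genomes/{genome_build}/capture_space/ using the {capture_space}.padded.* naming that op.get_capture_space reconstructs on the module side. A rough sketch of that mapping, using an invented panel name purely for illustration (not taken from the config above):

# Hypothetical panel "my-panel" on grch37; the paths mirror the outputs of the rules below.
panel, build = "my-panel", "grch37"
outputs = {
    "get_capspace_bed_download":   f"genomes/{build}/capture_space/{panel}.bed",
    "sort_and_pad_capspace":       f"genomes/{build}/capture_space/{panel}.padded.bed",
    "compress_index_capspace_bed": f"genomes/{build}/capture_space/{panel}.padded.bed.gz",
    "create_interval_list":        f"genomes/{build}/capture_space/{panel}.padded.interval_list",
}
# op.get_capture_space(CFG, sample_id, build, "capture", return_ext) returns
# "genomes/<build>/capture_space/<panel>.padded.<return_ext>", so return_ext values of
# "bed", "bed.gz" and "interval_list" line up with the last three entries above.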
+ +def _check_capspace_provider(w): + # Checks to determine if both the capture space BED and associated reference genome + # are chr prefixed or not + + # If this is specified as the "default", make sure we obtain the relevant capture space + default_key = "default-" + w.genome_build + if default_key not in config["capture_space"] and w.capture_space == default_key: + # aka if the user hasn't explicitly specified a default using a default name + capture_space = _get_default_capspace(w) + else: + capture_space = w.capture_space + + genome_provider = config["genome_builds"][w.genome_build]["provider"] + bed_provider = config["capture_space"][capture_space]["provider"] + + # If the providers match (i.e. they share the same prefix), just use the downloaded version + if genome_provider == bed_provider: + return {'bed': expand(rules.download_capspace_bed.output.capture_bed, capture_space=capture_space, genome_build=w.genome_build)} + else: + # Otherwise, request the version of the BED with the chr prefix converted + chr_status = "chr" if genome_provider == "ucsc" else "no_chr" + return {'bed': expand(rules.add_remove_chr_prefix_bed.output.converted_bed, capture_space = capture_space, genome_build = w.genome_build, chr_status= chr_status)} + + +rule get_capspace_bed_download: + input: + unpack(_check_capspace_provider) + output: + capture_bed = "genomes/{genome_build}/capture_space/{capture_space}.bed" + conda: CONDA_ENVS["coreutils"] + shell: + "ln -srf {input.bed} {output.capture_bed}" + + +rule sort_and_pad_capspace: + input: + capture_bed = rules.get_capspace_bed_download.output.capture_bed, + fai = rules.index_genome_fasta.output.fai + output: + intermediate_bed = temp("genomes/{genome_build}/capture_space/{capture_space}.intermediate.bed"), + padded_bed = "genomes/{genome_build}/capture_space/{capture_space}.padded.bed" + params: + padding_size = config['capture_params']['padding_size'] # Default to 200.
I would be wary of changing this. + log: + "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.log" + conda: CONDA_ENVS["bedtools"] + shell: + op.as_one_line(""" + cat {input.fai} | cut -f 1-2 | perl -ane 'print "$F[0]\\t0\\t$F[1]\\n"' | bedtools intersect -wa -a {input.capture_bed} -b stdin > {output.intermediate_bed} + && + bedtools slop -b {params.padding_size} -i {output.intermediate_bed} -g {input.fai} | bedtools sort | bedtools merge > {output.padded_bed} 2> {log} + """) +rule check_capspace_contigs: + input: + bed = rules.get_capspace_bed_download.output.capture_bed, + fai = rules.index_genome_fasta.output.fai + output: + contig_log = "genomes/{genome_build}/capture_space/{capture_space}.check_contigs.log" + run: + # Parse BED contigs + bed_contigs = set() + with open(input.bed) as f: + for line in f: + line = line.rstrip() + contig = line.split("\t")[0] + if contig not in bed_contigs: + bed_contigs.add(contig) + + # Parse fai contigs + fai_contigs = set() + with open(input.fai) as f: + for line in f: + line = line.rstrip() + contig = line.split('\t')[0] + if contig not in fai_contigs: + fai_contigs.add(contig) + + # Check the BED file for contigs that are not in the reference genome + missing_contigs = list(x for x in bed_contigs if x not in fai_contigs) + with open(output.contig_log, "w") as o: + if len(missing_contigs) == 0: + o.write("No contigs missing from reference") + else: + o.write("The following contigs were missing from the reference\n") + o.write("\n".join(missing_contigs)) + o.write("\n") + + +rule compress_index_capspace_bed: + input: + capture_bed = rules.sort_and_pad_capspace.output.padded_bed + output: + bgzip_bed = "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.gz", + tabix = "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.gz.tbi" + log: + "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.gz.log" + conda: CONDA_ENVS["tabix"] + shell: + op.as_one_line(""" + bgzip -c {input.capture_bed} > {output.bgzip_bed} + && + tabix -p bed {output.bgzip_bed} + """) + + +rule create_interval_list: + input: + bed = rules.sort_and_pad_capspace.output.padded_bed, + sd = rules.create_gatk_dict.output.dict + output: + interval_list = "genomes/{genome_build}/capture_space/{capture_space}.padded.interval_list" + log: + "genomes/{genome_build}/capture_space/{capture_space}.padded.interval_list.log" + conda: CONDA_ENVS["gatk"] + shell: + "gatk BedToIntervalList --INPUT {input.bed} -SD {input.sd} -O {output.interval_list} > {log} 2>&1" + ##### SigProfiler ##### rule download_sigprofiler_genome: @@ -459,3 +583,4 @@ rule install_sigprofiler_genome: run: op.relative_symlink(input, output.complete) + diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk index c2f0514e..d2326396 100644 --- a/workflows/reference_files/2.4/reference_files_header.smk +++ b/workflows/reference_files/2.4/reference_files_header.smk @@ -44,6 +44,23 @@ VERSION_UPPER = { "grch38": "GRCh38", } +GENOME_VERSION_GROUPS = {} +GENOME_VERSION_MAP = {} +for genome_build in VERSION_UPPER.keys(): + GENOME_VERSION_GROUPS[genome_build] = [] + +# For Starfish SDF files +SDF_VERSION_MAP = {} +SDF_GENOME_BUILDS = [] +SDF_IGNORE = {"grch38", "grch38-legacy", "grch38_masked"} # Ignore non-chr prefixed versions of hg38 since we don't use them +sdf_genome_mappings = { +"GRCh37": {"ensembl": "1000g_v37_phase2.sdf", "ucsc": "hg19.sdf"}, +"GRCh38": {"ucsc": "GRCh38.sdf"} +} + + +DEFAULT_CAPSPACE = {} + #
Check genome build versions, providers, and genome_fasta possible_versions = list(VERSION_UPPER.keys()) possible_providers = ["ensembl", "ucsc", "gencode", "ncbi"] @@ -51,7 +68,11 @@ for build_name, build_info in config["genome_builds"].items(): assert "version" in build_info and build_info["version"] in possible_versions, ( f"`version` not set for `{build_name}` or `version` not among {possible_versions}." ) - assert "provider" in build_info and build_info["provider"] in possible_providers, ( + GENOME_VERSION_GROUPS[build_info["version"]].append(build_name) + upper_genome_name = VERSION_UPPER[build_info["version"]] + GENOME_VERSION_MAP[build_name] = upper_genome_name + genome_provider = build_info["provider"] + assert "provider" in build_info and genome_provider in possible_providers, ( f"`provider` not set for `{build_name}` or `provider` not among {possible_providers}." ) assert "genome_fasta_url" in build_info, f"`genome_fasta_url` not set for `{build_name}`." @@ -60,6 +81,36 @@ for build_name, build_info in config["genome_builds"].items(): f"Pinging `genome_fasta_url` for {build_name} returned HTTP code {url_code} " f"(rather than 200): \n{build_info['genome_fasta_url']}" ) + # Find the appropriate SDF file for this genome build + if build_name not in SDF_IGNORE: + SDF_GENOME_BUILDS.append(build_name) + try: + SDF_VERSION_MAP[build_name] = sdf_genome_mappings[upper_genome_name][genome_provider] + except KeyError as e: + raise AttributeError(f"Unable to locate a Starfish SDF file for genome build \'{upper_genome_name}\' and provider \'{genome_provider}\'") from e + +# Check parent genome, provider for the capture space +for build_name, build_info in config["capture_space"].items(): + assert "provider" in build_info and build_info["provider"] in possible_providers, ( + f"`provider` not set for `{build_name}` or `provider` not among {possible_providers}." + ) + assert "genome" in build_info and build_info["genome"] in possible_versions,( + f"`genome` not set for `{build_name}` or `genome` not among {possible_versions}." ) + assert "capture_bed_url" in build_info + url_code = urllib.request.urlopen(build_info["capture_bed_url"]).getcode() + assert url_code == 200, ( + f"Pinging `capture_bed_url` for {build_name} returned HTTP code {url_code} " + f"(rather than 200): \n{build_info['capture_bed_url']}" + ) + if "default" in build_info: + assert build_info["default"].lower() in ["true", "false"], ( + f"true/false required for the \'default\' field" + ) + build_version = build_info["genome"] + if build_version in DEFAULT_CAPSPACE: + # i.e. a default has already been specified for this genome version! + raise AttributeError("For reference genome version \'%s\', both \'%s\' and \'%s\' were specified as default capture spaces in the reference config" % (build_version, DEFAULT_CAPSPACE[build_version], build_name)) + DEFAULT_CAPSPACE[build_info["genome"]] = build_name ##### TOOLS ##### @@ -122,16 +173,22 @@ for chrom_map_file in CHROM_MAPPINGS_FILES: rule download_genome_fasta: - output: + output: fasta = "downloads/genome_fasta/{genome_build}.fa" - log: + log: "downloads/genome_fasta/{genome_build}.fa.log" wildcard_constraints: genome_build = ".+(?
{output.fasta} 2> {log}" + op.as_one_line(""" + if [ -e {params.path} ]; then + cat {params.path} > {output.fasta} 2> {log}; + else + curl -L {params.path} > {output.fasta} 2> {log}; + fi + """) rule download_masked_genome_fasta: output: @@ -345,13 +402,8 @@ rule download_liftover_chains: rule download_sdf: output: sdf = directory("downloads/sdf/{genome_build}/sdf") - params: - build = lambda w: { - "grch37": "1000g_v37_phase2.sdf", - "hs37d5": "1000g_v37_phase2.sdf", - "hg19": "hg19.sdf", - "hg38": "GRCh38.sdf" - }[w.genome_build] + params: + build = lambda w: SDF_VERSION_MAP[w.genome_build] shell: op.as_one_line(""" wget -qO {output.sdf}.zip https://s3.amazonaws.com/rtg-datasets/references/{params.build}.zip && @@ -377,10 +429,11 @@ def get_matching_download_rules(file): # At least one output file should produce num_matches = [] for output_file in r.output: - assert "{version}" in output_file, ( - f"The `{rule_name}` download rule doesn't have a `{{version}}` " - f"wildcard in the output file ('{output_file}')." - ) + if rule_name != "download_capspace_bed": # Workaround since the capture space downloads don't use the {version} wildcard + assert "{version}" in output_file, ( + f"The `{rule_name}` download rule doesn't have a `{{version}}` " + f"wildcard in the output file ('{output_file}')." + ) + matches = smk.io.glob_wildcards(output_file, [file]) + num_matches.append(len(matches[0])) if any(num > 0 for num in num_matches): @@ -476,6 +529,57 @@ def get_download_file(file): return get_download_file_custom +### CAPTURE SPACE ### +rule download_capspace_bed: + output: + capture_bed = "downloads/capture_space/{capture_space}.{genome_build}.bed" + log: + "downloads/capture_space/{capture_space}.{genome_build}.bed.log" + params: + path = lambda w: config["capture_space"][w.capture_space]["capture_bed_file"] if "capture_bed_file" in config["capture_space"][w.capture_space] else config["capture_space"][w.capture_space]["capture_bed_url"], + provider = lambda w: config["capture_space"][w.capture_space]["provider"] + shell: + op.as_one_line(""" + if [ -e {params.path} ]; then + cat {params.path} > {output.capture_bed} 2> {log}; + else + curl -L {params.path} > {output.capture_bed} 2> {log}; + fi + """) + +rule add_remove_chr_prefix_bed: + input: + capture_bed = rules.download_capspace_bed.output.capture_bed + output: + converted_bed = "downloads/capture_space/{capture_space}.{genome_build}.{chr_status}.bed" + run: + # Converts the specified BED file and adds/removes chr prefixes + if wildcards.chr_status == "chr": # i.e. we need to add a chr prefix + add_chr = True + else: + add_chr = False + + # Process the BED file + with open(input.capture_bed) as f, open(output.converted_bed, "w") as o: + i = 0 + for line in f: + i += 1 + # Make sure that this BED entry is chr-prefixed or not + if add_chr and line.startswith("chr"): + # We were asked to add a chr prefix, but one already exists + raise AttributeError("I was asked to add a \'chr\' prefix to \'%s\', but that BED is already \'chr\' prefixed on line %s: %s" % (input.capture_bed, i, line)) + if not add_chr and not line.startswith("chr"): + # We were asked to remove a chr prefix, but there isn't one + raise AttributeError("I was asked to remove a \'chr\' prefix from \'%s\', but line %s isn't chr prefixed: %s" % (input.capture_bed, i, line)) + + if add_chr: + line = "chr" + line + else: + line = line.replace("chr", "", 1) + + o.write(line) + + ##### SHARED #####
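Taken together, the module-side helpers added in this change (lofreq, manta, mutect2, sage, strelka) all wrap op.get_capture_space in the same input-function pattern. A minimal sketch of that pattern, assuming it runs inside a Snakemake module where config and reference_files are defined; the module key and genome-wide fallback are placeholders rather than values from any one module:

import oncopipe as op

def _example_get_capspace(wildcards):
    # Placeholder module key; each real module uses its own section of the config.
    CFG = config["lcr-modules"]["example_module"]
    # Genome-wide fallback used for non-capture samples and older reference workflows.
    fallback = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed")
    if wildcards.seq_type != "capture":
        # Genome samples have no capture space, so run across the main chromosomes.
        return fallback
    try:
        # Resolve the padded capture file registered for this sample in the sample table.
        this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed")
        return reference_files(this_bed)
    except NameError:
        # Older reference workflows do not provide capture space files; fall back.
        return fallback

In the rules above, such a function is passed directly as an input value (e.g. bed = _lofreq_get_capspace or bedz = _manta_get_capspace), so Snakemake resolves the capture space per sample when the DAG is built.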