From ecaafe5980e50025131d599996a2a3106ea69720 Mon Sep 17 00:00:00 2001 From: ckrushton Date: Thu, 2 Sep 2021 13:24:32 -0700 Subject: [PATCH 01/38] Added support for capture space in reference workflow and slms-3 - Added a capture_space section to the reference config. This lists the name, download URL, and genome build version for that capture space - Every capture space is padded, sorted, merged, and the contig names updated for every genome_build of that version (ex. GRCh38, hg38) - Modified strelka, mutect, manta, sage, and lofreq to obtain the capture space if necessary. If the sample is not "capture", the original implemetation is used --- modules/lofreq/1.1/lofreq.smk | 25 ++- modules/manta/2.3/manta.smk | 17 +- modules/mutect2/2.0/mutect2.smk | 15 +- modules/sage/1.0/sage.smk | 20 ++- modules/strelka/1.1/strelka.smk | 16 +- .../reference_files/2.4/config/default.yaml | 25 +++ .../reference_files/2.4/reference_files.smk | 158 +++++++++++++++++- .../2.4/reference_files_header.smk | 74 ++++++++ 8 files changed, 332 insertions(+), 18 deletions(-) diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk index 471590e5..01be0e37 100644 --- a/modules/lofreq/1.1/lofreq.smk +++ b/modules/lofreq/1.1/lofreq.smk @@ -47,8 +47,6 @@ SCRIPT_PATH = CFG['inputs']['src_dir'] bed = str(reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed")) CFG['switches']['regions_bed']['_default'] = bed - - # Define rules to be run locally when using a compute cluster localrules: _lofreq_input_bam, @@ -56,6 +54,17 @@ localrules: _lofreq_all, _lofreq_link_to_preprocessed +def _lofreq_get_capspace(sample_id, wildcards): + + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + return CFG["switches"]["regions_bed"] + try: + # Get the appropriate capture space for this sample + return get_capture_space(sample_id, wildcards.genome_build, wildcards.seq_type, "bed") + except NameError: + # If we are using an older version of the reference workflow, use the same region file as the genome sample + return CFG["switches"]["regions_bed"] ##### RULES ##### @@ -83,7 +92,6 @@ rule _lofreq_preprocess_normal: normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect - bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"]) output: out_dir = directory(CFG["dirs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/"), preprocessing_start = CFG["dirs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/preprocessing.started", @@ -97,7 +105,8 @@ rule _lofreq_preprocess_normal: stdout = CFG["logs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/lofreq_pre.stdout.log", stderr = CFG["logs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/lofreq_pre.stderr.log" params: - opts = CFG["options"]["lofreq"] + opts = CFG["options"]["lofreq"], + bed = lambda w: _lofreq_get_capspace(w.normal_id, w) conda: CFG["conda_envs"]["lofreq"] threads: @@ -114,7 +123,7 @@ rule _lofreq_preprocess_normal: touch {output.preprocessing_start} && lofreq somatic --normal_only {params.opts} --threads {threads} -t {input.normal_bam} -n {input.normal_bam} - -f {input.fasta} -o {output.out_dir}/ -d {input.dbsnp} --bed {input.bed} + -f {input.fasta} -o {output.out_dir}/ -d {input.dbsnp} --bed {params.bed} > {log.stdout} 2> {log.stderr} && touch {output.preprocessing_complete}; else echo "WARNING: PATH is not set properly, using $(which lofreq2_call_pparallel.py)"; fi @@ -162,7 +171,6 @@ rule _lofreq_run_tumour: vcf_relaxed = rules._lofreq_link_to_preprocessed.output.vcf_relaxed, fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect - bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"]) output: vcf_snvs_filtered = CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.snvs.vcf.gz", vcf_indels_filtered = CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.indels.vcf.gz", @@ -172,7 +180,8 @@ rule _lofreq_run_tumour: stdout = CFG["logs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/lofreq.stdout.log", stderr = CFG["logs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/lofreq.stderr.log" params: - opts = CFG["options"]["lofreq"] + opts = CFG["options"]["lofreq"], + bed = lambda w: _lofreq_get_capspace(w.tumour_id, w) conda: CFG["conda_envs"]["lofreq"] threads: @@ -187,7 +196,7 @@ rule _lofreq_run_tumour: if [[ $(which lofreq2_call_pparallel.py) =~ $SCRIPT ]]; then echo "using bundled patched script $SCRIPT"; lofreq somatic --continue {params.opts} --threads {threads} -t {input.tumour_bam} -n {input.normal_bam} - -f {input.fasta} -o $(dirname {output.vcf_snvs_filtered})/ -d {input.dbsnp} --bed {input.bed} + -f {input.fasta} -o $(dirname {output.vcf_snvs_filtered})/ -d {input.dbsnp} --bed {params.bed} > {log.stdout} 2> {log.stderr}; else echo "WARNING: PATH is not set properly, using $(which lofreq2_call_pparallel.py)"; fi """) diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk index e1ff97f6..562489fd 100644 --- a/modules/manta/2.3/manta.smk +++ b/modules/manta/2.3/manta.smk @@ -83,7 +83,8 @@ rule _manta_configure_paired: stderr = CFG["logs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/manta_configure.stderr.log" params: opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"]), - tumour_bam_arg_name = op.switch_on_wildcard("seq_type", CFG["switches"]["tumour_bam_arg_name"]) + tumour_bam_arg_name = op.switch_on_wildcard("seq_type", CFG["switches"]["tumour_bam_arg_name"]), + bedz = lambda w:_manta_get_capspace(w) wildcard_constraints: pair_status = "matched|unmatched" conda: @@ -110,7 +111,8 @@ rule _manta_configure_unpaired: stderr = CFG["logs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/manta_configure.stderr.log" params: opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"]), - tumour_bam_arg_name = op.switch_on_wildcard("seq_type", CFG["switches"]["tumour_bam_arg_name"]) + tumour_bam_arg_name = op.switch_on_wildcard("seq_type", CFG["switches"]["tumour_bam_arg_name"]), + bedz = lambda w: _manta_get_capspace(w) wildcard_constraints: pair_status = "no_normal" conda: @@ -272,6 +274,17 @@ def _manta_predict_output(wildcards): return outputs_with_bedpe + outputs_without_bedpe +def _manta_get_capspace(wildcards): + + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + return reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes.bed.gz") + try: + # Get the appropriate capture space for this sample + return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + except NameError: + # If we are using an older version of the reference workflow, use the same region file as the genome sample + return reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes.bed.gz") # Generates the target symlinks for each run depending on the Manta output VCF files rule _manta_dispatch: diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index 9d43eb36..3bf5de7e 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -137,7 +137,7 @@ rule _mutect2_run_matched_unmatched: params: mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), opts = CFG["options"]["mutect2_run"], - interval_arg = _mutect2_get_interval_cli_arg() + interval_arg = lambda w: _mutect_get_capspace(w) conda: CFG["conda_envs"]["gatk"] threads: @@ -178,7 +178,7 @@ rule _mutect2_run_no_normal: params: mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), opts = CFG["options"]["mutect2_run"], - interval_arg = _mutect2_get_interval_cli_arg + interval_arg = lambda w: _mutect_get_capspace(w) conda: CFG["conda_envs"]["gatk"] threads: @@ -207,6 +207,17 @@ def _mutect2_get_chr_vcfs(wildcards): ) return(vcfs) +def _mutect_get_capspace(wildcards): + + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + return _mutect2_get_interval_cli_args() + try: + # Get the appropriate capture space for this sample + return " -L " + get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list") + except NameError: + # If we are using an older version of the reference workflow, use the same region file as the genome sample + return _mutect2_get_interval_cli_args() def _mutect2_get_chr_tbis(wildcards): CFG = config["lcr-modules"]["mutect2"] diff --git a/modules/sage/1.0/sage.smk b/modules/sage/1.0/sage.smk index 2204cda7..ee088041 100644 --- a/modules/sage/1.0/sage.smk +++ b/modules/sage/1.0/sage.smk @@ -119,14 +119,25 @@ def get_chromosomes(wildcards): chromosomes= ",".join(chromosomes) return chromosomes +def _sage_get_capspace(wildcards): + + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + return rules._download_sage_references.output.panel_bed + try: + # Get the appropriate capture space for this sample + return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + except NameError: + # If we are using an older version of the reference workflow, use the same region file as the genome sample + return rules._download_sage_references.output.panel_bed + # Variant calling rule rule _run_sage: input: tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", fasta = str(rules._input_references.output.genome_fa), - hotspots = str(rules._download_sage_references.output.hotspots), - panel_bed = str(rules._download_sage_references.output.panel_bed), + hotspots = rules._download_sage_references.output.hotspots, high_conf_bed = str(rules._download_sage_references.output.high_conf_bed) output: vcf = temp(CFG["dirs"]["sage"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.vcf"), @@ -139,7 +150,8 @@ rule _run_sage: chromosomes = get_chromosomes, assembly = lambda w: "hg38" if "38" in str({w.genome_build}) else "hg19", sage= "$(dirname $(readlink -e $(which SAGE)))/sage.jar", - jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8) + jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8), + panel_bed = lambda w: _sage_get_capspace(w) conda: CFG["conda_envs"]["sage"] threads: @@ -162,7 +174,7 @@ rule _run_sage: -assembly {params.assembly} -ref_genome {input.fasta} -hotspots {input.hotspots} - -panel_bed {input.panel_bed} + -panel_bed {params.panel_bed} -high_confidence_bed {input.high_conf_bed} -out {output.vcf} >> {log.stdout} 2>> {log.stderr} diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk index d7beac0a..fe8c429f 100755 --- a/modules/strelka/1.1/strelka.smk +++ b/modules/strelka/1.1/strelka.smk @@ -118,6 +118,7 @@ rule _strelka_configure_paired: # Somatic params: indel_arg = _strelka_get_indel_cli_arg(), opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"]), + bedz = lambda w: _strelka_get_capspace(w) wildcard_constraints: pair_status = "matched|unmatched" conda: @@ -147,7 +148,8 @@ rule _strelka_configure_unpaired: # germline stdout = CFG["logs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/strelka_configure.stdout.log", stderr = CFG["logs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/strelka_configure.stderr.log" params: - opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"]) + opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"]), + bedz = lambda w: _strelka_get_capspace(w) message: "WARNING: {wildcards.seq_type} sample ({wildcards.tumour_id}) is being processed using Strelka Germline workflow. Ensure pairing config for capture is set to run_unpaired_tumours_with: 'unmatched_normal' to run Strelka Somatic workflow" wildcard_constraints: @@ -256,6 +258,18 @@ def _strelka_get_output(wildcards): vcf = str(rules._strelka_filter_combine.output.vcf) return vcf +def _strelka_get_capspace(wildcards): + + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + return reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes.bed.gz") + try: + # Get the appropriate capture space for this sample + return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + except NameError: + # If we are using an older version of the reference workflow, use the same region file as the genome sample + return reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes.bed.gz") + # Symlinks the final output files into the module results directory (under '99-outputs/'). Links will always use "combined" in the name (dropping odd naming convention used by Strelka in unpaired mode) rule _strelka_output_filtered_vcf: input: diff --git a/workflows/reference_files/2.4/config/default.yaml b/workflows/reference_files/2.4/config/default.yaml index 589bd908..7e6a5d26 100644 --- a/workflows/reference_files/2.4/config/default.yaml +++ b/workflows/reference_files/2.4/config/default.yaml @@ -27,6 +27,31 @@ genome_builds: version: "grch37" provider: "ensembl" genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/hs37d5.fa" + hg19-reddy: + # Version of hg19 with chrM at the start, and chrM with a length of 16569bp. Used for the Reddy dataset + version: "grch37" + provider: "ucsc" + genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/hg19-reddy.fa" + grch38-nci: + # NCI's version of GRCh38, with a costco-sized flat pack of decoys + version: "grch38" + provider: "ucsc" + genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/grch38-nci.fa" + +capture_space: + exome-utr-grch38: + genome: "grch38" + provider: "ucsc" + default: "true" + capture_bed_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/capture_bed/grch38_all_genes.canonical.sort.bed4" + exome-utr-grch37: + genome: "grch37" + provider: "ensembl" + default: "true" + capture_bed_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/capture_bed/grch37_all_genes.canonical.sort.bed4" + +capture_params: + padding_size: "200" wildcard_values: gencode_release: ["33"] diff --git a/workflows/reference_files/2.4/reference_files.smk b/workflows/reference_files/2.4/reference_files.smk index fe4a1d44..1745fb96 100644 --- a/workflows/reference_files/2.4/reference_files.smk +++ b/workflows/reference_files/2.4/reference_files.smk @@ -97,7 +97,7 @@ rule get_sdf_refs: output: sdf = directory("genomes/{genome_build}/sdf") wildcard_constraints: - genome_build = "hg38|hg19|grch37|hs37d5" + genome_build = "hg38|hg19|grch37|hs37d5|hg19-reddy" shell: "ln -srfT {input.sdf} {output.sdf}" @@ -381,3 +381,159 @@ rule get_mutect2_small_exac: && tabix {output.vcf} """) + + +### FOR HANDLING CAPTURE SPACE/TARGETED SEQUENCING DATA ### +# Added by Chris +# What this *should* do (if I have written these rules correctly) is obtain a capture space BED file +# check the contig names against the reference (and replace them as necessary), pad, sort,merge, and generate a bgzip/tabix +# indexed version of the BED as well as an interval list. + +def _check_capspace_provider(w): + # Checks to determine if both the capture space BED and associated reference genome + # are chr prefixed or not + + # If this is specified as the "default", make sure we obtain the relevent capture space + default_key = "default-" + w.genome_build + if default_key not in config["capture_space"] and w.capture_space == default_key: + # aka if the user hasn't explicitly specified a default using a default name + capture_space = _get_default_capspace(w) + else: + capture_space = w.capture_space + + genome_provider = config["genome_builds"][w.genome_build]["provider"] + bed_provider = config["capture_space"][capture_space]["provider"] + + # If the providers match (i.e. they share the same prefix), just use the downloaded version + if genome_provider == bed_provider: + return {'bed': expand(rules.download_capspace_bed.output.capture_bed, capture_space=capture_space, genome_build=w.genome_build)} + else: + # Prompt the prefix to be converted + chr_status = "chr" if genome_provider == "ucsc" else "no_chr" + return {'bed': expand(rules.add_remove_chr_prefix_bed.output.converted_bed, capture_space = capture_space, genome_build = w.genome_build, chr_status= chr_status)} + + +def get_capture_space(sample_id, genome_build, seq_type, return_ext): + """ + Obtain the corresponding capture space file (gz, interval list etc) for the specified sample + """ + + # Get the corresponding capture space for a given sample + sample_table = CFG["samples"] + sample = sample_table.loc[(sample_table['sample_id'] == sample_id) & + (sample_table['genome_build'] == genome_build) & + (sample_table['seq_type'] == seq_type)] + if len(sample) != 1: + raise AssertionError("Found %s matches when examining the sample table for pair \'%s\' \'%s\' \'%s\'" % (len(sample), sample_id, genome_build, seq_type)) + panel = sample.iloc[0]['capture_space'] + + # If this panel is "none" (aka not specified) use the default for this reference genome + if panel == "none": + # Get the provider for this version of the reference genome + genome_version = None + for version, builds in GENOME_VERSION_GROUPS.items(): + if genome_build in builds: + assert genome_version is None # Failsafe, but this should never happen, as the reference workflow will not tolerate duplicates (I think) + genome_version = version + if genome_version is None: + raise AttributeError("Unable to find the corresponding genome version (ex. GRCh37) for build \'%s\'" % genome_build) + + try: + panel = DEFAULT_CAPSPACE[genome_version] + except KeyError as e: + raise AttributeError("No default capture space was specified for genome version \'%s\'. You can specify a default by setting \'default=\'true\'\' in a \'%s\'-based capture space in the reference config" % (genome_version, genome_version)) from e + + # Now that we have found the corresponding capture region for this sample, obtain the requested file + return reference_files("genomes/" + genome_build + "/capture_space/" + panel + ".padded." + return_ext) + + +rule get_capspace_bed_download: + input: + unpack(_check_capspace_provider) + output: + capture_bed = "genomes/{genome_build}/capture_space/{capture_space}.bed" + conda: CONDA_ENVS["coreutils"] + shell: + "ln -srf {input.bed} {output.capture_bed}" + + +rule sort_and_pad_capspace: + input: + capture_bed = rules.get_capspace_bed_download.output.capture_bed, + fai = rules.index_genome_fasta.output.fai + output: + padded_bed = "genomes/{genome_build}/capture_space/{capture_space}.padded.bed" + params: + padding_size = config['capture_params']['padding_size'] # Default to 200. I would be warry of changing + log: + "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.log" + conda: CONDA_ENVS["bedtools"] + shell: + "bedtools slop -b {params.padding_size} -i {input.capture_bed} -g {input.fai} | bedtools sort | bedtools merge > {output.padded_bed} 2> {log}" + + +rule check_capspace_contigs: + input: + bed = rules.get_capspace_bed_download.output.capture_bed, + fai = rules.index_genome_fasta.output.fai + output: + contig_log = "genomes/{genome_build}/capture_space/{capture_space}.check_contigs.log" + run: + # Parse BED contigs + bed_contigs = set() + with open(input.bed) as f: + for line in f: + line = line.rstrip() + contig = line.split("\t")[0] + if contig not in bed_contigs: + bed_contigs.add(contig) + + # Parse fai contigs + fai_contigs = set() + with open(input.fai) as f: + for line in f: + line = line.rstrip() + contig = line.split('\t')[0] + if contig not in fai_contigs: + fai_contigs.add(contig) + + # Check the BED file for contigs that are not in the reference genome + missing_contigs = list(x for x in bed_contigs if x not in fai_contigs) + with open(output.contig_log, "w") as o: + if len(missing_contigs) == 0: + o.write("No contigs missing from reference") + else: + o.write("The following contigs were missing from the reference\n") + o.write("\n".join(missing_contigs)) + o.write("\n") + + +rule compress_index_capspace_bed: + input: + capture_bed = rules.sort_and_pad_capspace.output.padded_bed + output: + bgzip_bed = "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.gz", + tabix = "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.gz.tbi" + log: + "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.gz.log" + conda: CONDA_ENVS["tabix"] + shell: + op.as_one_line(""" + bgzip -c {input.capture_bed} > {output.bgzip_bed} + && + tabix -p bed {output.bgzip_bed} + """) + + +rule create_interval_list: + input: + bed = rules.sort_and_pad_capspace.output.padded_bed, + sd = rules.create_gatk_dict.output.dict + output: + interval_list = "genomes/{genome_build}/capture_space/{capture_space}.padded.interval_list" + log: + "genomes/{genome_build}/capture_space/{capture_space}.padded.interval_list.log" + conda: CONDA_ENVS["gatk"] + shell: + "gatk BedToIntervalList --INPUT {input.bed} -SD {input.sd} -O {output.interval_list} > {log} 2>&1" + diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk index 11d6ffa1..9bf892ad 100644 --- a/workflows/reference_files/2.4/reference_files_header.smk +++ b/workflows/reference_files/2.4/reference_files_header.smk @@ -44,6 +44,12 @@ VERSION_UPPER = { "grch38": "GRCh38", } +GENOME_VERSION_GROUPS = {} +for genome_build in VERSION_UPPER.keys(): + GENOME_VERSION_GROUPS[genome_build] = [] + +DEFAULT_CAPSPACE = {} + # Check genome build versions, providers, and genome_fasta possible_versions = list(VERSION_UPPER.keys()) possible_providers = ["ensembl", "ucsc", "gencode", "ncbi"] @@ -61,6 +67,29 @@ for build_name, build_info in config["genome_builds"].items(): f"(rather than 200): \n{build_info['genome_fasta_url']}" ) +# Check parent genome, provider for the capture space +for build_name, build_info in config["capture_space"].items(): + assert "provider" in build_info and build_info["provider"] in possible_providers, ( + f"`provider` not set for `{build_name}` or `provider` not among {possible_providers}." + ) + assert "genome" in build_info and build_info["genome"] in possible_versions,( + f"`genome` not set for `{build_name}` or `genome` not among {possible_versions}." ) + assert "capture_bed_url" in build_info + url_code = urllib.request.urlopen(build_info["capture_bed_url"]).getcode() + assert url_code == 200, ( + f"Pinging `capture_bed_url` for {build_name} returned HTTP code {url_code} " + f"(rather than 200): \n{build_info['capture_bed_url']}" + ) + if "default" in build_info: + assert build_info["default"].lower() in ["true", "false"], ( + f"true/false required for for \'default\' field" + ) + build_version = build_info["genome"] + if build_version in DEFAULT_CAPSPACE: + # i.e. a default has already been specified for this genome version! + raise AttributeError("For reference genome version \'%s\', both \'%s\' and \'%s\' were specified as default capture spaces in the reference config" % (build_version, DEFAULT_CAPSPACE[build_version], build_name)) + DEFAULT_CAPSPACE[build_info["genome"]] = build_name + ##### TOOLS ##### @@ -464,6 +493,51 @@ def get_download_file(file): return get_download_file_custom +### CAPTURE SPACE ### +rule download_capspace_bed: + output: + capture_bed = "downloads/capture_space/{capture_space}.{genome_build}.bed" + log: + "downloads/capture_space/{capture_space}.{genome_build}.bed.log" + params: + url = lambda w: config["capture_space"][w.capture_space]["capture_bed_url"], + provider = lambda w: config["capture_space"][w.capture_space]["provider"] + shell: + "curl -L {params.url} > {output.capture_bed} 2> {log}" + +rule add_remove_chr_prefix_bed: + input: + capture_bed = rules.download_capspace_bed.output.capture_bed + output: + converted_bed = "downloads/capture_space/{capture_space}.{genome_build}.{chr_status}.bed" + run: + # Converts the specified BED file and adds/removes chr prefixes + if wildcards.chr_status == "chr": # i.e. we need to add a chr prefix + add_chr = True + else: + add_chr = False + + # Process the BED file + with open(input.capture_bed) as f, open(output.converted_bed, "w") as o: + i = 0 + for line in f: + i += 1 + # Make sure that this BED entry is chr-prefixed or not + if add_chr and line.startswith("chr"): + # We were asked to add a chr prefix, but one already exists + raise AttributeError("I was asked to add a \'chr\' prefix to \'%s\', but that BED is already \'chr\' prefixed on line %s" % (input.capture_bed, line)) + if not add_chr and not line.startswith("chr"): + # We were asked to remove a chr prefix, but there isn't one + raise AttributeError("I was asked to remove a \'chr\' prefix \'%s\', but it isn't chr prefixed on line %s" % (input.capture_bed, line)) + + if add_chr: + line = "chr" + line + else: + line = line.replace("chr", "") + + o.write(line) + + ##### SHARED ##### From e2828f8b0d2de106a710ee9cc6ecec99b8623a41 Mon Sep 17 00:00:00 2001 From: ckrushton Date: Tue, 7 Sep 2021 09:52:59 -0700 Subject: [PATCH 02/38] Fixed bugs introduced during rebase, modified vcf2maf vcf2maf now works with custom reference genomes --- modules/manta/2.3/manta.smk | 6 ++--- modules/mutect2/2.0/mutect2.smk | 27 ++++++++++--------- modules/strelka/1.1/strelka.smk | 6 ++--- modules/vcf2maf/1.2/vcf2maf.smk | 4 +-- .../reference_files/2.4/config/default.yaml | 6 +++++ .../2.4/reference_files_header.smk | 13 ++++++--- 6 files changed, 36 insertions(+), 26 deletions(-) diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk index 562489fd..7ba36487 100644 --- a/modules/manta/2.3/manta.smk +++ b/modules/manta/2.3/manta.smk @@ -75,7 +75,6 @@ rule _manta_configure_paired: normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), config = op.switch_on_wildcard("seq_type", CFG["switches"]["manta_config"]), - bedz = str(rules._manta_index_bed.output.bedz) output: runwf = CFG["dirs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py" log: @@ -91,7 +90,7 @@ rule _manta_configure_paired: CFG["conda_envs"]["manta"] shell: op.as_one_line(""" - configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {input.bedz} + configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {params.bedz} --runDir "$(dirname {output.runwf})" {params.tumour_bam_arg_name} {input.tumour_bam} --normalBam {input.normal_bam} --config {input.config} > {log.stdout} 2> {log.stderr} """) @@ -103,7 +102,6 @@ rule _manta_configure_unpaired: tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), config = op.switch_on_wildcard("seq_type", CFG["switches"]["manta_config"]), - bedz = str(rules._manta_index_bed.output.bedz) output: runwf = CFG["dirs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py" log: @@ -119,7 +117,7 @@ rule _manta_configure_unpaired: CFG["conda_envs"]["manta"] shell: op.as_one_line(""" - configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {input.bedz} + configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {params.bedz} --runDir "$(dirname {output.runwf})" {params.tumour_bam_arg_name} {input.tumour_bam} --config {input.config} > {log.stdout} 2> {log.stderr} """) diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index 3bf5de7e..94c940b7 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -102,12 +102,12 @@ rule _mutect2_get_sm: # The second replaces wildcards with those used in the rule. def _mutect2_get_interval_cli_arg( vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"], - interval_arg_in = config["lcr-modules"]["mutect2"]["options"]["mutect2_interval_rules"] + #interval_arg_in = config["lcr-modules"]["mutect2"]["options"]["mutect2_interval_rules"] ): def _mutect2_get_interval_cli_custom(wildcards, input): if vcf_in: - param = f"-L {input.candidate_positions} {interval_arg_in}" - else: + param = f"-L {input.candidate_positions}" + else: param = "" return param return _mutect2_get_interval_cli_custom @@ -137,7 +137,8 @@ rule _mutect2_run_matched_unmatched: params: mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), opts = CFG["options"]["mutect2_run"], - interval_arg = lambda w: _mutect_get_capspace(w) + interval_arg = _mutect2_get_interval_cli_arg(), + capture_arg = lambda w: _mutect_get_capspace(w) conda: CFG["conda_envs"]["gatk"] threads: @@ -150,7 +151,7 @@ rule _mutect2_run_matched_unmatched: -I {input.tumour_bam} -I {input.normal_bam} -R {input.fasta} -normal "$(cat {input.normal_sm})" -O {output.vcf} --germline-resource {input.gnomad} - -L {wildcards.chrom} {params.interval_arg} + -L {wildcards.chrom} {params.interval_arg} {params.capture_arg} -pon {input.pon} --f1r2-tar-gz {output.f1r2} > {log.stdout} 2> {log.stderr} """) @@ -178,7 +179,8 @@ rule _mutect2_run_no_normal: params: mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), opts = CFG["options"]["mutect2_run"], - interval_arg = lambda w: _mutect_get_capspace(w) + interval_arg = _mutect2_get_interval_cli_arg(), + capture_arg = lambda w: _mutect_get_capspace(w) conda: CFG["conda_envs"]["gatk"] threads: @@ -190,7 +192,7 @@ rule _mutect2_run_no_normal: gatk Mutect2 --java-options "-Xmx{params.mem_mb}m" {params.opts} -I {input.tumour_bam} -R {input.fasta} -O {output.vcf} --germline-resource {input.gnomad} - -L {wildcards.chrom} {params.interval_arg} + -L {wildcards.chrom} {params.interval_arg} {param.capture_arg} -pon {input.pon} --f1r2-tar-gz {output.f1r2} > {log.stdout} 2> {log.stderr} """) @@ -209,15 +211,16 @@ def _mutect2_get_chr_vcfs(wildcards): def _mutect_get_capspace(wildcards): - # If this is a genome sample, return a BED file listing all chromosomes + # If this isn't a capture sample, we don't have a capture space, so return nothing if wildcards.seq_type != "capture": - return _mutect2_get_interval_cli_args() + return "" try: # Get the appropriate capture space for this sample - return " -L " + get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list") + cap_space = get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list") + return " -L " + cap_space except NameError: - # If we are using an older version of the reference workflow, use the same region file as the genome sample - return _mutect2_get_interval_cli_args() + # If we are using an older version of the reference workflow, we don't need to do anything + return "" def _mutect2_get_chr_tbis(wildcards): CFG = config["lcr-modules"]["mutect2"] diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk index fe8c429f..d07bc04f 100755 --- a/modules/strelka/1.1/strelka.smk +++ b/modules/strelka/1.1/strelka.smk @@ -108,7 +108,6 @@ rule _strelka_configure_paired: # Somatic tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), - bedz = str(rules._strelka_index_bed.output.bedz), indels = str(rules._strelka_input_vcf.output.vcf) if CFG["inputs"]["candidate_small_indels"] else str(rules._strelka_dummy_vcf.output) output: runwf = CFG["dirs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py" @@ -129,7 +128,7 @@ rule _strelka_configure_paired: # Somatic --normalBam={input.normal_bam} --tumorBam={input.tumour_bam} --referenceFasta={input.fasta} - --callRegions={input.bedz} + --callRegions={params.bedz} --runDir=$(dirname {output.runwf}) {params.indel_arg} {params.opts} @@ -141,7 +140,6 @@ rule _strelka_configure_unpaired: # germline input: tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), - bedz = str(rules._strelka_index_bed.output.bedz) output: runwf = CFG["dirs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py" log: @@ -161,7 +159,7 @@ rule _strelka_configure_unpaired: # germline configureStrelkaGermlineWorkflow.py --bam={input.tumour_bam} --referenceFasta={input.fasta} - --callRegions={input.bedz} + --callRegions={params.bedz} --runDir=$(dirname {output.runwf}) {params.opts} > {log.stdout} 2> {log.stderr} diff --git a/modules/vcf2maf/1.2/vcf2maf.smk b/modules/vcf2maf/1.2/vcf2maf.smk index d53c840b..ff352329 100644 --- a/modules/vcf2maf/1.2/vcf2maf.smk +++ b/modules/vcf2maf/1.2/vcf2maf.smk @@ -69,7 +69,7 @@ rule _vcf2maf_run: stderr = CFG["logs"]["vcf2maf"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{base_name}_vcf2maf.stderr.log", params: opts = CFG["options"]["vcf2maf"], - build = lambda w: VERSION_MAP[w.genome_build], + build = lambda w: GENOME_VERSION_MAP[w.genome_build], custom_enst = op.switch_on_wildcard("genome_build", CFG["switches"]["custom_enst"]) conda: CFG["conda_envs"]["vcf2maf"] @@ -161,4 +161,4 @@ rule _vcf2maf_all: # Perform some clean-up tasks, including storing the module-specific # configuration on disk and deleting the `CFG` variable -op.cleanup_module(CFG) \ No newline at end of file +op.cleanup_module(CFG) diff --git a/workflows/reference_files/2.4/config/default.yaml b/workflows/reference_files/2.4/config/default.yaml index 7e6a5d26..db023ba9 100644 --- a/workflows/reference_files/2.4/config/default.yaml +++ b/workflows/reference_files/2.4/config/default.yaml @@ -62,6 +62,9 @@ wildcard_values: rm_version: ["hg19", "hg38"] tools: + bedtools: + conda_env: "envs/bedtools-2.29.2.yaml" + version: "2.29.2" coreutils: conda_env: "envs/coreutils-8.31.yaml" version: "8.31" @@ -71,6 +74,9 @@ tools: samtools: conda_env: "envs/samtools-1.9.yaml" version: "1.9" + tabix: + conda_env: "envs/tabix-0.2.6.yaml" + version: "0.2.6" bwa: conda_env: "envs/bwa-0.7.17.yaml" version: "0.7.17" diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk index 9bf892ad..a966be7a 100644 --- a/workflows/reference_files/2.4/reference_files_header.smk +++ b/workflows/reference_files/2.4/reference_files_header.smk @@ -45,6 +45,7 @@ VERSION_UPPER = { } GENOME_VERSION_GROUPS = {} +GENOME_VERSION_MAP = {} for genome_build in VERSION_UPPER.keys(): GENOME_VERSION_GROUPS[genome_build] = [] @@ -57,6 +58,9 @@ for build_name, build_info in config["genome_builds"].items(): assert "version" in build_info and build_info["version"] in possible_versions, ( f"`version` not set for `{build_name}` or `version` not among {possible_versions}." ) + GENOME_VERSION_GROUPS[build_info["version"]].append(build_name) + upper_genome_name = VERSION_UPPER[build_info["version"]] + GENOME_VERSION_MAP[build_name] = upper_genome_name assert "provider" in build_info and build_info["provider"] in possible_providers, ( f"`provider` not set for `{build_name}` or `provider` not among {possible_providers}." ) @@ -394,10 +398,11 @@ def get_matching_download_rules(file): # At least one output file should produce num_matches = [] for output_file in r.output: - assert "{version}" in output_file, ( - f"The `{rule_name}` download rule doesn't have a `{{version}}` " - f"wildcard in the output file ('{output_file}')." - ) + if rule_name != "download_capspace_bed": # Workaround since we don't really care about the provider for the capture space + assert "{version}" in output_file, ( + f"The `{rule_name}` download rule doesn't have a `{{version}}` " + f"wildcard in the output file ('{output_file}')." + ) matches = smk.io.glob_wildcards(output_file, [file]) num_matches.append(len(matches[0])) if any(num > 0 for num in num_matches): From 52d74823eb31e65955f5c2afc3eba4ef548babe0 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Fri, 3 Dec 2021 16:09:06 -0800 Subject: [PATCH 03/38] drop unnecessary change --- modules/slms_3/1.0/config/default.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/slms_3/1.0/config/default.yaml b/modules/slms_3/1.0/config/default.yaml index 718fb8f2..32157c8e 100644 --- a/modules/slms_3/1.0/config/default.yaml +++ b/modules/slms_3/1.0/config/default.yaml @@ -110,7 +110,7 @@ lcr-modules: scratch_subdirectories: [] options: - sage_run: "-validation_stringency LENIENT" + sage_run: "" resources: sage_run: From eb56fe54ec7c5f479e4948daae70a4fc99456665 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Mon, 6 Dec 2021 12:17:32 -0800 Subject: [PATCH 04/38] allow the widcard switch for custom bed in lofreq --- modules/lofreq/1.1/lofreq.smk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk index 0a65b358..a6cde654 100644 --- a/modules/lofreq/1.1/lofreq.smk +++ b/modules/lofreq/1.1/lofreq.smk @@ -60,15 +60,16 @@ localrules: def _lofreq_get_capspace(wildcards): + custom_bed = config["lcr-modules"]["lofreq"]['switches']['regions_bed'][wildcards.seq_type] # If this is a genome sample, return a BED file listing all chromosomes if wildcards.seq_type != "capture": - return str(config["lcr-modules"]["lofreq"]["switches"]["regions_bed"]['_default']) + return str(custom_bed) if custom_bed else str(bed) try: # Get the appropriate capture space for this sample return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed") except NameError: # If we are using an older version of the reference workflow, use the same region file as the genome sample - return str(config["lcr-modules"]["lofreq"]["switches"]["regions_bed"]['_default']) + return str(custom_bed) if custom_bed else str(bed) ##### RULES ##### From 79b5286a1edf63f12d2f588b12bc94bea0a242c8 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Mon, 6 Dec 2021 12:21:31 -0800 Subject: [PATCH 05/38] remove commented line used for testing purposes --- modules/mutect2/2.0/mutect2.smk | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index 52ff9fcb..0bfdc14e 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -123,8 +123,7 @@ rule _mutect2_get_sm: # The first function loads the wildcard-containing file path and additional args from the config. # The second replaces wildcards with those used in the rule. def _mutect2_get_interval_cli_arg( - vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"], - #interval_arg_in = config["lcr-modules"]["mutect2"]["options"]["mutect2_interval_rules"] + vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"] ): def _mutect2_get_interval_cli_custom(wildcards, input): if vcf_in: From 57adf033c256b2ec11e3576e67e93ec2e17aa615 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Mon, 6 Dec 2021 12:26:14 -0800 Subject: [PATCH 06/38] rename global variable to be module-specific --- modules/vcf2maf/1.2/vcf2maf.smk | 4 ++-- modules/vcf2maf/1.3/vcf2maf.smk | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/vcf2maf/1.2/vcf2maf.smk b/modules/vcf2maf/1.2/vcf2maf.smk index 2b357647..6d5e48b0 100644 --- a/modules/vcf2maf/1.2/vcf2maf.smk +++ b/modules/vcf2maf/1.2/vcf2maf.smk @@ -51,7 +51,7 @@ localrules: _vcf2maf_crossmap, _vcf2maf_all -VERSION_MAP = { +VCF2MAF_GENOME_VERSION_MAP = { "grch37": "GRCh37", "hg38": "GRCh38", "hs37d5": "GRCh37" @@ -89,7 +89,7 @@ rule _vcf2maf_run: stderr = CFG["logs"]["vcf2maf"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{base_name}_vcf2maf.stderr.log", params: opts = CFG["options"]["vcf2maf"], - build = lambda w: GENOME_VERSION_MAP[w.genome_build], + build = lambda w: VCF2MAF_GENOME_VERSION_MAP[w.genome_build], custom_enst = op.switch_on_wildcard("genome_build", CFG["switches"]["custom_enst"]) conda: CFG["conda_envs"]["vcf2maf"] diff --git a/modules/vcf2maf/1.3/vcf2maf.smk b/modules/vcf2maf/1.3/vcf2maf.smk index 17ba2f69..4a38f18d 100644 --- a/modules/vcf2maf/1.3/vcf2maf.smk +++ b/modules/vcf2maf/1.3/vcf2maf.smk @@ -31,7 +31,7 @@ localrules: _vcf2maf_crossmap, _vcf2maf_all -VERSION_MAP = { +VCF2MAF_GENOME_VERSION_MAP = { "grch37": "GRCh37", "hg38": "GRCh38", "hs37d5": "GRCh37" @@ -83,7 +83,7 @@ rule _vcf2maf_run: stderr = CFG["logs"]["vcf2maf"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{base_name}_vcf2maf.stderr.log", params: opts = CFG["options"]["vcf2maf"], - build = lambda w: VERSION_MAP[w.genome_build], + build = lambda w: VCF2MAF_GENOME_VERSION_MAP[w.genome_build], custom_enst = op.switch_on_wildcard("genome_build", CFG["switches"]["custom_enst"]) conda: CFG["conda_envs"]["vcf2maf"] From 7611f23ec214bd7d646e24bd34ab4d9a34b213de Mon Sep 17 00:00:00 2001 From: Kdreval Date: Mon, 6 Dec 2021 12:50:00 -0800 Subject: [PATCH 07/38] make global wildcard constraints rule-specific --- modules/strelka/1.1/strelka.smk | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk index 7de97e6a..439d17ae 100755 --- a/modules/strelka/1.1/strelka.smk +++ b/modules/strelka/1.1/strelka.smk @@ -53,10 +53,7 @@ localrules: _strelka_configure_unpaired, _strelka_filter_combine, _strelka_output_filtered_vcf, - _strelka_all, - -wildcard_constraints: - var_type = "somatic.snvs|somatic.indels|variants" + _strelka_all ##### RULES ##### @@ -266,6 +263,8 @@ rule _strelka_filter_combine: log: stdout = CFG["logs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/strelka_filter_combine.stdout.log", stderr = CFG["logs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/strelka_filter_combine.stderr.log" + wildcard_constraints: + var_type = "somatic.snvs|somatic.indels|variants" shell: op.as_one_line(""" bcftools concat -a {input.vcf} | From 783515cddefdb6c23f361108a5467823a7a447a5 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Mon, 6 Dec 2021 13:48:32 -0800 Subject: [PATCH 08/38] handle edge case when dictionary key is missing --- modules/lofreq/1.1/lofreq.smk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk index a6cde654..ea92050a 100644 --- a/modules/lofreq/1.1/lofreq.smk +++ b/modules/lofreq/1.1/lofreq.smk @@ -60,7 +60,10 @@ localrules: def _lofreq_get_capspace(wildcards): - custom_bed = config["lcr-modules"]["lofreq"]['switches']['regions_bed'][wildcards.seq_type] + if str(wildcards.seq_type) in config["lcr-modules"]["lofreq"]['switches']['regions_bed'].keys(): + custom_bed = config["lcr-modules"]["lofreq"]['switches']['regions_bed'][wildcards.seq_type] + else: + custom_bed = bed # If this is a genome sample, return a BED file listing all chromosomes if wildcards.seq_type != "capture": return str(custom_bed) if custom_bed else str(bed) From e179f92f4722508048a0060de2ae7d58ab7f9fab Mon Sep 17 00:00:00 2001 From: Kdreval Date: Tue, 7 Dec 2021 11:41:41 -0800 Subject: [PATCH 09/38] implement fix for empty vcf files from previous version --- modules/vcf2maf/1.3/vcf2maf.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/vcf2maf/1.3/vcf2maf.smk b/modules/vcf2maf/1.3/vcf2maf.smk index 4a38f18d..1cf2a63b 100644 --- a/modules/vcf2maf/1.3/vcf2maf.smk +++ b/modules/vcf2maf/1.3/vcf2maf.smk @@ -115,7 +115,8 @@ rule _vcf2maf_run: --custom-enst {params.custom_enst} --retain-info gnomADg_AF >> {log.stdout} 2>> {log.stderr}; - else echo "WARNING: PATH is not set properly, using $(which vcf2maf.pl) will result in error during execution. Please ensure $VCF2MAF_SCRIPT exists." > {log.stderr};fi + else echo "WARNING: PATH is not set properly, using $(which vcf2maf.pl) will result in error during execution. Please ensure $VCF2MAF_SCRIPT exists." > {log.stderr};fi && + touch {output.vep} """) rule _vcf2maf_gnomad_filter_maf: From 0028ec5eca05915662076cf161d77abfb5ccb4cb Mon Sep 17 00:00:00 2001 From: Kdreval Date: Tue, 7 Dec 2021 12:46:05 -0800 Subject: [PATCH 10/38] add missing genome build to config --- workflows/reference_files/2.4/config/default.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/workflows/reference_files/2.4/config/default.yaml b/workflows/reference_files/2.4/config/default.yaml index a8fb6fd0..ee4ece6e 100644 --- a/workflows/reference_files/2.4/config/default.yaml +++ b/workflows/reference_files/2.4/config/default.yaml @@ -57,6 +57,11 @@ genome_builds: version: "grch38" provider: "ucsc" genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/grch38-nci.fa" + hg38-panea: + # Version of hg38 used by Panea et al. + version: "grch38" + provider: "ucsc" + genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/hg38-panea.fa" capture_space: exome-utr-grch38: From 7abd27f9fc65564de7542e31ab7ae4d90d638a9a Mon Sep 17 00:00:00 2001 From: Kdreval Date: Thu, 30 Dec 2021 08:23:32 -0800 Subject: [PATCH 11/38] handle panea sdf files --- modules/starfish/2.0/starfish.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/starfish/2.0/starfish.smk b/modules/starfish/2.0/starfish.smk index d3213e5a..9ead0453 100644 --- a/modules/starfish/2.0/starfish.smk +++ b/modules/starfish/2.0/starfish.smk @@ -112,7 +112,7 @@ rule _starfish_run: CFG["dirs"]["inputs"] + "vcf/{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}.{caller}.vcf.gz.tbi", caller = callers ), - reference = ancient(reference_files("genomes/{genome_build}/sdf")), + reference = lambda w: ancient(reference_files("genomes/{genome_build}/sdf")) if not w.genome_build=="hg38-panea" else ancient(reference_files("genomes/hg38/sdf")), starfish_script = CFG["inputs"]["starfish_script"] output: complete = touch(CFG["dirs"]["starfish"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/starfish.complete"), From 4b7d69e31c8fa31f1f8203f39cbcad92a947eacc Mon Sep 17 00:00:00 2001 From: Kdreval Date: Thu, 30 Dec 2021 08:29:40 -0800 Subject: [PATCH 12/38] unwrap reference file from str --- modules/lofreq/1.1/lofreq.smk | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk index ea92050a..ea5eb997 100644 --- a/modules/lofreq/1.1/lofreq.smk +++ b/modules/lofreq/1.1/lofreq.smk @@ -59,20 +59,21 @@ localrules: _lofreq_link_to_preprocessed def _lofreq_get_capspace(wildcards): - - if str(wildcards.seq_type) in config["lcr-modules"]["lofreq"]['switches']['regions_bed'].keys(): - custom_bed = config["lcr-modules"]["lofreq"]['switches']['regions_bed'][wildcards.seq_type] + CFG=config["lcr-modules"]["lofreq"] + default_bed = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed") + if str(wildcards.seq_type) in CFG['switches']['regions_bed'].keys(): + custom_bed = CFG['switches']['regions_bed'][wildcards.seq_type] else: - custom_bed = bed + custom_bed = default_bed # If this is a genome sample, return a BED file listing all chromosomes if wildcards.seq_type != "capture": - return str(custom_bed) if custom_bed else str(bed) + return custom_bed if custom_bed else default_bed try: # Get the appropriate capture space for this sample return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed") except NameError: # If we are using an older version of the reference workflow, use the same region file as the genome sample - return str(custom_bed) if custom_bed else str(bed) + return custom_bed if custom_bed else default_bed ##### RULES ##### From fd3acfe2e365bcfcfd7901ad55ea7da828194677 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Fri, 7 Jan 2022 15:47:48 -0800 Subject: [PATCH 13/38] rename grch38-nci into ng38-nci --- modules/starfish/2.0/starfish.smk | 2 +- workflows/reference_files/2.4/config/default.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/starfish/2.0/starfish.smk b/modules/starfish/2.0/starfish.smk index 9ead0453..f63d6988 100644 --- a/modules/starfish/2.0/starfish.smk +++ b/modules/starfish/2.0/starfish.smk @@ -112,7 +112,7 @@ rule _starfish_run: CFG["dirs"]["inputs"] + "vcf/{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}.{caller}.vcf.gz.tbi", caller = callers ), - reference = lambda w: ancient(reference_files("genomes/{genome_build}/sdf")) if not w.genome_build=="hg38-panea" else ancient(reference_files("genomes/hg38/sdf")), + reference = lambda w: ancient(reference_files("genomes/{genome_build}/sdf")) if not w.genome_build in ["hg38-panea", "hg38-nci"] else ancient(reference_files("genomes/hg38/sdf")), starfish_script = CFG["inputs"]["starfish_script"] output: complete = touch(CFG["dirs"]["starfish"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/starfish.complete"), diff --git a/workflows/reference_files/2.4/config/default.yaml b/workflows/reference_files/2.4/config/default.yaml index ee4ece6e..a3232434 100644 --- a/workflows/reference_files/2.4/config/default.yaml +++ b/workflows/reference_files/2.4/config/default.yaml @@ -52,7 +52,7 @@ genome_builds: version: "grch37" provider: "ucsc" genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/hg19-reddy.fa" - grch38-nci: + hg38-nci: # NCI's version of GRCh38, with a costco-sized flat pack of decoys version: "grch38" provider: "ucsc" From c2fdf033378d8e622a8e6a89c057c06773b00352 Mon Sep 17 00:00:00 2001 From: ckrushton Date: Mon, 10 Jan 2022 10:20:02 -0800 Subject: [PATCH 14/38] Added support for generating SDF files with additional genome builds --- .../reference_files/2.4/reference_files.smk | 4 +-- .../2.4/reference_files_header.smk | 33 +++++++++++++------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/workflows/reference_files/2.4/reference_files.smk b/workflows/reference_files/2.4/reference_files.smk index 1745fb96..e12abc24 100644 --- a/workflows/reference_files/2.4/reference_files.smk +++ b/workflows/reference_files/2.4/reference_files.smk @@ -97,7 +97,7 @@ rule get_sdf_refs: output: sdf = directory("genomes/{genome_build}/sdf") wildcard_constraints: - genome_build = "hg38|hg19|grch37|hs37d5|hg19-reddy" + genome_build = "|".join(SDF_GENOME_BUILDS) shell: "ln -srfT {input.sdf} {output.sdf}" @@ -428,7 +428,7 @@ def get_capture_space(sample_id, genome_build, seq_type, return_ext): panel = sample.iloc[0]['capture_space'] # If this panel is "none" (aka not specified) use the default for this reference genome - if panel == "none": + if panel == "none" or panel is None or panel == "": # Get the provider for this version of the reference genome genome_version = None for version, builds in GENOME_VERSION_GROUPS.items(): diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk index a966be7a..a5372e87 100644 --- a/workflows/reference_files/2.4/reference_files_header.smk +++ b/workflows/reference_files/2.4/reference_files_header.smk @@ -45,10 +45,20 @@ VERSION_UPPER = { } GENOME_VERSION_GROUPS = {} -GENOME_VERSION_MAP = {} +VCF2MAF_GENOME_VERSION_MAP = {} for genome_build in VERSION_UPPER.keys(): GENOME_VERSION_GROUPS[genome_build] = [] +# For Starfish SDF files +SDF_VERSION_MAP = {} +SDF_GENOME_BUILDS = [] +SDF_IGNORE = {"grch38", "grch38-legacy"} # Ignore non-chr prefixed versions of hg38 since we don't use them +sdf_genome_mappings = { +"GRCh37": {"ensembl": "1000g_v37_phase2.sdf", "ucsc": "hg19.sdf"}, +"GRCh38": {"ucsc": "GRCh38.sdf"} +} + + DEFAULT_CAPSPACE = {} # Check genome build versions, providers, and genome_fasta @@ -60,8 +70,9 @@ for build_name, build_info in config["genome_builds"].items(): ) GENOME_VERSION_GROUPS[build_info["version"]].append(build_name) upper_genome_name = VERSION_UPPER[build_info["version"]] - GENOME_VERSION_MAP[build_name] = upper_genome_name - assert "provider" in build_info and build_info["provider"] in possible_providers, ( + VCF2MAF_GENOME_VERSION_MAP[build_name] = upper_genome_name + genome_provider = build_info["provider"] + assert "provider" in build_info and genome_provider in possible_providers, ( f"`provider` not set for `{build_name}` or `provider` not among {possible_providers}." ) assert "genome_fasta_url" in build_info, f"`genome_fasta_url` not set for `{build_name}`." @@ -70,6 +81,13 @@ for build_name, build_info in config["genome_builds"].items(): f"Pinging `genome_fasta_url` for {build_name} returned HTTP code {url_code} " f"(rather than 200): \n{build_info['genome_fasta_url']}" ) + # Find the appropriate SDF file for this genome build + if build_name not in SDF_IGNORE: + SDF_GENOME_BUILDS.append(build_name) + try: + SDF_VERSION_MAP[build_name] = sdf_genome_mappings[upper_genome_name][genome_provider] + except KeyError as e: + raise AttributeError(f"Unable to locate a Starfish SDF file for genome build \'{upper_genome_name}\' and provider \'{genome_provider}\'") from e # Check parent genome, provider for the capture space for build_name, build_info in config["capture_space"].items(): @@ -366,13 +384,8 @@ rule download_liftover_chains: rule download_sdf: output: sdf = directory("downloads/sdf/{genome_build}/sdf") - params: - build = lambda w: { - "grch37": "1000g_v37_phase2.sdf", - "hs37d5": "1000g_v37_phase2.sdf", - "hg19": "hg19.sdf", - "hg38": "GRCh38.sdf" - }[w.genome_build] + params: + build = lambda w: SDF_VERSION_MAP[w.genome_build] shell: op.as_one_line(""" wget -qO {output.sdf}.zip https://s3.amazonaws.com/rtg-datasets/references/{params.build}.zip && From b956c089dfec111f2b63ba8f42c2c422b9e7eb7d Mon Sep 17 00:00:00 2001 From: ckrushton Date: Mon, 10 Jan 2022 10:20:44 -0800 Subject: [PATCH 15/38] Added support for additional genome builds --- modules/vcf2maf/1.2/vcf2maf.smk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/vcf2maf/1.2/vcf2maf.smk b/modules/vcf2maf/1.2/vcf2maf.smk index ff352329..9163c5e9 100644 --- a/modules/vcf2maf/1.2/vcf2maf.smk +++ b/modules/vcf2maf/1.2/vcf2maf.smk @@ -69,7 +69,7 @@ rule _vcf2maf_run: stderr = CFG["logs"]["vcf2maf"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{base_name}_vcf2maf.stderr.log", params: opts = CFG["options"]["vcf2maf"], - build = lambda w: GENOME_VERSION_MAP[w.genome_build], + build = lambda w: VCF2MAF_GENOME_VERSION_MAP[w.genome_build], custom_enst = op.switch_on_wildcard("genome_build", CFG["switches"]["custom_enst"]) conda: CFG["conda_envs"]["vcf2maf"] @@ -91,7 +91,8 @@ rule _vcf2maf_run: --vep-data {input.vep_cache} --vep-path $vepPATH {params.opts} --custom-enst {params.custom_enst} - > {log.stdout} 2> {log.stderr} + > {log.stdout} 2> {log.stderr} && + touch {output.vep} """) From 5d08414fed199152c77b3554bee5efc55d21fb60 Mon Sep 17 00:00:00 2001 From: ckrushton Date: Mon, 10 Jan 2022 10:51:09 -0800 Subject: [PATCH 16/38] Ignore grch38-masked when generating SDF files --- workflows/reference_files/2.4/reference_files_header.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk index 0f6f0073..74e4c028 100644 --- a/workflows/reference_files/2.4/reference_files_header.smk +++ b/workflows/reference_files/2.4/reference_files_header.smk @@ -52,7 +52,7 @@ for genome_build in VERSION_UPPER.keys(): # For Starfish SDF files SDF_VERSION_MAP = {} SDF_GENOME_BUILDS = [] -SDF_IGNORE = {"grch38", "grch38-legacy"} # Ignore non-chr prefixed versions of hg38 since we don't use them +SDF_IGNORE = {"grch38", "grch38-legacy", "grch38_masked"} # Ignore non-chr prefixed versions of hg38 since we don't use them sdf_genome_mappings = { "GRCh37": {"ensembl": "1000g_v37_phase2.sdf", "ucsc": "hg19.sdf"}, "GRCh38": {"ucsc": "GRCh38.sdf"} From da52375a76d03c99cbe7f2b208258285ab929c47 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Tue, 11 Jan 2022 05:40:13 -0800 Subject: [PATCH 17/38] drop vcf2maf from variable name --- workflows/reference_files/2.4/reference_files_header.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk index 74e4c028..9c1581d3 100644 --- a/workflows/reference_files/2.4/reference_files_header.smk +++ b/workflows/reference_files/2.4/reference_files_header.smk @@ -45,7 +45,7 @@ VERSION_UPPER = { } GENOME_VERSION_GROUPS = {} -VCF2MAF_GENOME_VERSION_MAP = {} +GENOME_VERSION_MAP = {} for genome_build in VERSION_UPPER.keys(): GENOME_VERSION_GROUPS[genome_build] = [] @@ -70,7 +70,7 @@ for build_name, build_info in config["genome_builds"].items(): ) GENOME_VERSION_GROUPS[build_info["version"]].append(build_name) upper_genome_name = VERSION_UPPER[build_info["version"]] - VCF2MAF_GENOME_VERSION_MAP[build_name] = upper_genome_name + GENOME_VERSION_MAP[build_name] = upper_genome_name genome_provider = build_info["provider"] assert "provider" in build_info and genome_provider in possible_providers, ( f"`provider` not set for `{build_name}` or `provider` not among {possible_providers}." From d05536418f3ae2b87e3db7bcb34ae7b99ce5ef81 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Tue, 11 Jan 2022 05:44:02 -0800 Subject: [PATCH 18/38] add capspace function to oncopipe --- oncopipe/oncopipe/__init__.py | 49 +++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/oncopipe/oncopipe/__init__.py b/oncopipe/oncopipe/__init__.py index cc08d043..fe3d828f 100755 --- a/oncopipe/oncopipe/__init__.py +++ b/oncopipe/oncopipe/__init__.py @@ -1781,3 +1781,52 @@ def cleanup_module(module_config): # Add back the TSV fields for field in tsv_fields.keys(): module_config[field] = tsv_fields[field] + + +# Kostia functions +def get_capture_space(module_config, sample_id, genome_build, seq_type, return_ext): + """Returns path to the file with capspace to be used for genome build. + + Parameters + ---------- + module_config : dict + The module-specific configuration. + sample_id : str + The id for a specific sample for which capture space should be returned. + Allows for both normal and tumour id. + genome_build : str + The specific genome build for which to return capture space. + Allows to unambiguously handle samples aligned to different genome versions. + seq_type : str + The soecific seq type for which to return capture space. + return_ext : str + The extension of the capture space file to be returned (.bed, .vcf, .vcf.gz). + + Returns + ------- + str + The path to a file relative to the reference files parental directory. + """ + + # Convenient variable to access sample table + module_samples = module_config["samples"] + + this_sample = module_samples.loc[(module_samples['sample_id'] == sample_id) & + (module_samples['genome_build'] == genome_build) & + (module_samples['seq_type'] == seq_type)] + + if len(this_sample) != 1: + raise AssertionError("Found %s matches when examining the sample table for pair \'%s\' \'%s\' \'%s\'" % (len(sample), sample_id, genome_build, seq_type)) + panel = this_sample.iloc[0]['capture_space'] + + # If this panel is "none" (aka not specified) use the default for this reference genome + if panel.upper() in (name.upper() for name in ['none', "na", "n/a", ""]): + try: + if "38" in genome_build: + panel = "exome-utr-grch38" + else: + panel = "exome-utr-grch37" + except KeyError as e: + raise AttributeError("No default capture space was specified for genome version \'%s\'. You can specify a default by setting \'default=\'true\'\' in a \'%s\'-based capture space in the reference config" % (genome_version, genome_version)) from e + # Now that we have found the corresponding capture region for this sample, obtain the requested file + return "genomes/" + genome_build + "/capture_space/" + panel + ".padded." + return_ext From 1f00b34ceb591cd036d0e8dad80d106e9b248e6b Mon Sep 17 00:00:00 2001 From: Kdreval Date: Tue, 11 Jan 2022 05:46:20 -0800 Subject: [PATCH 19/38] drop redundant function from reference files --- .../reference_files/2.4/reference_files.smk | 34 ------------------- 1 file changed, 34 deletions(-) diff --git a/workflows/reference_files/2.4/reference_files.smk b/workflows/reference_files/2.4/reference_files.smk index 39088241..7962f0d2 100644 --- a/workflows/reference_files/2.4/reference_files.smk +++ b/workflows/reference_files/2.4/reference_files.smk @@ -459,40 +459,6 @@ def _check_capspace_provider(w): return {'bed': expand(rules.add_remove_chr_prefix_bed.output.converted_bed, capture_space = capture_space, genome_build = w.genome_build, chr_status= chr_status)} -def get_capture_space(sample_id, genome_build, seq_type, return_ext): - """ - Obtain the corresponding capture space file (gz, interval list etc) for the specified sample - """ - - # Get the corresponding capture space for a given sample - sample_table = CFG["samples"] - sample = sample_table.loc[(sample_table['sample_id'] == sample_id) & - (sample_table['genome_build'] == genome_build) & - (sample_table['seq_type'] == seq_type)] - if len(sample) != 1: - raise AssertionError("Found %s matches when examining the sample table for pair \'%s\' \'%s\' \'%s\'" % (len(sample), sample_id, genome_build, seq_type)) - panel = sample.iloc[0]['capture_space'] - - # If this panel is "none" (aka not specified) use the default for this reference genome - if panel == "none" or panel is None or panel == "": - # Get the provider for this version of the reference genome - genome_version = None - for version, builds in GENOME_VERSION_GROUPS.items(): - if genome_build in builds: - assert genome_version is None # Failsafe, but this should never happen, as the reference workflow will not tolerate duplicates (I think) - genome_version = version - if genome_version is None: - raise AttributeError("Unable to find the corresponding genome version (ex. GRCh37) for build \'%s\'" % genome_build) - - try: - panel = DEFAULT_CAPSPACE[genome_version] - except KeyError as e: - raise AttributeError("No default capture space was specified for genome version \'%s\'. You can specify a default by setting \'default=\'true\'\' in a \'%s\'-based capture space in the reference config" % (genome_version, genome_version)) from e - - # Now that we have found the corresponding capture region for this sample, obtain the requested file - return reference_files("genomes/" + genome_build + "/capture_space/" + panel + ".padded." + return_ext) - - rule get_capspace_bed_download: input: unpack(_check_capspace_provider) From 9ac63c989975eeb55aedbc198cbd30a3bca355e4 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Tue, 11 Jan 2022 06:02:22 -0800 Subject: [PATCH 20/38] switch modules to oncopipe finction --- modules/lofreq/1.1/lofreq.smk | 9 +++++++-- modules/manta/2.3/manta.smk | 10 ++++++---- modules/mutect2/2.0/mutect2.smk | 4 ++-- modules/sage/1.0/sage.smk | 10 ++++++---- modules/strelka/1.1/strelka.smk | 11 +++++++---- 5 files changed, 28 insertions(+), 16 deletions(-) diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk index ea5eb997..8f120b93 100644 --- a/modules/lofreq/1.1/lofreq.smk +++ b/modules/lofreq/1.1/lofreq.smk @@ -69,11 +69,16 @@ def _lofreq_get_capspace(wildcards): if wildcards.seq_type != "capture": return custom_bed if custom_bed else default_bed try: + if "tumour_id" in wildcards.keys(): # Get the appropriate capture space for this sample - return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed") + this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed") + else: + this_bed = op.get_capture_space(CFG, wildcards.normal_id, wildcards.genome_build, wildcards.seq_type, "bed") + this_bed = reference_files(this_bed) except NameError: # If we are using an older version of the reference workflow, use the same region file as the genome sample - return custom_bed if custom_bed else default_bed + this_bed = custom_bed if custom_bed else default_bed + return this_bed ##### RULES ##### diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk index 7ee497cb..e52fdcc8 100644 --- a/modules/manta/2.3/manta.smk +++ b/modules/manta/2.3/manta.smk @@ -87,16 +87,18 @@ rule _manta_index_bed: """) def _manta_get_capspace(wildcards): - + CFG = config["lcr-modules"]["manta"] # If this is a genome sample, return a BED file listing all chromosomes if wildcards.seq_type != "capture": - return str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") try: # Get the appropriate capture space for this sample - return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + this_bed = reference_files(this_bed) except NameError: # If we are using an older version of the reference workflow, use the same region file as the genome sample - return str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + return this_bed # Configures the manta workflow with the input BAM files and reference FASTA file. rule _manta_configure_paired: diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index 0bfdc14e..48ad27f0 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -135,13 +135,13 @@ def _mutect2_get_interval_cli_arg( def _mutect_get_capspace(wildcards): - + CFG = config["lcr-modules"]["mutect2"] # If this isn't a capture sample, we don't have a capture space, so return nothing if wildcards.seq_type != "capture": return [] try: # Get the appropriate capture space for this sample - cap_space = get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list") + cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list") return " -L " + cap_space except NameError: # If we are using an older version of the reference workflow, we don't need to do anything diff --git a/modules/sage/1.0/sage.smk b/modules/sage/1.0/sage.smk index cbb69a38..f274b8b0 100644 --- a/modules/sage/1.0/sage.smk +++ b/modules/sage/1.0/sage.smk @@ -122,16 +122,18 @@ def get_chromosomes(wildcards): return chromosomes def _sage_get_capspace(wildcards): - + CFG = config["lcr-modules"]["sage"] # If this is a genome sample, return a BED file listing all chromosomes if wildcards.seq_type != "capture": - return rules._download_sage_references.output.panel_bed + this_bed = rules._download_sage_references.output.panel_bed try: # Get the appropriate capture space for this sample - return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + this_bed = reference_files(this_bed) except NameError: # If we are using an older version of the reference workflow, use the same region file as the genome sample - return rules._download_sage_references.output.panel_bed + this_bed = rules._download_sage_references.output.panel_bed + return this_bed # Variant calling rule rule _run_sage: diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk index 439d17ae..db8b9465 100755 --- a/modules/strelka/1.1/strelka.smk +++ b/modules/strelka/1.1/strelka.smk @@ -120,16 +120,19 @@ def _strelka_get_indel_cli_arg(vcf_in = config["lcr-modules"]["strelka"]["inputs return _strelka_get_indel_cli_custom def _strelka_get_capspace(wildcards): - + CFG = config["lcr-modules"]["strelka"] # If this is a genome sample, return a BED file listing all chromosomes if wildcards.seq_type != "capture": - return str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") try: # Get the appropriate capture space for this sample - return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") + this_bed = reference_files(this_bed) except NameError: # If we are using an older version of the reference workflow, use the same region file as the genome sample - return str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + + return this_bed rule _strelka_configure_paired: # Somatic From 511f4d4efe735cb48f19bdbcece0f58a606e9f9a Mon Sep 17 00:00:00 2001 From: Kdreval Date: Tue, 11 Jan 2022 06:05:30 -0800 Subject: [PATCH 21/38] add new column to demo sample table --- demo/data/samples.tsv | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/demo/data/samples.tsv b/demo/data/samples.tsv index 158f6972..5e419226 100755 --- a/demo/data/samples.tsv +++ b/demo/data/samples.tsv @@ -1,6 +1,6 @@ -sample_id seq_type patient_id tissue_status genome_build strand read_length -TCRBOA7-N-WEX capture TCRBOA7 normal grch37 positive 100 -TCRBOA7-T-WEX capture TCRBOA7 tumour grch37 positive 100 -TCRBOA7-T-RNA mrna TCRBOA7 tumour grch37 positive 100 -TCRBOA7-N-WGS genome TCRBOA7 normal grch37 positive 100 -TCRBOA7-T-WGS genome TCRBOA7 tumour grch37 positive 100 +sample_id seq_type patient_id tissue_status genome_build strand read_length capture_space +TCRBOA7-N-WEX capture TCRBOA7 normal grch37 positive 100 none +TCRBOA7-T-WEX capture TCRBOA7 tumour grch37 positive 100 none +TCRBOA7-T-RNA mrna TCRBOA7 tumour grch37 positive 100 none +TCRBOA7-N-WGS genome TCRBOA7 normal grch37 positive 100 none +TCRBOA7-T-WGS genome TCRBOA7 tumour grch37 positive 100 none From 163a80b73536454beffa1698a90ce6edbe9341ad Mon Sep 17 00:00:00 2001 From: Kdreval Date: Tue, 11 Jan 2022 09:10:20 -0800 Subject: [PATCH 22/38] add missing env files --- workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml | 1 + workflows/reference_files/2.4/envs/tabix-0.2.6.yaml | 1 + 2 files changed, 2 insertions(+) create mode 120000 workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml create mode 120000 workflows/reference_files/2.4/envs/tabix-0.2.6.yaml diff --git a/workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml b/workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml new file mode 120000 index 00000000..c185b12a --- /dev/null +++ b/workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml @@ -0,0 +1 @@ +../../../../envs/bedtools/bedtools-2.29.2.yaml \ No newline at end of file diff --git a/workflows/reference_files/2.4/envs/tabix-0.2.6.yaml b/workflows/reference_files/2.4/envs/tabix-0.2.6.yaml new file mode 120000 index 00000000..bcf351f2 --- /dev/null +++ b/workflows/reference_files/2.4/envs/tabix-0.2.6.yaml @@ -0,0 +1 @@ +../../../../envs/tabix/tabix-0.2.6.yaml \ No newline at end of file From b62b84b69a39b2f5b260c567a4a1d62898fb9050 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Tue, 11 Jan 2022 14:30:27 -0800 Subject: [PATCH 23/38] revert erroneous error in mutect2 --- modules/mutect2/2.0/mutect2.smk | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index 48ad27f0..603990ee 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -123,11 +123,12 @@ rule _mutect2_get_sm: # The first function loads the wildcard-containing file path and additional args from the config. # The second replaces wildcards with those used in the rule. def _mutect2_get_interval_cli_arg( - vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"] + vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"], + interval_arg_in = config["lcr-modules"]["mutect2"]["options"]["mutect2_interval_rules"] ): def _mutect2_get_interval_cli_custom(wildcards, input): if vcf_in: - param = f"-L {input.candidate_positions}" + param = f"-L {input.candidate_positions} {interval_arg_in}" else: param = "" return param @@ -142,7 +143,8 @@ def _mutect_get_capspace(wildcards): try: # Get the appropriate capture space for this sample cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list") - return " -L " + cap_space + cap_space = reference_files(cap_space) + return cap_space except NameError: # If we are using an older version of the reference workflow, we don't need to do anything return [] @@ -186,7 +188,7 @@ rule _mutect2_run_matched_unmatched: -I {input.tumour_bam} -I {input.normal_bam} -R {input.fasta} -normal "$(cat {input.normal_sm})" -O {output.vcf} --germline-resource {input.gnomad} - -L {wildcards.chrom} {params.interval_arg} {input.capture_arg} + -L {wildcards.chrom} {params.interval_arg} -L {input.capture_arg} -pon {input.pon} --f1r2-tar-gz {output.f1r2} > {log.stdout} 2> {log.stderr} """) From 9cb922c81d77696fbeb3289ff53ecef52414d01c Mon Sep 17 00:00:00 2001 From: Kdreval Date: Tue, 11 Jan 2022 18:56:03 -0800 Subject: [PATCH 24/38] reorder if blocks --- modules/lofreq/1.1/lofreq.smk | 6 +++--- modules/manta/2.3/manta.smk | 6 +++--- modules/mutect2/2.0/mutect2.smk | 13 +++++++------ modules/sage/1.0/sage.smk | 6 +++--- modules/strelka/1.1/strelka.smk | 6 +++--- 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk index 8f120b93..895cc662 100644 --- a/modules/lofreq/1.1/lofreq.smk +++ b/modules/lofreq/1.1/lofreq.smk @@ -65,9 +65,6 @@ def _lofreq_get_capspace(wildcards): custom_bed = CFG['switches']['regions_bed'][wildcards.seq_type] else: custom_bed = default_bed - # If this is a genome sample, return a BED file listing all chromosomes - if wildcards.seq_type != "capture": - return custom_bed if custom_bed else default_bed try: if "tumour_id" in wildcards.keys(): # Get the appropriate capture space for this sample @@ -78,6 +75,9 @@ def _lofreq_get_capspace(wildcards): except NameError: # If we are using an older version of the reference workflow, use the same region file as the genome sample this_bed = custom_bed if custom_bed else default_bed + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + return custom_bed if custom_bed else default_bed return this_bed ##### RULES ##### diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk index e52fdcc8..7a009701 100644 --- a/modules/manta/2.3/manta.smk +++ b/modules/manta/2.3/manta.smk @@ -88,9 +88,6 @@ rule _manta_index_bed: def _manta_get_capspace(wildcards): CFG = config["lcr-modules"]["manta"] - # If this is a genome sample, return a BED file listing all chromosomes - if wildcards.seq_type != "capture": - this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") try: # Get the appropriate capture space for this sample this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") @@ -98,6 +95,9 @@ def _manta_get_capspace(wildcards): except NameError: # If we are using an older version of the reference workflow, use the same region file as the genome sample this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") return this_bed # Configures the manta workflow with the input BAM files and reference FASTA file. diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index 603990ee..d5fff996 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -137,17 +137,18 @@ def _mutect2_get_interval_cli_arg( def _mutect_get_capspace(wildcards): CFG = config["lcr-modules"]["mutect2"] - # If this isn't a capture sample, we don't have a capture space, so return nothing - if wildcards.seq_type != "capture": - return [] try: # Get the appropriate capture space for this sample cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list") cap_space = reference_files(cap_space) - return cap_space + this_space = cap_space + # If we are using an older version of the reference workflow, we don't need to do anything except NameError: - # If we are using an older version of the reference workflow, we don't need to do anything - return [] + this_space = [] + # If this isn't a capture sample, we don't have a capture space, so return nothing + if wildcards.seq_type != "capture": + this_space = [] + return this_space # Launces Mutect2 in matched and unmatched mode diff --git a/modules/sage/1.0/sage.smk b/modules/sage/1.0/sage.smk index f274b8b0..3b101096 100644 --- a/modules/sage/1.0/sage.smk +++ b/modules/sage/1.0/sage.smk @@ -123,9 +123,6 @@ def get_chromosomes(wildcards): def _sage_get_capspace(wildcards): CFG = config["lcr-modules"]["sage"] - # If this is a genome sample, return a BED file listing all chromosomes - if wildcards.seq_type != "capture": - this_bed = rules._download_sage_references.output.panel_bed try: # Get the appropriate capture space for this sample this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") @@ -133,6 +130,9 @@ def _sage_get_capspace(wildcards): except NameError: # If we are using an older version of the reference workflow, use the same region file as the genome sample this_bed = rules._download_sage_references.output.panel_bed + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + this_bed = rules._download_sage_references.output.panel_bed return this_bed # Variant calling rule diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk index db8b9465..c3cbf117 100755 --- a/modules/strelka/1.1/strelka.smk +++ b/modules/strelka/1.1/strelka.smk @@ -121,9 +121,6 @@ def _strelka_get_indel_cli_arg(vcf_in = config["lcr-modules"]["strelka"]["inputs def _strelka_get_capspace(wildcards): CFG = config["lcr-modules"]["strelka"] - # If this is a genome sample, return a BED file listing all chromosomes - if wildcards.seq_type != "capture": - this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") try: # Get the appropriate capture space for this sample this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz") @@ -131,6 +128,9 @@ def _strelka_get_capspace(wildcards): except NameError: # If we are using an older version of the reference workflow, use the same region file as the genome sample this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") + # If this is a genome sample, return a BED file listing all chromosomes + if wildcards.seq_type != "capture": + this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz") return this_bed From a22a1b87773ba600fb37b71ae1c7e9e883d07697 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Tue, 11 Jan 2022 20:38:23 -0800 Subject: [PATCH 25/38] remove temp fix --- modules/starfish/2.0/starfish.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/starfish/2.0/starfish.smk b/modules/starfish/2.0/starfish.smk index f63d6988..d3213e5a 100644 --- a/modules/starfish/2.0/starfish.smk +++ b/modules/starfish/2.0/starfish.smk @@ -112,7 +112,7 @@ rule _starfish_run: CFG["dirs"]["inputs"] + "vcf/{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}.{caller}.vcf.gz.tbi", caller = callers ), - reference = lambda w: ancient(reference_files("genomes/{genome_build}/sdf")) if not w.genome_build in ["hg38-panea", "hg38-nci"] else ancient(reference_files("genomes/hg38/sdf")), + reference = ancient(reference_files("genomes/{genome_build}/sdf")), starfish_script = CFG["inputs"]["starfish_script"] output: complete = touch(CFG["dirs"]["starfish"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/starfish.complete"), From d5899ef638fd1d10e7e6e8ebf610ba5e26df8ee3 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Wed, 12 Jan 2022 22:07:40 -0800 Subject: [PATCH 26/38] bump oncopipe version --- oncopipe/oncopipe/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oncopipe/oncopipe/__version__.py b/oncopipe/oncopipe/__version__.py index a5673be8..8c861b0f 100644 --- a/oncopipe/oncopipe/__version__.py +++ b/oncopipe/oncopipe/__version__.py @@ -6,7 +6,7 @@ __title__ = "oncopipe" __description__ = "Functions for running Snakemake modules" -__version__ = "1.0.11" +__version__ = "1.0.12" __author__ = "Bruno Grande" __author_email__ = "bgrande@sfu.ca" __license__ = "MIT" From 68fefd12d60d3209b7067a37107c922939ff855d Mon Sep 17 00:00:00 2001 From: Kdreval Date: Wed, 12 Jan 2022 22:09:54 -0800 Subject: [PATCH 27/38] expose oncopipe version --- oncopipe/oncopipe/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/oncopipe/oncopipe/__init__.py b/oncopipe/oncopipe/__init__.py index fe3d828f..c6360904 100755 --- a/oncopipe/oncopipe/__init__.py +++ b/oncopipe/oncopipe/__init__.py @@ -9,6 +9,7 @@ import collections.abc from datetime import datetime from collections import defaultdict, namedtuple +from .__version__ import __version__ import yaml import pandas as pd From 57c66b3747bd24f1bdb45da5c686bba2c193e00d Mon Sep 17 00:00:00 2001 From: Kdreval Date: Wed, 12 Jan 2022 22:12:56 -0800 Subject: [PATCH 28/38] increase min oncopipe version in slms3 submodules --- modules/lofreq/1.1/lofreq.smk | 2 +- modules/manta/2.3/manta.smk | 2 +- modules/mutect2/2.0/mutect2.smk | 2 +- modules/sage/1.0/sage.smk | 2 +- modules/strelka/1.1/strelka.smk | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk index 895cc662..4a8e57c2 100644 --- a/modules/lofreq/1.1/lofreq.smk +++ b/modules/lofreq/1.1/lofreq.smk @@ -15,7 +15,7 @@ import oncopipe as op # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe -min_oncopipe_version="1.0.11" +min_oncopipe_version="1.0.12" import pkg_resources try: from packaging import version diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk index 7a009701..3a84e077 100644 --- a/modules/manta/2.3/manta.smk +++ b/modules/manta/2.3/manta.smk @@ -15,7 +15,7 @@ import oncopipe as op # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe -min_oncopipe_version="1.0.11" +min_oncopipe_version="1.0.12" import pkg_resources try: from packaging import version diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index d5fff996..10c00294 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -17,7 +17,7 @@ import oncopipe as op import inspect # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe -min_oncopipe_version="1.0.11" +min_oncopipe_version="1.0.12" import pkg_resources try: from packaging import version diff --git a/modules/sage/1.0/sage.smk b/modules/sage/1.0/sage.smk index 3b101096..d48343b7 100644 --- a/modules/sage/1.0/sage.smk +++ b/modules/sage/1.0/sage.smk @@ -16,7 +16,7 @@ import oncopipe as op # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe -min_oncopipe_version="1.0.11" +min_oncopipe_version="1.0.12" import pkg_resources try: from packaging import version diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk index c3cbf117..089fa085 100755 --- a/modules/strelka/1.1/strelka.smk +++ b/modules/strelka/1.1/strelka.smk @@ -16,7 +16,7 @@ import oncopipe as op # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe -min_oncopipe_version="1.0.11" +min_oncopipe_version="1.0.12" import pkg_resources try: from packaging import version From 5af516ab62d8470928e3b21322cb761db9819db8 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Wed, 12 Jan 2022 22:16:55 -0800 Subject: [PATCH 29/38] harmonize mutect2 no_normal rule --- modules/mutect2/2.0/mutect2.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index 10c00294..78fac489 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -230,7 +230,7 @@ rule _mutect2_run_no_normal: gatk Mutect2 --java-options "-Xmx{params.mem_mb}m" {params.opts} -I {input.tumour_bam} -R {input.fasta} -O {output.vcf} --germline-resource {input.gnomad} - -L {wildcards.chrom} {params.interval_arg} {input.capture_arg} + -L {wildcards.chrom} {params.interval_arg} -L {input.capture_arg} -pon {input.pon} --f1r2-tar-gz {output.f1r2} > {log.stdout} 2> {log.stderr} """) From 7bfe4f6c5346fe7bbaa63efada27709f854c6ab0 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Mon, 17 Jan 2022 19:57:34 -0800 Subject: [PATCH 30/38] make capture_space column not required --- oncopipe/oncopipe/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/oncopipe/oncopipe/__init__.py b/oncopipe/oncopipe/__init__.py index 83ed041d..f4afd253 100755 --- a/oncopipe/oncopipe/__init__.py +++ b/oncopipe/oncopipe/__init__.py @@ -1837,7 +1837,11 @@ def get_capture_space(module_config, sample_id, genome_build, seq_type, return_e if len(this_sample) != 1: raise AssertionError("Found %s matches when examining the sample table for pair \'%s\' \'%s\' \'%s\'" % (len(sample), sample_id, genome_build, seq_type)) - panel = this_sample.iloc[0]['capture_space'] + + if "capture_space" in this_sample.columns: + panel = this_sample.iloc[0]['capture_space'] + else: + panel = "none" # If this panel is "none" (aka not specified) use the default for this reference genome if panel.upper() in (name.upper() for name in ['none', "na", "n/a", ""]): From 3a3ddc3850eb8757cf334794e1c5939a9e697efe Mon Sep 17 00:00:00 2001 From: Kdreval Date: Mon, 17 Jan 2022 19:58:57 -0800 Subject: [PATCH 31/38] revert sample table --- demo/data/samples.tsv | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/demo/data/samples.tsv b/demo/data/samples.tsv index 5e419226..158f6972 100755 --- a/demo/data/samples.tsv +++ b/demo/data/samples.tsv @@ -1,6 +1,6 @@ -sample_id seq_type patient_id tissue_status genome_build strand read_length capture_space -TCRBOA7-N-WEX capture TCRBOA7 normal grch37 positive 100 none -TCRBOA7-T-WEX capture TCRBOA7 tumour grch37 positive 100 none -TCRBOA7-T-RNA mrna TCRBOA7 tumour grch37 positive 100 none -TCRBOA7-N-WGS genome TCRBOA7 normal grch37 positive 100 none -TCRBOA7-T-WGS genome TCRBOA7 tumour grch37 positive 100 none +sample_id seq_type patient_id tissue_status genome_build strand read_length +TCRBOA7-N-WEX capture TCRBOA7 normal grch37 positive 100 +TCRBOA7-T-WEX capture TCRBOA7 tumour grch37 positive 100 +TCRBOA7-T-RNA mrna TCRBOA7 tumour grch37 positive 100 +TCRBOA7-N-WGS genome TCRBOA7 normal grch37 positive 100 +TCRBOA7-T-WGS genome TCRBOA7 tumour grch37 positive 100 From bbe0c4b521504d7902a3368f0232d85b7e6f3da6 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Wed, 19 Jan 2022 13:40:46 -0800 Subject: [PATCH 32/38] mutect2 run on chrs with candidate positions only --- modules/mutect2/2.0/mutect2.smk | 58 +++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index 78fac489..4377bf4a 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -15,6 +15,7 @@ # Import package with useful functions for developing analysis modules import oncopipe as op import inspect +import pandas as pd # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe min_oncopipe_version="1.0.12" @@ -74,7 +75,20 @@ localrules: ##### RULES ##### - +def _mutect_get_capspace(wildcards): + CFG = config["lcr-modules"]["mutect2"] + try: + # Get the appropriate capture space for this sample + cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list") + cap_space = reference_files(cap_space) + this_space = cap_space + # If we are using an older version of the reference workflow, we don't need to do anything + except NameError: + this_space = [] + # If this isn't a capture sample, we don't have a capture space, so return nothing + if wildcards.seq_type != "capture": + this_space = [] + return this_space # Symlinks the input files into the module results directory (under '00-inputs/') rule _mutect2_input_bam: @@ -97,11 +111,31 @@ rule _mutect2_dummy_positions: # Symlink chromosomes used for parallelization checkpoint _mutect2_input_chrs: input: - chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt") + candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output), + chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt"), + capture_arg = _mutect_get_capspace output: - chrs = CFG["dirs"]["inputs"] + "chroms/{genome_build}/main_chromosomes.txt" + chrs = CFG["dirs"]["inputs"] + "chroms/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/mutated_chromosomes.txt" run: - op.absolute_symlink(input.chrs, output.chrs) + # obtain list of main chromosomes + main_chrs = pd.read_csv(input.chrs, comment='#', sep='\t', header=None) + main_chrs = main_chrs.iloc[:, 0].astype(str).unique().tolist() + #obtain list of chromosomes in candidate positions + candidate_chrs = pd.read_csv(input.candidate_positions, comment='#', sep='\t') + candidate_chrs = candidate_chrs.iloc[:, 0].astype(str).unique().tolist() + # obtain list of chromosomes in the capture space + interval_chrs = pd.read_csv(input.capture_arg, comment='@', sep='\t') + interval_chrs = interval_chrs.iloc[:, 0].astype(str).unique().tolist() + # as there is no capture space for genomes and to handle chr prefixed/non prefixed cases consistently, + # return this list as main chromosomes + if not interval_chrs: # if list is empty + interval_chrs = main_chrs + # intersect the three lists to obtain chromosomes present in all + intersect_chrs = list(set(main_chrs) & set(candidate_chrs) & set(interval_chrs)) + # convert list to single-column df + intersect_chrs = pd.DataFrame(intersect_chrs).sort_values(0) + # write out the file with mutated chromosomes + intersect_chrs.to_csv(output.chrs, index=False, header=False) # Retrieves from SM tag from BAM and writes to file @@ -135,22 +169,6 @@ def _mutect2_get_interval_cli_arg( return _mutect2_get_interval_cli_custom -def _mutect_get_capspace(wildcards): - CFG = config["lcr-modules"]["mutect2"] - try: - # Get the appropriate capture space for this sample - cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list") - cap_space = reference_files(cap_space) - this_space = cap_space - # If we are using an older version of the reference workflow, we don't need to do anything - except NameError: - this_space = [] - # If this isn't a capture sample, we don't have a capture space, so return nothing - if wildcards.seq_type != "capture": - this_space = [] - return this_space - - # Launces Mutect2 in matched and unmatched mode rule _mutect2_run_matched_unmatched: input: From 869302f8c98701a7ac5e67f0c61521f1faa92da2 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Wed, 19 Jan 2022 19:54:10 -0800 Subject: [PATCH 33/38] ensure genome compatibility of new updates --- modules/mutect2/2.0/mutect2.smk | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index 4377bf4a..cdf0c5af 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -84,10 +84,10 @@ def _mutect_get_capspace(wildcards): this_space = cap_space # If we are using an older version of the reference workflow, we don't need to do anything except NameError: - this_space = [] + this_space = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed") # If this isn't a capture sample, we don't have a capture space, so return nothing if wildcards.seq_type != "capture": - this_space = [] + this_space = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed") return this_space # Symlinks the input files into the module results directory (under '00-inputs/') @@ -126,10 +126,6 @@ checkpoint _mutect2_input_chrs: # obtain list of chromosomes in the capture space interval_chrs = pd.read_csv(input.capture_arg, comment='@', sep='\t') interval_chrs = interval_chrs.iloc[:, 0].astype(str).unique().tolist() - # as there is no capture space for genomes and to handle chr prefixed/non prefixed cases consistently, - # return this list as main chromosomes - if not interval_chrs: # if list is empty - interval_chrs = main_chrs # intersect the three lists to obtain chromosomes present in all intersect_chrs = list(set(main_chrs) & set(candidate_chrs) & set(interval_chrs)) # convert list to single-column df From 260d53d6cf2fc7fb78e975f50fa7d64e60edfe57 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Wed, 19 Jan 2022 19:55:53 -0800 Subject: [PATCH 34/38] allow to specify local bed file for capture --- workflows/reference_files/2.4/reference_files.smk | 8 ++++++-- .../reference_files/2.4/reference_files_header.smk | 10 ++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/workflows/reference_files/2.4/reference_files.smk b/workflows/reference_files/2.4/reference_files.smk index 7962f0d2..f3e3a831 100644 --- a/workflows/reference_files/2.4/reference_files.smk +++ b/workflows/reference_files/2.4/reference_files.smk @@ -474,6 +474,7 @@ rule sort_and_pad_capspace: capture_bed = rules.get_capspace_bed_download.output.capture_bed, fai = rules.index_genome_fasta.output.fai output: + intermediate_bed = temp("genomes/{genome_build}/capture_space/{capture_space}.intermediate.bed"), padded_bed = "genomes/{genome_build}/capture_space/{capture_space}.padded.bed" params: padding_size = config['capture_params']['padding_size'] # Default to 200. I would be warry of changing @@ -481,8 +482,11 @@ rule sort_and_pad_capspace: "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.log" conda: CONDA_ENVS["bedtools"] shell: - "bedtools slop -b {params.padding_size} -i {input.capture_bed} -g {input.fai} | bedtools sort | bedtools merge > {output.padded_bed} 2> {log}" - + op.as_one_line(""" + cat {input.fai} | cut -f 1-2 | perl -ane 'print "$F[0]\\t0\\t$F[1]\\n"' | bedtools intersect -wa -a {input.capture_bed} -b stdin > {output.intermediate_bed} + && + bedtools slop -b {params.padding_size} -i {output.intermediate_bed} -g {input.fai} | bedtools sort | bedtools merge > {output.padded_bed} 2> {log} + """) rule check_capspace_contigs: input: diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk index 9c1581d3..20479bd6 100644 --- a/workflows/reference_files/2.4/reference_files_header.smk +++ b/workflows/reference_files/2.4/reference_files_header.smk @@ -530,10 +530,16 @@ rule download_capspace_bed: log: "downloads/capture_space/{capture_space}.{genome_build}.bed.log" params: - url = lambda w: config["capture_space"][w.capture_space]["capture_bed_url"], + path = lambda w: config["capture_space"][w.capture_space]["capture_bed_file"] if "capture_bed_file" in config["capture_space"][w.capture_space] else config["capture_space"][w.capture_space]["capture_bed_url"], provider = lambda w: config["capture_space"][w.capture_space]["provider"] shell: - "curl -L {params.url} > {output.capture_bed} 2> {log}" + op.as_one_line(""" + if [ -e {params.path} ]; then + cat {params.path} > {output.capture_bed} 2> {log}; + else + curl -L {params.path} > {output.capture_bed} 2> {log}; + fi + """) rule add_remove_chr_prefix_bed: input: From d8cc389b934f8f0c17d001ed19cecc611f2d8c53 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Wed, 19 Jan 2022 20:05:01 -0800 Subject: [PATCH 35/38] add instructions on adding custom capture space --- workflows/reference_files/2.4/config/default.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/workflows/reference_files/2.4/config/default.yaml b/workflows/reference_files/2.4/config/default.yaml index a3232434..1ad75c50 100644 --- a/workflows/reference_files/2.4/config/default.yaml +++ b/workflows/reference_files/2.4/config/default.yaml @@ -74,6 +74,12 @@ capture_space: provider: "ensembl" default: "true" capture_bed_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/capture_bed/grch37_all_genes.canonical.sort.bed4" + # to add custom capture space, follow this example and fill in dictionary keys + #name_of_capture_panel: + #genome: "grch37" or "hg38" as example + #provider: "ensembl" or "ucsc" + #capture_bed_url: here provide a link enclosed in "" to download bed file for capture panel if it is available in internet + #capture_bed_file: this key is optional and in "" you can specify path to a local bed file with capture panel capture_params: padding_size: "200" From cfdcf96847c172259c7b6198329d5fd4d800565d Mon Sep 17 00:00:00 2001 From: Kdreval Date: Fri, 21 Jan 2022 10:52:36 -0800 Subject: [PATCH 36/38] revert name of checkpoint output file --- modules/mutect2/2.0/mutect2.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index cdf0c5af..f939c0ef 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -115,7 +115,7 @@ checkpoint _mutect2_input_chrs: chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt"), capture_arg = _mutect_get_capspace output: - chrs = CFG["dirs"]["inputs"] + "chroms/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/mutated_chromosomes.txt" + chrs = CFG["dirs"]["inputs"] + "chroms/{genome_build}/main_chromosomes.txt" run: # obtain list of main chromosomes main_chrs = pd.read_csv(input.chrs, comment='#', sep='\t', header=None) From 689b5ee42887f4e044bd8c183c576c34ad90e297 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Fri, 21 Jan 2022 12:56:06 -0800 Subject: [PATCH 37/38] revert the revert back in mutect2 checkpoint --- modules/mutect2/2.0/mutect2.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk index f939c0ef..cdf0c5af 100644 --- a/modules/mutect2/2.0/mutect2.smk +++ b/modules/mutect2/2.0/mutect2.smk @@ -115,7 +115,7 @@ checkpoint _mutect2_input_chrs: chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt"), capture_arg = _mutect_get_capspace output: - chrs = CFG["dirs"]["inputs"] + "chroms/{genome_build}/main_chromosomes.txt" + chrs = CFG["dirs"]["inputs"] + "chroms/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/mutated_chromosomes.txt" run: # obtain list of main chromosomes main_chrs = pd.read_csv(input.chrs, comment='#', sep='\t', header=None) From 1a2b62e5987b02dc13df415b18801214bca2fd12 Mon Sep 17 00:00:00 2001 From: Kdreval Date: Fri, 21 Jan 2022 12:57:09 -0800 Subject: [PATCH 38/38] allow using local fasta files in reference_files --- .../2.4/reference_files_header.smk | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk index 20479bd6..d2326396 100644 --- a/workflows/reference_files/2.4/reference_files_header.smk +++ b/workflows/reference_files/2.4/reference_files_header.smk @@ -173,16 +173,22 @@ for chrom_map_file in CHROM_MAPPINGS_FILES: rule download_genome_fasta: - output: + output: fasta = "downloads/genome_fasta/{genome_build}.fa" - log: + log: "downloads/genome_fasta/{genome_build}.fa.log" wildcard_constraints: genome_build = ".+(? {output.fasta} 2> {log}" + op.as_one_line(""" + if [ -e {params.path} ]; then + cat {params.path} > {output.fasta} 2> {log}; + else + curl -L {params.path} > {output.fasta} 2> {log}; + fi + """) rule download_masked_genome_fasta: output: