From ecaafe5980e50025131d599996a2a3106ea69720 Mon Sep 17 00:00:00 2001
From: ckrushton <ckrushto@sfu.ca>
Date: Thu, 2 Sep 2021 13:24:32 -0700
Subject: [PATCH 01/38] Added support for capture space in reference workflow
 and slms-3

- Added a capture_space section to the reference config. This lists the name, download URL, and genome build version for that capture space
- Every capture space is padded, sorted, merged, and the contig names updated for every genome_build of that version (ex. GRCh38, hg38)
- Modified strelka, mutect, manta, sage, and lofreq to obtain the capture space if necessary. If the sample is not "capture", the original implemetation is used
---
 modules/lofreq/1.1/lofreq.smk                 |  25 ++-
 modules/manta/2.3/manta.smk                   |  17 +-
 modules/mutect2/2.0/mutect2.smk               |  15 +-
 modules/sage/1.0/sage.smk                     |  20 ++-
 modules/strelka/1.1/strelka.smk               |  16 +-
 .../reference_files/2.4/config/default.yaml   |  25 +++
 .../reference_files/2.4/reference_files.smk   | 158 +++++++++++++++++-
 .../2.4/reference_files_header.smk            |  74 ++++++++
 8 files changed, 332 insertions(+), 18 deletions(-)

diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk
index 471590e5..01be0e37 100644
--- a/modules/lofreq/1.1/lofreq.smk
+++ b/modules/lofreq/1.1/lofreq.smk
@@ -47,8 +47,6 @@ SCRIPT_PATH = CFG['inputs']['src_dir']
 bed = str(reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed"))
 CFG['switches']['regions_bed']['_default'] = bed
 
-
-
 # Define rules to be run locally when using a compute cluster
 localrules:
     _lofreq_input_bam,
@@ -56,6 +54,17 @@ localrules:
     _lofreq_all,
     _lofreq_link_to_preprocessed
 
+def _lofreq_get_capspace(sample_id, wildcards):
+
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        return CFG["switches"]["regions_bed"]
+    try:
+        # Get the appropriate capture space for this sample
+        return get_capture_space(sample_id, wildcards.genome_build, wildcards.seq_type, "bed")
+    except NameError:
+        # If we are using an older version of the reference workflow, use the same region file as the genome sample 
+        return CFG["switches"]["regions_bed"]
 
 ##### RULES #####
 
@@ -83,7 +92,6 @@ rule _lofreq_preprocess_normal:
         normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
         fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
         dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect
-        bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"])
     output:
         out_dir = directory(CFG["dirs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/"),
         preprocessing_start = CFG["dirs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/preprocessing.started",
@@ -97,7 +105,8 @@ rule _lofreq_preprocess_normal:
         stdout = CFG["logs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/lofreq_pre.stdout.log",
         stderr = CFG["logs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/lofreq_pre.stderr.log"
     params:
-        opts = CFG["options"]["lofreq"]
+        opts = CFG["options"]["lofreq"],
+        bed = lambda w: _lofreq_get_capspace(w.normal_id, w)
     conda:
         CFG["conda_envs"]["lofreq"]
     threads:
@@ -114,7 +123,7 @@ rule _lofreq_preprocess_normal:
             touch {output.preprocessing_start}
             && 
             lofreq somatic --normal_only {params.opts} --threads {threads} -t {input.normal_bam} -n {input.normal_bam}
-            -f {input.fasta} -o {output.out_dir}/ -d {input.dbsnp} --bed {input.bed}
+            -f {input.fasta} -o {output.out_dir}/ -d {input.dbsnp} --bed {params.bed}
             > {log.stdout} 2> {log.stderr} && 
             touch {output.preprocessing_complete};
         else echo "WARNING: PATH is not set properly, using $(which lofreq2_call_pparallel.py)"; fi
@@ -162,7 +171,6 @@ rule _lofreq_run_tumour:
         vcf_relaxed = rules._lofreq_link_to_preprocessed.output.vcf_relaxed,
         fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
         dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect
-        bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"])
     output:
         vcf_snvs_filtered = CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.snvs.vcf.gz",
         vcf_indels_filtered = CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.indels.vcf.gz",
@@ -172,7 +180,8 @@ rule _lofreq_run_tumour:
         stdout = CFG["logs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/lofreq.stdout.log",
         stderr = CFG["logs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/lofreq.stderr.log"
     params:
-        opts = CFG["options"]["lofreq"]
+        opts = CFG["options"]["lofreq"],
+        bed = lambda w: _lofreq_get_capspace(w.tumour_id, w)
     conda:
         CFG["conda_envs"]["lofreq"]
     threads:
@@ -187,7 +196,7 @@ rule _lofreq_run_tumour:
         if [[ $(which lofreq2_call_pparallel.py) =~ $SCRIPT ]]; then 
             echo "using bundled patched script $SCRIPT";
             lofreq somatic --continue {params.opts} --threads {threads} -t {input.tumour_bam} -n {input.normal_bam}
-            -f {input.fasta} -o $(dirname {output.vcf_snvs_filtered})/ -d {input.dbsnp} --bed {input.bed}
+            -f {input.fasta} -o $(dirname {output.vcf_snvs_filtered})/ -d {input.dbsnp} --bed {params.bed}
             > {log.stdout} 2> {log.stderr};
         else echo "WARNING: PATH is not set properly, using $(which lofreq2_call_pparallel.py)"; fi
         """)
diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk
index e1ff97f6..562489fd 100644
--- a/modules/manta/2.3/manta.smk
+++ b/modules/manta/2.3/manta.smk
@@ -83,7 +83,8 @@ rule _manta_configure_paired:
         stderr = CFG["logs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/manta_configure.stderr.log"
     params:
         opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"]),
-        tumour_bam_arg_name = op.switch_on_wildcard("seq_type", CFG["switches"]["tumour_bam_arg_name"])
+        tumour_bam_arg_name = op.switch_on_wildcard("seq_type", CFG["switches"]["tumour_bam_arg_name"]),
+        bedz = lambda w:_manta_get_capspace(w)
     wildcard_constraints:
         pair_status = "matched|unmatched"
     conda:
@@ -110,7 +111,8 @@ rule _manta_configure_unpaired:
         stderr = CFG["logs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/manta_configure.stderr.log"
     params:
         opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"]),
-        tumour_bam_arg_name = op.switch_on_wildcard("seq_type", CFG["switches"]["tumour_bam_arg_name"])
+        tumour_bam_arg_name = op.switch_on_wildcard("seq_type", CFG["switches"]["tumour_bam_arg_name"]),
+        bedz = lambda w: _manta_get_capspace(w)
     wildcard_constraints:
         pair_status = "no_normal"
     conda:
@@ -272,6 +274,17 @@ def _manta_predict_output(wildcards):
 
     return outputs_with_bedpe + outputs_without_bedpe
 
+def _manta_get_capspace(wildcards):
+
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        return reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes.bed.gz")
+    try:
+        # Get the appropriate capture space for this sample
+        return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
+    except NameError:
+        # If we are using an older version of the reference workflow, use the same region file as the genome sample
+        return reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes.bed.gz")
 
 # Generates the target symlinks for each run depending on the Manta output VCF files
 rule _manta_dispatch:
diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index 9d43eb36..3bf5de7e 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -137,7 +137,7 @@ rule _mutect2_run_matched_unmatched:
     params:
         mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), 
         opts = CFG["options"]["mutect2_run"], 
-        interval_arg = _mutect2_get_interval_cli_arg()
+        interval_arg = lambda w: _mutect_get_capspace(w)
     conda:
         CFG["conda_envs"]["gatk"]
     threads:
@@ -178,7 +178,7 @@ rule _mutect2_run_no_normal:
     params:
         mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8),
         opts = CFG["options"]["mutect2_run"], 
-        interval_arg = _mutect2_get_interval_cli_arg 
+        interval_arg = lambda w: _mutect_get_capspace(w) 
     conda:
         CFG["conda_envs"]["gatk"]
     threads:
@@ -207,6 +207,17 @@ def _mutect2_get_chr_vcfs(wildcards):
     )
     return(vcfs)
 
+def _mutect_get_capspace(wildcards):
+
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        return _mutect2_get_interval_cli_args()
+    try:
+        # Get the appropriate capture space for this sample
+        return " -L " + get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list")
+    except NameError:
+        # If we are using an older version of the reference workflow, use the same region file as the genome sample
+        return _mutect2_get_interval_cli_args()
 
 def _mutect2_get_chr_tbis(wildcards):
     CFG = config["lcr-modules"]["mutect2"]
diff --git a/modules/sage/1.0/sage.smk b/modules/sage/1.0/sage.smk
index 2204cda7..ee088041 100644
--- a/modules/sage/1.0/sage.smk
+++ b/modules/sage/1.0/sage.smk
@@ -119,14 +119,25 @@ def get_chromosomes(wildcards):
     chromosomes= ",".join(chromosomes)    
     return chromosomes
 
+def _sage_get_capspace(wildcards):
+
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        return rules._download_sage_references.output.panel_bed
+    try:
+        # Get the appropriate capture space for this sample
+        return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
+    except NameError:
+        # If we are using an older version of the reference workflow, use the same region file as the genome sample
+        return rules._download_sage_references.output.panel_bed
+
 # Variant calling rule
 rule _run_sage:
     input:
         tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam",
         normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
         fasta = str(rules._input_references.output.genome_fa),
-        hotspots = str(rules._download_sage_references.output.hotspots),
-        panel_bed = str(rules._download_sage_references.output.panel_bed),
+        hotspots = rules._download_sage_references.output.hotspots,
         high_conf_bed = str(rules._download_sage_references.output.high_conf_bed)
     output:
         vcf = temp(CFG["dirs"]["sage"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.vcf"),
@@ -139,7 +150,8 @@ rule _run_sage:
         chromosomes = get_chromosomes,
         assembly = lambda w: "hg38" if "38" in str({w.genome_build}) else "hg19",
         sage= "$(dirname $(readlink -e $(which SAGE)))/sage.jar",
-        jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8)
+        jvmheap = lambda wildcards, resources: int(resources.mem_mb * 0.8),
+        panel_bed = lambda w: _sage_get_capspace(w)
     conda:
         CFG["conda_envs"]["sage"]
     threads:
@@ -162,7 +174,7 @@ rule _run_sage:
         -assembly {params.assembly}
         -ref_genome {input.fasta}
         -hotspots {input.hotspots}
-        -panel_bed {input.panel_bed}
+        -panel_bed {params.panel_bed}
         -high_confidence_bed {input.high_conf_bed}
         -out {output.vcf}
         >>  {log.stdout} 2>> {log.stderr}
diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk
index d7beac0a..fe8c429f 100755
--- a/modules/strelka/1.1/strelka.smk
+++ b/modules/strelka/1.1/strelka.smk
@@ -118,6 +118,7 @@ rule _strelka_configure_paired: # Somatic
     params:
         indel_arg = _strelka_get_indel_cli_arg(),
         opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"]),
+        bedz = lambda w: _strelka_get_capspace(w)
     wildcard_constraints:
         pair_status = "matched|unmatched"
     conda:
@@ -147,7 +148,8 @@ rule _strelka_configure_unpaired: # germline
         stdout = CFG["logs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/strelka_configure.stdout.log",
         stderr = CFG["logs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/strelka_configure.stderr.log"
     params:
-        opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"])
+        opts = op.switch_on_wildcard("seq_type", CFG["options"]["configure"]),
+        bedz = lambda w: _strelka_get_capspace(w)
     message:
         "WARNING: {wildcards.seq_type} sample ({wildcards.tumour_id}) is being processed using Strelka Germline workflow. Ensure pairing config for capture is set to run_unpaired_tumours_with: 'unmatched_normal' to run Strelka Somatic workflow"
     wildcard_constraints:
@@ -256,6 +258,18 @@ def _strelka_get_output(wildcards):
         vcf = str(rules._strelka_filter_combine.output.vcf)
     return vcf
 
+def _strelka_get_capspace(wildcards):
+
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        return reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes.bed.gz")
+    try:
+        # Get the appropriate capture space for this sample
+        return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
+    except NameError:
+        # If we are using an older version of the reference workflow, use the same region file as the genome sample
+        return reference_files("genomes/" + wildcards.genome_build + "/genome_fasta/main_chromosomes.bed.gz")
+
 # Symlinks the final output files into the module results directory (under '99-outputs/'). Links will always use "combined" in the name (dropping odd naming convention used by Strelka in unpaired mode)
 rule _strelka_output_filtered_vcf:
     input:
diff --git a/workflows/reference_files/2.4/config/default.yaml b/workflows/reference_files/2.4/config/default.yaml
index 589bd908..7e6a5d26 100644
--- a/workflows/reference_files/2.4/config/default.yaml
+++ b/workflows/reference_files/2.4/config/default.yaml
@@ -27,6 +27,31 @@ genome_builds:
         version: "grch37"
         provider: "ensembl"
         genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/hs37d5.fa"
+    hg19-reddy:
+        # Version of hg19 with chrM at the start, and chrM with a length of 16569bp. Used for the Reddy dataset
+        version: "grch37"
+        provider: "ucsc"
+        genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/hg19-reddy.fa"
+    grch38-nci:
+        # NCI's version of GRCh38, with a costco-sized flat pack of decoys
+        version: "grch38"
+        provider: "ucsc"
+        genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/grch38-nci.fa"
+
+capture_space:
+    exome-utr-grch38:
+        genome: "grch38"
+        provider: "ucsc"
+        default: "true"
+        capture_bed_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/capture_bed/grch38_all_genes.canonical.sort.bed4"
+    exome-utr-grch37:
+        genome: "grch37"
+        provider: "ensembl"
+        default: "true"
+        capture_bed_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/capture_bed/grch37_all_genes.canonical.sort.bed4"
+
+capture_params:
+    padding_size: "200"
 
 wildcard_values:
     gencode_release: ["33"]
diff --git a/workflows/reference_files/2.4/reference_files.smk b/workflows/reference_files/2.4/reference_files.smk
index fe4a1d44..1745fb96 100644
--- a/workflows/reference_files/2.4/reference_files.smk
+++ b/workflows/reference_files/2.4/reference_files.smk
@@ -97,7 +97,7 @@ rule get_sdf_refs:
     output: 
         sdf = directory("genomes/{genome_build}/sdf")
     wildcard_constraints: 
-        genome_build = "hg38|hg19|grch37|hs37d5"
+        genome_build = "hg38|hg19|grch37|hs37d5|hg19-reddy"
     shell: 
         "ln -srfT {input.sdf} {output.sdf}"
 
@@ -381,3 +381,159 @@ rule get_mutect2_small_exac:
             &&
         tabix {output.vcf}
         """)
+
+
+### FOR HANDLING CAPTURE SPACE/TARGETED SEQUENCING DATA ###
+# Added by Chris
+# What this *should* do (if I have written these rules correctly) is obtain a capture space BED file
+# check the contig names against the reference (and replace them as necessary), pad, sort,merge, and generate a bgzip/tabix
+# indexed version of the BED as well as an interval list.
+
+def _check_capspace_provider(w):
+    # Checks to determine if both the capture space BED and associated reference genome
+    # are chr prefixed or not
+
+    # If this is specified as the "default", make sure we obtain the relevent capture space
+    default_key = "default-" + w.genome_build
+    if default_key not in config["capture_space"] and w.capture_space == default_key:
+        # aka if the user hasn't explicitly specified a default using a default name
+        capture_space = _get_default_capspace(w)
+    else:
+        capture_space = w.capture_space
+
+    genome_provider = config["genome_builds"][w.genome_build]["provider"]
+    bed_provider = config["capture_space"][capture_space]["provider"]
+    
+    # If the providers match (i.e. they share the same prefix), just use the downloaded version
+    if genome_provider == bed_provider:
+        return {'bed': expand(rules.download_capspace_bed.output.capture_bed, capture_space=capture_space, genome_build=w.genome_build)}
+    else:
+        # Prompt the prefix to be converted
+        chr_status = "chr" if genome_provider == "ucsc" else "no_chr"
+        return {'bed': expand(rules.add_remove_chr_prefix_bed.output.converted_bed, capture_space = capture_space, genome_build = w.genome_build, chr_status= chr_status)}
+
+
+def get_capture_space(sample_id, genome_build, seq_type, return_ext):
+    """
+    Obtain the corresponding capture space file (gz, interval list etc) for the specified sample
+    """
+
+    # Get the corresponding capture space for a given sample
+    sample_table = CFG["samples"]
+    sample = sample_table.loc[(sample_table['sample_id'] == sample_id) &
+            (sample_table['genome_build'] == genome_build) &
+            (sample_table['seq_type'] == seq_type)]
+    if len(sample) != 1:
+        raise AssertionError("Found %s matches when examining the sample table for pair \'%s\' \'%s\' \'%s\'" % (len(sample), sample_id, genome_build, seq_type))
+    panel = sample.iloc[0]['capture_space']
+
+    # If this panel is "none" (aka not specified) use the default for this reference genome
+    if panel == "none":
+        # Get the provider for this version of the reference genome
+        genome_version = None
+        for version, builds in GENOME_VERSION_GROUPS.items():
+            if genome_build in builds:
+                assert genome_version is None  # Failsafe, but this should never happen, as the reference workflow will not tolerate duplicates (I think)
+                genome_version = version
+        if genome_version is None:
+            raise AttributeError("Unable to find the corresponding genome version (ex. GRCh37) for build \'%s\'" % genome_build)
+        
+        try:
+            panel = DEFAULT_CAPSPACE[genome_version]
+        except KeyError as e:
+            raise AttributeError("No default capture space was specified for genome version \'%s\'. You can specify a default by setting \'default=\'true\'\' in a \'%s\'-based capture space in the reference config" % (genome_version, genome_version)) from e
+
+    # Now that we have found the corresponding capture region for this sample, obtain the requested file
+    return reference_files("genomes/" + genome_build + "/capture_space/" + panel + ".padded." + return_ext)
+
+
+rule get_capspace_bed_download:
+    input:
+        unpack(_check_capspace_provider)
+    output:
+        capture_bed = "genomes/{genome_build}/capture_space/{capture_space}.bed"
+    conda: CONDA_ENVS["coreutils"]
+    shell:
+        "ln -srf {input.bed} {output.capture_bed}"
+
+
+rule sort_and_pad_capspace:
+    input:
+       capture_bed = rules.get_capspace_bed_download.output.capture_bed,
+       fai = rules.index_genome_fasta.output.fai
+    output:
+        padded_bed = "genomes/{genome_build}/capture_space/{capture_space}.padded.bed"
+    params:
+        padding_size = config['capture_params']['padding_size']  # Default to 200. I would be warry of changing
+    log:
+        "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.log"
+    conda: CONDA_ENVS["bedtools"]
+    shell:
+        "bedtools slop -b {params.padding_size} -i {input.capture_bed} -g {input.fai} | bedtools sort | bedtools merge > {output.padded_bed} 2> {log}"
+
+
+rule check_capspace_contigs:
+    input:
+        bed = rules.get_capspace_bed_download.output.capture_bed,
+        fai = rules.index_genome_fasta.output.fai
+    output:
+        contig_log = "genomes/{genome_build}/capture_space/{capture_space}.check_contigs.log"
+    run:
+        # Parse BED contigs
+        bed_contigs = set()
+        with open(input.bed) as f:
+            for line in f:
+                line = line.rstrip()
+                contig = line.split("\t")[0]
+                if contig not in bed_contigs:
+                    bed_contigs.add(contig)
+
+        # Parse fai contigs
+        fai_contigs = set()
+        with open(input.fai) as f:
+            for line in f:
+                line = line.rstrip()
+                contig = line.split('\t')[0]
+                if contig not in fai_contigs:
+                    fai_contigs.add(contig)
+
+        # Check the BED file for contigs that are not in the reference genome
+        missing_contigs = list(x for x in bed_contigs if x not in fai_contigs)
+        with open(output.contig_log, "w") as o:
+            if len(missing_contigs) == 0:
+                o.write("No contigs missing from reference")
+            else:
+                o.write("The following contigs were missing from the reference\n")
+                o.write("\n".join(missing_contigs))
+            o.write("\n")
+
+
+rule compress_index_capspace_bed:
+    input:
+        capture_bed = rules.sort_and_pad_capspace.output.padded_bed
+    output:
+        bgzip_bed = "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.gz",
+        tabix = "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.gz.tbi"
+    log:
+        "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.gz.log"
+    conda: CONDA_ENVS["tabix"]
+    shell:
+        op.as_one_line("""
+        bgzip -c {input.capture_bed} > {output.bgzip_bed}
+            &&
+        tabix -p bed {output.bgzip_bed}
+        """)
+
+
+rule create_interval_list:
+    input:
+        bed = rules.sort_and_pad_capspace.output.padded_bed,
+        sd = rules.create_gatk_dict.output.dict
+    output:
+        interval_list = "genomes/{genome_build}/capture_space/{capture_space}.padded.interval_list"
+    log:
+        "genomes/{genome_build}/capture_space/{capture_space}.padded.interval_list.log"
+    conda: CONDA_ENVS["gatk"]
+    shell:
+        "gatk BedToIntervalList --INPUT {input.bed} -SD {input.sd} -O {output.interval_list} > {log} 2>&1"
+
diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk
index 11d6ffa1..9bf892ad 100644
--- a/workflows/reference_files/2.4/reference_files_header.smk
+++ b/workflows/reference_files/2.4/reference_files_header.smk
@@ -44,6 +44,12 @@ VERSION_UPPER = {
     "grch38": "GRCh38",
 }
 
+GENOME_VERSION_GROUPS = {}
+for genome_build in VERSION_UPPER.keys():
+    GENOME_VERSION_GROUPS[genome_build] = []
+
+DEFAULT_CAPSPACE = {}
+
 # Check genome build versions, providers, and genome_fasta
 possible_versions = list(VERSION_UPPER.keys())
 possible_providers = ["ensembl", "ucsc", "gencode", "ncbi"]
@@ -61,6 +67,29 @@ for build_name, build_info in config["genome_builds"].items():
         f"(rather than 200): \n{build_info['genome_fasta_url']}"
     )    
 
+# Check parent genome, provider for the capture space
+for build_name, build_info in config["capture_space"].items():
+    assert "provider" in build_info and build_info["provider"] in possible_providers, (
+        f"`provider` not set for `{build_name}` or `provider` not among {possible_providers}."
+        )
+    assert "genome" in build_info and build_info["genome"] in possible_versions,(
+        f"`genome` not set for `{build_name}` or `genome` not among {possible_versions}." )
+    assert "capture_bed_url" in build_info
+    url_code = urllib.request.urlopen(build_info["capture_bed_url"]).getcode()
+    assert url_code == 200, (
+         f"Pinging `capture_bed_url` for {build_name} returned HTTP code {url_code} "
+         f"(rather than 200): \n{build_info['capture_bed_url']}"
+        )
+    if "default" in build_info:
+        assert build_info["default"].lower() in ["true", "false"], (
+            f"true/false required for for \'default\' field"
+            )
+        build_version = build_info["genome"]
+        if build_version in DEFAULT_CAPSPACE:
+            # i.e. a default has already been specified for this genome version!
+            raise AttributeError("For reference genome version \'%s\', both \'%s\' and \'%s\' were specified as default capture spaces in the reference config" % (build_version, DEFAULT_CAPSPACE[build_version], build_name))
+        DEFAULT_CAPSPACE[build_info["genome"]] = build_name
+
 
 ##### TOOLS #####
 
@@ -464,6 +493,51 @@ def get_download_file(file):
     return get_download_file_custom
 
 
+### CAPTURE SPACE ###
+rule download_capspace_bed:
+    output:
+        capture_bed = "downloads/capture_space/{capture_space}.{genome_build}.bed"
+    log:
+        "downloads/capture_space/{capture_space}.{genome_build}.bed.log"
+    params:
+        url = lambda w: config["capture_space"][w.capture_space]["capture_bed_url"],
+        provider = lambda w: config["capture_space"][w.capture_space]["provider"]
+    shell:
+        "curl -L {params.url} > {output.capture_bed} 2> {log}"
+
+rule add_remove_chr_prefix_bed:
+    input:
+        capture_bed = rules.download_capspace_bed.output.capture_bed
+    output:
+        converted_bed = "downloads/capture_space/{capture_space}.{genome_build}.{chr_status}.bed"
+    run:
+        # Converts the specified BED file and adds/removes chr prefixes
+        if wildcards.chr_status == "chr":  # i.e. we need to add a chr prefix
+            add_chr = True
+        else:
+            add_chr = False
+
+        # Process the BED file
+        with open(input.capture_bed) as f, open(output.converted_bed, "w") as o:
+            i = 0
+            for line in f:
+                i += 1
+                # Make sure that this BED entry is chr-prefixed or not
+                if add_chr and line.startswith("chr"):
+                    # We were asked to add a chr prefix, but one already exists
+                    raise AttributeError("I was asked to add a \'chr\' prefix to \'%s\', but that BED is already \'chr\' prefixed on line %s" % (input.capture_bed, line))
+                if not add_chr and not line.startswith("chr"):
+                    # We were asked to remove a chr prefix, but there isn't one
+                    raise AttributeError("I was asked to remove a \'chr\' prefix \'%s\', but it isn't chr prefixed on line %s" % (input.capture_bed, line))
+
+                if add_chr:
+                    line = "chr" + line
+                else:
+                    line = line.replace("chr", "")
+
+                o.write(line)
+
+
 ##### SHARED #####
 
 

From e2828f8b0d2de106a710ee9cc6ecec99b8623a41 Mon Sep 17 00:00:00 2001
From: ckrushton <ckrushto@sfu.ca>
Date: Tue, 7 Sep 2021 09:52:59 -0700
Subject: [PATCH 02/38] Fixed bugs introduced during rebase, modified vcf2maf

vcf2maf now works with custom reference genomes
---
 modules/manta/2.3/manta.smk                   |  6 ++---
 modules/mutect2/2.0/mutect2.smk               | 27 ++++++++++---------
 modules/strelka/1.1/strelka.smk               |  6 ++---
 modules/vcf2maf/1.2/vcf2maf.smk               |  4 +--
 .../reference_files/2.4/config/default.yaml   |  6 +++++
 .../2.4/reference_files_header.smk            | 13 ++++++---
 6 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk
index 562489fd..7ba36487 100644
--- a/modules/manta/2.3/manta.smk
+++ b/modules/manta/2.3/manta.smk
@@ -75,7 +75,6 @@ rule _manta_configure_paired:
         normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
         fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
         config = op.switch_on_wildcard("seq_type", CFG["switches"]["manta_config"]),
-        bedz = str(rules._manta_index_bed.output.bedz)
     output:
         runwf = CFG["dirs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py"
     log:
@@ -91,7 +90,7 @@ rule _manta_configure_paired:
         CFG["conda_envs"]["manta"]
     shell:
         op.as_one_line("""
-        configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {input.bedz}
+        configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {params.bedz}
         --runDir "$(dirname {output.runwf})" {params.tumour_bam_arg_name} {input.tumour_bam}
         --normalBam {input.normal_bam} --config {input.config} > {log.stdout} 2> {log.stderr}
         """)
@@ -103,7 +102,6 @@ rule _manta_configure_unpaired:
         tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam",
         fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
         config = op.switch_on_wildcard("seq_type", CFG["switches"]["manta_config"]),
-        bedz = str(rules._manta_index_bed.output.bedz)
     output:
         runwf = CFG["dirs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py"
     log:
@@ -119,7 +117,7 @@ rule _manta_configure_unpaired:
         CFG["conda_envs"]["manta"]
     shell:
         op.as_one_line("""
-        configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {input.bedz}
+        configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {params.bedz}
         --runDir "$(dirname {output.runwf})" {params.tumour_bam_arg_name} {input.tumour_bam}
         --config {input.config} > {log.stdout} 2> {log.stderr}
         """)
diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index 3bf5de7e..94c940b7 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -102,12 +102,12 @@ rule _mutect2_get_sm:
 # The second replaces wildcards with those used in the rule. 
 def _mutect2_get_interval_cli_arg(
     vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"], 
-    interval_arg_in = config["lcr-modules"]["mutect2"]["options"]["mutect2_interval_rules"]
+    #interval_arg_in = config["lcr-modules"]["mutect2"]["options"]["mutect2_interval_rules"]
 ):
     def _mutect2_get_interval_cli_custom(wildcards, input):
         if vcf_in:
-            param = f"-L {input.candidate_positions} {interval_arg_in}"
-        else: 
+            param = f"-L {input.candidate_positions}"
+        else:
             param = ""
         return param
     return _mutect2_get_interval_cli_custom
@@ -137,7 +137,8 @@ rule _mutect2_run_matched_unmatched:
     params:
         mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8), 
         opts = CFG["options"]["mutect2_run"], 
-        interval_arg = lambda w: _mutect_get_capspace(w)
+        interval_arg = _mutect2_get_interval_cli_arg(),
+        capture_arg = lambda w: _mutect_get_capspace(w)
     conda:
         CFG["conda_envs"]["gatk"]
     threads:
@@ -150,7 +151,7 @@ rule _mutect2_run_matched_unmatched:
         -I {input.tumour_bam} -I {input.normal_bam}
         -R {input.fasta} -normal "$(cat {input.normal_sm})" -O {output.vcf}
         --germline-resource {input.gnomad} 
-        -L {wildcards.chrom} {params.interval_arg} 
+        -L {wildcards.chrom} {params.interval_arg} {params.capture_arg}
         -pon {input.pon} --f1r2-tar-gz {output.f1r2}
         > {log.stdout} 2> {log.stderr}
         """)
@@ -178,7 +179,8 @@ rule _mutect2_run_no_normal:
     params:
         mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8),
         opts = CFG["options"]["mutect2_run"], 
-        interval_arg = lambda w: _mutect_get_capspace(w) 
+        interval_arg = _mutect2_get_interval_cli_arg(),
+        capture_arg = lambda w: _mutect_get_capspace(w)
     conda:
         CFG["conda_envs"]["gatk"]
     threads:
@@ -190,7 +192,7 @@ rule _mutect2_run_no_normal:
         gatk Mutect2 --java-options "-Xmx{params.mem_mb}m" 
         {params.opts} -I {input.tumour_bam} -R {input.fasta} 
         -O {output.vcf} --germline-resource {input.gnomad} 
-        -L {wildcards.chrom} {params.interval_arg}
+        -L {wildcards.chrom} {params.interval_arg} {param.capture_arg}
         -pon {input.pon} --f1r2-tar-gz {output.f1r2}
         > {log.stdout} 2> {log.stderr}
         """)
@@ -209,15 +211,16 @@ def _mutect2_get_chr_vcfs(wildcards):
 
 def _mutect_get_capspace(wildcards):
 
-    # If this is a genome sample, return a BED file listing all chromosomes
+    # If this isn't a capture sample, we don't have a capture space, so return nothing
     if wildcards.seq_type != "capture":
-        return _mutect2_get_interval_cli_args()
+        return ""
     try:
         # Get the appropriate capture space for this sample
-        return " -L " + get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list")
+        cap_space = get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list")
+        return " -L " + cap_space
     except NameError:
-        # If we are using an older version of the reference workflow, use the same region file as the genome sample
-        return _mutect2_get_interval_cli_args()
+        # If we are using an older version of the reference workflow, we don't need to do anything
+        return ""
 
 def _mutect2_get_chr_tbis(wildcards):
     CFG = config["lcr-modules"]["mutect2"]
diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk
index fe8c429f..d07bc04f 100755
--- a/modules/strelka/1.1/strelka.smk
+++ b/modules/strelka/1.1/strelka.smk
@@ -108,7 +108,6 @@ rule _strelka_configure_paired: # Somatic
         tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam",
         normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
         fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
-        bedz = str(rules._strelka_index_bed.output.bedz),
         indels = str(rules._strelka_input_vcf.output.vcf) if CFG["inputs"]["candidate_small_indels"] else str(rules._strelka_dummy_vcf.output)
     output:
         runwf = CFG["dirs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py"
@@ -129,7 +128,7 @@ rule _strelka_configure_paired: # Somatic
         --normalBam={input.normal_bam}
         --tumorBam={input.tumour_bam}
         --referenceFasta={input.fasta}
-       --callRegions={input.bedz}
+       --callRegions={params.bedz}
         --runDir=$(dirname {output.runwf})
         {params.indel_arg}
         {params.opts} 
@@ -141,7 +140,6 @@ rule _strelka_configure_unpaired: # germline
     input:
         tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam",
         fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
-        bedz = str(rules._strelka_index_bed.output.bedz)
     output:
         runwf = CFG["dirs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py"
     log:
@@ -161,7 +159,7 @@ rule _strelka_configure_unpaired: # germline
         configureStrelkaGermlineWorkflow.py 
         --bam={input.tumour_bam}
         --referenceFasta={input.fasta}
-        --callRegions={input.bedz}
+        --callRegions={params.bedz}
         --runDir=$(dirname {output.runwf})
         {params.opts} 
         > {log.stdout} 2> {log.stderr}
diff --git a/modules/vcf2maf/1.2/vcf2maf.smk b/modules/vcf2maf/1.2/vcf2maf.smk
index d53c840b..ff352329 100644
--- a/modules/vcf2maf/1.2/vcf2maf.smk
+++ b/modules/vcf2maf/1.2/vcf2maf.smk
@@ -69,7 +69,7 @@ rule _vcf2maf_run:
         stderr = CFG["logs"]["vcf2maf"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{base_name}_vcf2maf.stderr.log",
     params:
         opts = CFG["options"]["vcf2maf"],
-        build = lambda w: VERSION_MAP[w.genome_build],
+        build = lambda w: GENOME_VERSION_MAP[w.genome_build],
         custom_enst = op.switch_on_wildcard("genome_build", CFG["switches"]["custom_enst"])
     conda:
         CFG["conda_envs"]["vcf2maf"]
@@ -161,4 +161,4 @@ rule _vcf2maf_all:
 
 # Perform some clean-up tasks, including storing the module-specific
 # configuration on disk and deleting the `CFG` variable
-op.cleanup_module(CFG)
\ No newline at end of file
+op.cleanup_module(CFG)
diff --git a/workflows/reference_files/2.4/config/default.yaml b/workflows/reference_files/2.4/config/default.yaml
index 7e6a5d26..db023ba9 100644
--- a/workflows/reference_files/2.4/config/default.yaml
+++ b/workflows/reference_files/2.4/config/default.yaml
@@ -62,6 +62,9 @@ wildcard_values:
     rm_version: ["hg19", "hg38"]
 
 tools:
+    bedtools:
+        conda_env: "envs/bedtools-2.29.2.yaml"
+        version: "2.29.2"
     coreutils: 
         conda_env: "envs/coreutils-8.31.yaml"
         version: "8.31"
@@ -71,6 +74,9 @@ tools:
     samtools: 
         conda_env: "envs/samtools-1.9.yaml"
         version: "1.9"
+    tabix:
+        conda_env: "envs/tabix-0.2.6.yaml"
+        version: "0.2.6"
     bwa: 
         conda_env: "envs/bwa-0.7.17.yaml"
         version: "0.7.17"
diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk
index 9bf892ad..a966be7a 100644
--- a/workflows/reference_files/2.4/reference_files_header.smk
+++ b/workflows/reference_files/2.4/reference_files_header.smk
@@ -45,6 +45,7 @@ VERSION_UPPER = {
 }
 
 GENOME_VERSION_GROUPS = {}
+GENOME_VERSION_MAP = {}
 for genome_build in VERSION_UPPER.keys():
     GENOME_VERSION_GROUPS[genome_build] = []
 
@@ -57,6 +58,9 @@ for build_name, build_info in config["genome_builds"].items():
     assert "version" in build_info and build_info["version"] in possible_versions, (
         f"`version` not set for `{build_name}` or `version` not among {possible_versions}."
     )
+    GENOME_VERSION_GROUPS[build_info["version"]].append(build_name)
+    upper_genome_name = VERSION_UPPER[build_info["version"]]
+    GENOME_VERSION_MAP[build_name] = upper_genome_name
     assert "provider" in build_info and build_info["provider"] in possible_providers, (
         f"`provider` not set for `{build_name}` or `provider` not among {possible_providers}."
     )
@@ -394,10 +398,11 @@ def get_matching_download_rules(file):
         # At least one output file should produce
         num_matches = []
         for output_file in r.output:
-            assert "{version}" in output_file, (
-                f"The `{rule_name}` download rule doesn't have a `{{version}}` "
-                f"wildcard in the output file ('{output_file}')."
-            )
+            if rule_name != "download_capspace_bed":  # Workaround since we don't really care about the provider for the capture space
+                assert "{version}" in output_file, (
+                    f"The `{rule_name}` download rule doesn't have a `{{version}}` "
+                    f"wildcard in the output file ('{output_file}')."
+                )
             matches = smk.io.glob_wildcards(output_file, [file])
             num_matches.append(len(matches[0]))
         if any(num > 0 for num in num_matches):

From 52d74823eb31e65955f5c2afc3eba4ef548babe0 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Fri, 3 Dec 2021 16:09:06 -0800
Subject: [PATCH 03/38] drop unnecessary change

---
 modules/slms_3/1.0/config/default.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/slms_3/1.0/config/default.yaml b/modules/slms_3/1.0/config/default.yaml
index 718fb8f2..32157c8e 100644
--- a/modules/slms_3/1.0/config/default.yaml
+++ b/modules/slms_3/1.0/config/default.yaml
@@ -110,7 +110,7 @@ lcr-modules:
             scratch_subdirectories: []
 
             options:
-                sage_run: "-validation_stringency LENIENT"
+                sage_run: ""
 
             resources:
                 sage_run:

From eb56fe54ec7c5f479e4948daae70a4fc99456665 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Mon, 6 Dec 2021 12:17:32 -0800
Subject: [PATCH 04/38] allow the widcard switch for custom bed in lofreq

---
 modules/lofreq/1.1/lofreq.smk | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk
index 0a65b358..a6cde654 100644
--- a/modules/lofreq/1.1/lofreq.smk
+++ b/modules/lofreq/1.1/lofreq.smk
@@ -60,15 +60,16 @@ localrules:
 
 def _lofreq_get_capspace(wildcards):
 
+    custom_bed = config["lcr-modules"]["lofreq"]['switches']['regions_bed'][wildcards.seq_type]
     # If this is a genome sample, return a BED file listing all chromosomes
     if wildcards.seq_type != "capture":
-        return str(config["lcr-modules"]["lofreq"]["switches"]["regions_bed"]['_default'])
+        return str(custom_bed) if custom_bed else str(bed)
     try:
         # Get the appropriate capture space for this sample
         return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed")
     except NameError:
         # If we are using an older version of the reference workflow, use the same region file as the genome sample 
-        return str(config["lcr-modules"]["lofreq"]["switches"]["regions_bed"]['_default'])
+        return str(custom_bed) if custom_bed else str(bed)
 
 ##### RULES #####
 

From 79b5286a1edf63f12d2f588b12bc94bea0a242c8 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Mon, 6 Dec 2021 12:21:31 -0800
Subject: [PATCH 05/38] remove commented line used for testing purposes

---
 modules/mutect2/2.0/mutect2.smk | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index 52ff9fcb..0bfdc14e 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -123,8 +123,7 @@ rule _mutect2_get_sm:
 # The first function loads the wildcard-containing file path and additional args from the config. 
 # The second replaces wildcards with those used in the rule. 
 def _mutect2_get_interval_cli_arg(
-    vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"], 
-    #interval_arg_in = config["lcr-modules"]["mutect2"]["options"]["mutect2_interval_rules"]
+    vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"]
 ):
     def _mutect2_get_interval_cli_custom(wildcards, input):
         if vcf_in:

From 57adf033c256b2ec11e3576e67e93ec2e17aa615 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Mon, 6 Dec 2021 12:26:14 -0800
Subject: [PATCH 06/38] rename global variable to be module-specific

---
 modules/vcf2maf/1.2/vcf2maf.smk | 4 ++--
 modules/vcf2maf/1.3/vcf2maf.smk | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/vcf2maf/1.2/vcf2maf.smk b/modules/vcf2maf/1.2/vcf2maf.smk
index 2b357647..6d5e48b0 100644
--- a/modules/vcf2maf/1.2/vcf2maf.smk
+++ b/modules/vcf2maf/1.2/vcf2maf.smk
@@ -51,7 +51,7 @@ localrules:
     _vcf2maf_crossmap,
     _vcf2maf_all
 
-VERSION_MAP = {
+VCF2MAF_GENOME_VERSION_MAP = {
     "grch37": "GRCh37",
     "hg38": "GRCh38",
     "hs37d5": "GRCh37"
@@ -89,7 +89,7 @@ rule _vcf2maf_run:
         stderr = CFG["logs"]["vcf2maf"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{base_name}_vcf2maf.stderr.log",
     params:
         opts = CFG["options"]["vcf2maf"],
-        build = lambda w: GENOME_VERSION_MAP[w.genome_build],
+        build = lambda w: VCF2MAF_GENOME_VERSION_MAP[w.genome_build],
         custom_enst = op.switch_on_wildcard("genome_build", CFG["switches"]["custom_enst"])
     conda:
         CFG["conda_envs"]["vcf2maf"]
diff --git a/modules/vcf2maf/1.3/vcf2maf.smk b/modules/vcf2maf/1.3/vcf2maf.smk
index 17ba2f69..4a38f18d 100644
--- a/modules/vcf2maf/1.3/vcf2maf.smk
+++ b/modules/vcf2maf/1.3/vcf2maf.smk
@@ -31,7 +31,7 @@ localrules:
     _vcf2maf_crossmap,
     _vcf2maf_all
 
-VERSION_MAP = {
+VCF2MAF_GENOME_VERSION_MAP = {
     "grch37": "GRCh37",
     "hg38": "GRCh38",
     "hs37d5": "GRCh37"
@@ -83,7 +83,7 @@ rule _vcf2maf_run:
         stderr = CFG["logs"]["vcf2maf"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{base_name}_vcf2maf.stderr.log",
     params:
         opts = CFG["options"]["vcf2maf"],
-        build = lambda w: VERSION_MAP[w.genome_build],
+        build = lambda w: VCF2MAF_GENOME_VERSION_MAP[w.genome_build],
         custom_enst = op.switch_on_wildcard("genome_build", CFG["switches"]["custom_enst"])
     conda:
         CFG["conda_envs"]["vcf2maf"]

From 7611f23ec214bd7d646e24bd34ab4d9a34b213de Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Mon, 6 Dec 2021 12:50:00 -0800
Subject: [PATCH 07/38] make global wildcard constraints rule-specific

---
 modules/strelka/1.1/strelka.smk | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk
index 7de97e6a..439d17ae 100755
--- a/modules/strelka/1.1/strelka.smk
+++ b/modules/strelka/1.1/strelka.smk
@@ -53,10 +53,7 @@ localrules:
     _strelka_configure_unpaired,
     _strelka_filter_combine,
     _strelka_output_filtered_vcf,
-    _strelka_all,
-
-wildcard_constraints: 
-    var_type = "somatic.snvs|somatic.indels|variants"
+    _strelka_all
 
 ##### RULES #####
 
@@ -266,6 +263,8 @@ rule _strelka_filter_combine:
     log:
         stdout = CFG["logs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/strelka_filter_combine.stdout.log",
         stderr = CFG["logs"]["strelka"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/strelka_filter_combine.stderr.log"
+    wildcard_constraints:
+        var_type = "somatic.snvs|somatic.indels|variants"
     shell:
         op.as_one_line("""
         bcftools concat -a {input.vcf} | 

From 783515cddefdb6c23f361108a5467823a7a447a5 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Mon, 6 Dec 2021 13:48:32 -0800
Subject: [PATCH 08/38] handle edge case when dictionary key is missing

---
 modules/lofreq/1.1/lofreq.smk | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk
index a6cde654..ea92050a 100644
--- a/modules/lofreq/1.1/lofreq.smk
+++ b/modules/lofreq/1.1/lofreq.smk
@@ -60,7 +60,10 @@ localrules:
 
 def _lofreq_get_capspace(wildcards):
 
-    custom_bed = config["lcr-modules"]["lofreq"]['switches']['regions_bed'][wildcards.seq_type]
+    if str(wildcards.seq_type) in config["lcr-modules"]["lofreq"]['switches']['regions_bed'].keys():
+        custom_bed = config["lcr-modules"]["lofreq"]['switches']['regions_bed'][wildcards.seq_type]
+    else:
+        custom_bed = bed
     # If this is a genome sample, return a BED file listing all chromosomes
     if wildcards.seq_type != "capture":
         return str(custom_bed) if custom_bed else str(bed)

From e179f92f4722508048a0060de2ae7d58ab7f9fab Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Tue, 7 Dec 2021 11:41:41 -0800
Subject: [PATCH 09/38] implement fix for empty vcf files from previous version

---
 modules/vcf2maf/1.3/vcf2maf.smk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/vcf2maf/1.3/vcf2maf.smk b/modules/vcf2maf/1.3/vcf2maf.smk
index 4a38f18d..1cf2a63b 100644
--- a/modules/vcf2maf/1.3/vcf2maf.smk
+++ b/modules/vcf2maf/1.3/vcf2maf.smk
@@ -115,7 +115,8 @@ rule _vcf2maf_run:
             --custom-enst {params.custom_enst}
             --retain-info gnomADg_AF
             >> {log.stdout} 2>> {log.stderr};
-        else echo "WARNING: PATH is not set properly, using $(which vcf2maf.pl) will result in error during execution. Please ensure $VCF2MAF_SCRIPT exists." > {log.stderr};fi
+        else echo "WARNING: PATH is not set properly, using $(which vcf2maf.pl) will result in error during execution. Please ensure $VCF2MAF_SCRIPT exists." > {log.stderr};fi  &&
+        touch {output.vep}
         """)
 
 rule _vcf2maf_gnomad_filter_maf:

From 0028ec5eca05915662076cf161d77abfb5ccb4cb Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Tue, 7 Dec 2021 12:46:05 -0800
Subject: [PATCH 10/38] add missing genome build to config

---
 workflows/reference_files/2.4/config/default.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/workflows/reference_files/2.4/config/default.yaml b/workflows/reference_files/2.4/config/default.yaml
index a8fb6fd0..ee4ece6e 100644
--- a/workflows/reference_files/2.4/config/default.yaml
+++ b/workflows/reference_files/2.4/config/default.yaml
@@ -57,6 +57,11 @@ genome_builds:
         version: "grch38"
         provider: "ucsc"
         genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/grch38-nci.fa"
+    hg38-panea:
+        # Version of hg38 used by Panea et al.
+        version: "grch38"
+        provider: "ucsc"
+        genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/hg38-panea.fa"
 
 capture_space:
     exome-utr-grch38:

From 7abd27f9fc65564de7542e31ab7ae4d90d638a9a Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Thu, 30 Dec 2021 08:23:32 -0800
Subject: [PATCH 11/38] handle panea sdf files

---
 modules/starfish/2.0/starfish.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/starfish/2.0/starfish.smk b/modules/starfish/2.0/starfish.smk
index d3213e5a..9ead0453 100644
--- a/modules/starfish/2.0/starfish.smk
+++ b/modules/starfish/2.0/starfish.smk
@@ -112,7 +112,7 @@ rule _starfish_run:
             CFG["dirs"]["inputs"] + "vcf/{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}.{caller}.vcf.gz.tbi", 
             caller = callers
             ),
-        reference = ancient(reference_files("genomes/{genome_build}/sdf")),
+        reference = lambda w: ancient(reference_files("genomes/{genome_build}/sdf")) if not w.genome_build=="hg38-panea" else ancient(reference_files("genomes/hg38/sdf")),
         starfish_script = CFG["inputs"]["starfish_script"]
     output:
         complete = touch(CFG["dirs"]["starfish"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/starfish.complete"), 

From 4b7d69e31c8fa31f1f8203f39cbcad92a947eacc Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Thu, 30 Dec 2021 08:29:40 -0800
Subject: [PATCH 12/38] unwrap reference file from str

---
 modules/lofreq/1.1/lofreq.smk | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk
index ea92050a..ea5eb997 100644
--- a/modules/lofreq/1.1/lofreq.smk
+++ b/modules/lofreq/1.1/lofreq.smk
@@ -59,20 +59,21 @@ localrules:
     _lofreq_link_to_preprocessed
 
 def _lofreq_get_capspace(wildcards):
-
-    if str(wildcards.seq_type) in config["lcr-modules"]["lofreq"]['switches']['regions_bed'].keys():
-        custom_bed = config["lcr-modules"]["lofreq"]['switches']['regions_bed'][wildcards.seq_type]
+    CFG=config["lcr-modules"]["lofreq"]
+    default_bed = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed")
+    if str(wildcards.seq_type) in CFG['switches']['regions_bed'].keys():
+        custom_bed = CFG['switches']['regions_bed'][wildcards.seq_type]
     else:
-        custom_bed = bed
+        custom_bed = default_bed
     # If this is a genome sample, return a BED file listing all chromosomes
     if wildcards.seq_type != "capture":
-        return str(custom_bed) if custom_bed else str(bed)
+        return custom_bed if custom_bed else default_bed
     try:
         # Get the appropriate capture space for this sample
         return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed")
     except NameError:
         # If we are using an older version of the reference workflow, use the same region file as the genome sample 
-        return str(custom_bed) if custom_bed else str(bed)
+        return custom_bed if custom_bed else default_bed
 
 ##### RULES #####
 

From fd3acfe2e365bcfcfd7901ad55ea7da828194677 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Fri, 7 Jan 2022 15:47:48 -0800
Subject: [PATCH 13/38] rename grch38-nci into ng38-nci

---
 modules/starfish/2.0/starfish.smk                 | 2 +-
 workflows/reference_files/2.4/config/default.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/starfish/2.0/starfish.smk b/modules/starfish/2.0/starfish.smk
index 9ead0453..f63d6988 100644
--- a/modules/starfish/2.0/starfish.smk
+++ b/modules/starfish/2.0/starfish.smk
@@ -112,7 +112,7 @@ rule _starfish_run:
             CFG["dirs"]["inputs"] + "vcf/{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}.{caller}.vcf.gz.tbi", 
             caller = callers
             ),
-        reference = lambda w: ancient(reference_files("genomes/{genome_build}/sdf")) if not w.genome_build=="hg38-panea" else ancient(reference_files("genomes/hg38/sdf")),
+        reference = lambda w: ancient(reference_files("genomes/{genome_build}/sdf")) if not w.genome_build in ["hg38-panea", "hg38-nci"] else ancient(reference_files("genomes/hg38/sdf")),
         starfish_script = CFG["inputs"]["starfish_script"]
     output:
         complete = touch(CFG["dirs"]["starfish"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/starfish.complete"), 
diff --git a/workflows/reference_files/2.4/config/default.yaml b/workflows/reference_files/2.4/config/default.yaml
index ee4ece6e..a3232434 100644
--- a/workflows/reference_files/2.4/config/default.yaml
+++ b/workflows/reference_files/2.4/config/default.yaml
@@ -52,7 +52,7 @@ genome_builds:
         version: "grch37"
         provider: "ucsc"
         genome_fasta_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/hg19-reddy.fa"
-    grch38-nci:
+    hg38-nci:
         # NCI's version of GRCh38, with a costco-sized flat pack of decoys
         version: "grch38"
         provider: "ucsc"

From c2fdf033378d8e622a8e6a89c057c06773b00352 Mon Sep 17 00:00:00 2001
From: ckrushton <ckrushto@sfu.ca>
Date: Mon, 10 Jan 2022 10:20:02 -0800
Subject: [PATCH 14/38] Added support for generating SDF files with additional
 genome builds

---
 .../reference_files/2.4/reference_files.smk   |  4 +--
 .../2.4/reference_files_header.smk            | 33 +++++++++++++------
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/workflows/reference_files/2.4/reference_files.smk b/workflows/reference_files/2.4/reference_files.smk
index 1745fb96..e12abc24 100644
--- a/workflows/reference_files/2.4/reference_files.smk
+++ b/workflows/reference_files/2.4/reference_files.smk
@@ -97,7 +97,7 @@ rule get_sdf_refs:
     output: 
         sdf = directory("genomes/{genome_build}/sdf")
     wildcard_constraints: 
-        genome_build = "hg38|hg19|grch37|hs37d5|hg19-reddy"
+        genome_build = "|".join(SDF_GENOME_BUILDS)
     shell: 
         "ln -srfT {input.sdf} {output.sdf}"
 
@@ -428,7 +428,7 @@ def get_capture_space(sample_id, genome_build, seq_type, return_ext):
     panel = sample.iloc[0]['capture_space']
 
     # If this panel is "none" (aka not specified) use the default for this reference genome
-    if panel == "none":
+    if panel == "none" or panel is None or panel == "":
         # Get the provider for this version of the reference genome
         genome_version = None
         for version, builds in GENOME_VERSION_GROUPS.items():
diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk
index a966be7a..a5372e87 100644
--- a/workflows/reference_files/2.4/reference_files_header.smk
+++ b/workflows/reference_files/2.4/reference_files_header.smk
@@ -45,10 +45,20 @@ VERSION_UPPER = {
 }
 
 GENOME_VERSION_GROUPS = {}
-GENOME_VERSION_MAP = {}
+VCF2MAF_GENOME_VERSION_MAP = {}
 for genome_build in VERSION_UPPER.keys():
     GENOME_VERSION_GROUPS[genome_build] = []
 
+# For Starfish SDF files
+SDF_VERSION_MAP = {}
+SDF_GENOME_BUILDS = []
+SDF_IGNORE = {"grch38", "grch38-legacy"}  # Ignore non-chr prefixed versions of hg38 since we don't use them
+sdf_genome_mappings = {
+"GRCh37": {"ensembl": "1000g_v37_phase2.sdf", "ucsc": "hg19.sdf"},
+"GRCh38": {"ucsc": "GRCh38.sdf"}
+}
+
+
 DEFAULT_CAPSPACE = {}
 
 # Check genome build versions, providers, and genome_fasta
@@ -60,8 +70,9 @@ for build_name, build_info in config["genome_builds"].items():
     )
     GENOME_VERSION_GROUPS[build_info["version"]].append(build_name)
     upper_genome_name = VERSION_UPPER[build_info["version"]]
-    GENOME_VERSION_MAP[build_name] = upper_genome_name
-    assert "provider" in build_info and build_info["provider"] in possible_providers, (
+    VCF2MAF_GENOME_VERSION_MAP[build_name] = upper_genome_name
+    genome_provider = build_info["provider"]
+    assert "provider" in build_info and genome_provider in possible_providers, (
         f"`provider` not set for `{build_name}` or `provider` not among {possible_providers}."
     )
     assert "genome_fasta_url" in build_info, f"`genome_fasta_url` not set for `{build_name}`."
@@ -70,6 +81,13 @@ for build_name, build_info in config["genome_builds"].items():
         f"Pinging `genome_fasta_url` for {build_name} returned HTTP code {url_code} "
         f"(rather than 200): \n{build_info['genome_fasta_url']}"
     )    
+    # Find the appropriate SDF file for this genome build
+    if build_name not in SDF_IGNORE:
+        SDF_GENOME_BUILDS.append(build_name)
+        try:
+            SDF_VERSION_MAP[build_name] = sdf_genome_mappings[upper_genome_name][genome_provider]
+        except KeyError as e:
+            raise AttributeError(f"Unable to locate a Starfish SDF file for genome build \'{upper_genome_name}\' and provider \'{genome_provider}\'") from e
 
 # Check parent genome, provider for the capture space
 for build_name, build_info in config["capture_space"].items():
@@ -366,13 +384,8 @@ rule download_liftover_chains:
 rule download_sdf: 
     output: 
         sdf = directory("downloads/sdf/{genome_build}/sdf")
-    params: 
-        build = lambda w: {
-            "grch37": "1000g_v37_phase2.sdf", 
-            "hs37d5": "1000g_v37_phase2.sdf",
-            "hg19": "hg19.sdf", 
-            "hg38": "GRCh38.sdf"
-        }[w.genome_build]
+    params:
+        build = lambda w: SDF_VERSION_MAP[w.genome_build]
     shell: 
         op.as_one_line("""
         wget -qO {output.sdf}.zip https://s3.amazonaws.com/rtg-datasets/references/{params.build}.zip && 

From b956c089dfec111f2b63ba8f42c2c422b9e7eb7d Mon Sep 17 00:00:00 2001
From: ckrushton <ckrushto@sfu.ca>
Date: Mon, 10 Jan 2022 10:20:44 -0800
Subject: [PATCH 15/38] Added support for additional genome builds

---
 modules/vcf2maf/1.2/vcf2maf.smk | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/vcf2maf/1.2/vcf2maf.smk b/modules/vcf2maf/1.2/vcf2maf.smk
index ff352329..9163c5e9 100644
--- a/modules/vcf2maf/1.2/vcf2maf.smk
+++ b/modules/vcf2maf/1.2/vcf2maf.smk
@@ -69,7 +69,7 @@ rule _vcf2maf_run:
         stderr = CFG["logs"]["vcf2maf"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{base_name}_vcf2maf.stderr.log",
     params:
         opts = CFG["options"]["vcf2maf"],
-        build = lambda w: GENOME_VERSION_MAP[w.genome_build],
+        build = lambda w: VCF2MAF_GENOME_VERSION_MAP[w.genome_build],
         custom_enst = op.switch_on_wildcard("genome_build", CFG["switches"]["custom_enst"])
     conda:
         CFG["conda_envs"]["vcf2maf"]
@@ -91,7 +91,8 @@ rule _vcf2maf_run:
         --vep-data {input.vep_cache}
         --vep-path $vepPATH {params.opts}
         --custom-enst {params.custom_enst}
-        > {log.stdout} 2> {log.stderr}
+        > {log.stdout} 2> {log.stderr} &&
+        touch {output.vep}
         """)
 
 

From 5d08414fed199152c77b3554bee5efc55d21fb60 Mon Sep 17 00:00:00 2001
From: ckrushton <ckrushto@sfu.ca>
Date: Mon, 10 Jan 2022 10:51:09 -0800
Subject: [PATCH 16/38] Ignore grch38-masked when generating SDF files

---
 workflows/reference_files/2.4/reference_files_header.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk
index 0f6f0073..74e4c028 100644
--- a/workflows/reference_files/2.4/reference_files_header.smk
+++ b/workflows/reference_files/2.4/reference_files_header.smk
@@ -52,7 +52,7 @@ for genome_build in VERSION_UPPER.keys():
 # For Starfish SDF files
 SDF_VERSION_MAP = {}
 SDF_GENOME_BUILDS = []
-SDF_IGNORE = {"grch38", "grch38-legacy"}  # Ignore non-chr prefixed versions of hg38 since we don't use them
+SDF_IGNORE = {"grch38", "grch38-legacy", "grch38_masked"}  # Ignore non-chr prefixed versions of hg38 since we don't use them
 sdf_genome_mappings = {
 "GRCh37": {"ensembl": "1000g_v37_phase2.sdf", "ucsc": "hg19.sdf"},
 "GRCh38": {"ucsc": "GRCh38.sdf"}

From da52375a76d03c99cbe7f2b208258285ab929c47 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Tue, 11 Jan 2022 05:40:13 -0800
Subject: [PATCH 17/38] drop vcf2maf from variable name

---
 workflows/reference_files/2.4/reference_files_header.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk
index 74e4c028..9c1581d3 100644
--- a/workflows/reference_files/2.4/reference_files_header.smk
+++ b/workflows/reference_files/2.4/reference_files_header.smk
@@ -45,7 +45,7 @@ VERSION_UPPER = {
 }
 
 GENOME_VERSION_GROUPS = {}
-VCF2MAF_GENOME_VERSION_MAP = {}
+GENOME_VERSION_MAP = {}
 for genome_build in VERSION_UPPER.keys():
     GENOME_VERSION_GROUPS[genome_build] = []
 
@@ -70,7 +70,7 @@ for build_name, build_info in config["genome_builds"].items():
     )
     GENOME_VERSION_GROUPS[build_info["version"]].append(build_name)
     upper_genome_name = VERSION_UPPER[build_info["version"]]
-    VCF2MAF_GENOME_VERSION_MAP[build_name] = upper_genome_name
+    GENOME_VERSION_MAP[build_name] = upper_genome_name
     genome_provider = build_info["provider"]
     assert "provider" in build_info and genome_provider in possible_providers, (
         f"`provider` not set for `{build_name}` or `provider` not among {possible_providers}."

From d05536418f3ae2b87e3db7bcb34ae7b99ce5ef81 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Tue, 11 Jan 2022 05:44:02 -0800
Subject: [PATCH 18/38] add capspace function to oncopipe

---
 oncopipe/oncopipe/__init__.py | 49 +++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/oncopipe/oncopipe/__init__.py b/oncopipe/oncopipe/__init__.py
index cc08d043..fe3d828f 100755
--- a/oncopipe/oncopipe/__init__.py
+++ b/oncopipe/oncopipe/__init__.py
@@ -1781,3 +1781,52 @@ def cleanup_module(module_config):
     # Add back the TSV fields
     for field in tsv_fields.keys():
         module_config[field] = tsv_fields[field]
+
+
+# Kostia functions
+def get_capture_space(module_config, sample_id, genome_build, seq_type, return_ext):
+    """Returns path to the file with capspace to be used for  genome build.
+
+    Parameters
+    ----------
+    module_config : dict
+        The module-specific configuration.
+    sample_id : str
+        The id for a specific sample for which capture space should be returned.
+        Allows for both normal and tumour id.
+    genome_build : str
+        The specific genome build for which to return capture space.
+        Allows to unambiguously handle samples aligned to different genome versions.
+    seq_type : str
+        The soecific seq type for which to return capture space.
+    return_ext : str
+        The extension of the capture space file to be returned (.bed, .vcf, .vcf.gz).
+
+    Returns
+    -------
+    str
+        The path to a file relative to the reference files parental directory.
+    """
+
+    # Convenient variable to access sample table
+    module_samples = module_config["samples"]
+
+    this_sample = module_samples.loc[(module_samples['sample_id'] == sample_id) &
+            (module_samples['genome_build'] == genome_build) &
+            (module_samples['seq_type'] == seq_type)]
+
+    if len(this_sample) != 1:
+        raise AssertionError("Found %s matches when examining the sample table for pair \'%s\' \'%s\' \'%s\'" % (len(sample), sample_id, genome_build, seq_type))
+    panel = this_sample.iloc[0]['capture_space']
+
+    # If this panel is "none" (aka not specified) use the default for this reference genome
+    if panel.upper() in (name.upper() for name in ['none', "na", "n/a", ""]):
+        try:
+            if "38" in genome_build:
+                panel = "exome-utr-grch38"
+            else:
+                panel = "exome-utr-grch37"
+        except KeyError as e:
+            raise AttributeError("No default capture space was specified for genome version \'%s\'. You can specify a default by setting \'default=\'true\'\' in a \'%s\'-based capture space in the reference config" % (genome_version, genome_version)) from e
+    # Now that we have found the corresponding capture region for this sample, obtain the requested file
+    return "genomes/" + genome_build + "/capture_space/" + panel + ".padded." + return_ext

From 1f00b34ceb591cd036d0e8dad80d106e9b248e6b Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Tue, 11 Jan 2022 05:46:20 -0800
Subject: [PATCH 19/38] drop redundant function from reference files

---
 .../reference_files/2.4/reference_files.smk   | 34 -------------------
 1 file changed, 34 deletions(-)

diff --git a/workflows/reference_files/2.4/reference_files.smk b/workflows/reference_files/2.4/reference_files.smk
index 39088241..7962f0d2 100644
--- a/workflows/reference_files/2.4/reference_files.smk
+++ b/workflows/reference_files/2.4/reference_files.smk
@@ -459,40 +459,6 @@ def _check_capspace_provider(w):
         return {'bed': expand(rules.add_remove_chr_prefix_bed.output.converted_bed, capture_space = capture_space, genome_build = w.genome_build, chr_status= chr_status)}
 
 
-def get_capture_space(sample_id, genome_build, seq_type, return_ext):
-    """
-    Obtain the corresponding capture space file (gz, interval list etc) for the specified sample
-    """
-
-    # Get the corresponding capture space for a given sample
-    sample_table = CFG["samples"]
-    sample = sample_table.loc[(sample_table['sample_id'] == sample_id) &
-            (sample_table['genome_build'] == genome_build) &
-            (sample_table['seq_type'] == seq_type)]
-    if len(sample) != 1:
-        raise AssertionError("Found %s matches when examining the sample table for pair \'%s\' \'%s\' \'%s\'" % (len(sample), sample_id, genome_build, seq_type))
-    panel = sample.iloc[0]['capture_space']
-
-    # If this panel is "none" (aka not specified) use the default for this reference genome
-    if panel == "none" or panel is None or panel == "":
-        # Get the provider for this version of the reference genome
-        genome_version = None
-        for version, builds in GENOME_VERSION_GROUPS.items():
-            if genome_build in builds:
-                assert genome_version is None  # Failsafe, but this should never happen, as the reference workflow will not tolerate duplicates (I think)
-                genome_version = version
-        if genome_version is None:
-            raise AttributeError("Unable to find the corresponding genome version (ex. GRCh37) for build \'%s\'" % genome_build)
-        
-        try:
-            panel = DEFAULT_CAPSPACE[genome_version]
-        except KeyError as e:
-            raise AttributeError("No default capture space was specified for genome version \'%s\'. You can specify a default by setting \'default=\'true\'\' in a \'%s\'-based capture space in the reference config" % (genome_version, genome_version)) from e
-
-    # Now that we have found the corresponding capture region for this sample, obtain the requested file
-    return reference_files("genomes/" + genome_build + "/capture_space/" + panel + ".padded." + return_ext)
-
-
 rule get_capspace_bed_download:
     input:
         unpack(_check_capspace_provider)

From 9ac63c989975eeb55aedbc198cbd30a3bca355e4 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Tue, 11 Jan 2022 06:02:22 -0800
Subject: [PATCH 20/38] switch modules to oncopipe finction

---
 modules/lofreq/1.1/lofreq.smk   |  9 +++++++--
 modules/manta/2.3/manta.smk     | 10 ++++++----
 modules/mutect2/2.0/mutect2.smk |  4 ++--
 modules/sage/1.0/sage.smk       | 10 ++++++----
 modules/strelka/1.1/strelka.smk | 11 +++++++----
 5 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk
index ea5eb997..8f120b93 100644
--- a/modules/lofreq/1.1/lofreq.smk
+++ b/modules/lofreq/1.1/lofreq.smk
@@ -69,11 +69,16 @@ def _lofreq_get_capspace(wildcards):
     if wildcards.seq_type != "capture":
         return custom_bed if custom_bed else default_bed
     try:
+        if "tumour_id" in wildcards.keys():
         # Get the appropriate capture space for this sample
-        return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed")
+            this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed")
+        else:
+            this_bed = op.get_capture_space(CFG, wildcards.normal_id, wildcards.genome_build, wildcards.seq_type, "bed")
+        this_bed = reference_files(this_bed)
     except NameError:
         # If we are using an older version of the reference workflow, use the same region file as the genome sample 
-        return custom_bed if custom_bed else default_bed
+        this_bed = custom_bed if custom_bed else default_bed
+    return this_bed
 
 ##### RULES #####
 
diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk
index 7ee497cb..e52fdcc8 100644
--- a/modules/manta/2.3/manta.smk
+++ b/modules/manta/2.3/manta.smk
@@ -87,16 +87,18 @@ rule _manta_index_bed:
         """)
 
 def _manta_get_capspace(wildcards):
-
+    CFG = config["lcr-modules"]["manta"]
     # If this is a genome sample, return a BED file listing all chromosomes
     if wildcards.seq_type != "capture":
-        return str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
+        this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
     try:
         # Get the appropriate capture space for this sample
-        return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
+        this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
+        this_bed = reference_files(this_bed)
     except NameError:
         # If we are using an older version of the reference workflow, use the same region file as the genome sample
-        return str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
+        this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
+    return this_bed
 
 # Configures the manta workflow with the input BAM files and reference FASTA file.
 rule _manta_configure_paired:
diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index 0bfdc14e..48ad27f0 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -135,13 +135,13 @@ def _mutect2_get_interval_cli_arg(
 
 
 def _mutect_get_capspace(wildcards):
-
+    CFG = config["lcr-modules"]["mutect2"]
     # If this isn't a capture sample, we don't have a capture space, so return nothing
     if wildcards.seq_type != "capture":
         return []
     try:
         # Get the appropriate capture space for this sample
-        cap_space = get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list")
+        cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list")
         return " -L " + cap_space
     except NameError:
         # If we are using an older version of the reference workflow, we don't need to do anything
diff --git a/modules/sage/1.0/sage.smk b/modules/sage/1.0/sage.smk
index cbb69a38..f274b8b0 100644
--- a/modules/sage/1.0/sage.smk
+++ b/modules/sage/1.0/sage.smk
@@ -122,16 +122,18 @@ def get_chromosomes(wildcards):
     return chromosomes
 
 def _sage_get_capspace(wildcards):
-
+    CFG = config["lcr-modules"]["sage"]
     # If this is a genome sample, return a BED file listing all chromosomes
     if wildcards.seq_type != "capture":
-        return rules._download_sage_references.output.panel_bed
+        this_bed = rules._download_sage_references.output.panel_bed
     try:
         # Get the appropriate capture space for this sample
-        return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
+        this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
+        this_bed = reference_files(this_bed)
     except NameError:
         # If we are using an older version of the reference workflow, use the same region file as the genome sample
-        return rules._download_sage_references.output.panel_bed
+        this_bed = rules._download_sage_references.output.panel_bed
+    return this_bed
 
 # Variant calling rule
 rule _run_sage:
diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk
index 439d17ae..db8b9465 100755
--- a/modules/strelka/1.1/strelka.smk
+++ b/modules/strelka/1.1/strelka.smk
@@ -120,16 +120,19 @@ def _strelka_get_indel_cli_arg(vcf_in = config["lcr-modules"]["strelka"]["inputs
     return _strelka_get_indel_cli_custom
 
 def _strelka_get_capspace(wildcards):
-
+    CFG = config["lcr-modules"]["strelka"]
     # If this is a genome sample, return a BED file listing all chromosomes
     if wildcards.seq_type != "capture":
-        return str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
+        this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
     try:
         # Get the appropriate capture space for this sample
-        return get_capture_space(wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
+        this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
+        this_bed = reference_files(this_bed)
     except NameError:
         # If we are using an older version of the reference workflow, use the same region file as the genome sample
-        return str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
+        this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
+
+    return this_bed
 
 
 rule _strelka_configure_paired: # Somatic

From 511f4d4efe735cb48f19bdbcece0f58a606e9f9a Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Tue, 11 Jan 2022 06:05:30 -0800
Subject: [PATCH 21/38] add new column to demo sample table

---
 demo/data/samples.tsv | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/demo/data/samples.tsv b/demo/data/samples.tsv
index 158f6972..5e419226 100755
--- a/demo/data/samples.tsv
+++ b/demo/data/samples.tsv
@@ -1,6 +1,6 @@
-sample_id	seq_type	patient_id	tissue_status	genome_build	strand	read_length
-TCRBOA7-N-WEX	capture	TCRBOA7	normal	grch37	positive	100
-TCRBOA7-T-WEX	capture	TCRBOA7	tumour	grch37	positive	100
-TCRBOA7-T-RNA	mrna	TCRBOA7	tumour	grch37	positive	100
-TCRBOA7-N-WGS	genome	TCRBOA7	normal	grch37	positive	100
-TCRBOA7-T-WGS	genome	TCRBOA7	tumour	grch37	positive	100
+sample_id	seq_type	patient_id	tissue_status	genome_build	strand	read_length	capture_space
+TCRBOA7-N-WEX	capture	TCRBOA7	normal	grch37	positive	100	none
+TCRBOA7-T-WEX	capture	TCRBOA7	tumour	grch37	positive	100	none
+TCRBOA7-T-RNA	mrna	TCRBOA7	tumour	grch37	positive	100	none
+TCRBOA7-N-WGS	genome	TCRBOA7	normal	grch37	positive	100	none
+TCRBOA7-T-WGS	genome	TCRBOA7	tumour	grch37	positive	100	none

From 163a80b73536454beffa1698a90ce6edbe9341ad Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Tue, 11 Jan 2022 09:10:20 -0800
Subject: [PATCH 22/38] add missing env files

---
 workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml | 1 +
 workflows/reference_files/2.4/envs/tabix-0.2.6.yaml     | 1 +
 2 files changed, 2 insertions(+)
 create mode 120000 workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml
 create mode 120000 workflows/reference_files/2.4/envs/tabix-0.2.6.yaml

diff --git a/workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml b/workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml
new file mode 120000
index 00000000..c185b12a
--- /dev/null
+++ b/workflows/reference_files/2.4/envs/bedtools-2.29.2.yaml
@@ -0,0 +1 @@
+../../../../envs/bedtools/bedtools-2.29.2.yaml
\ No newline at end of file
diff --git a/workflows/reference_files/2.4/envs/tabix-0.2.6.yaml b/workflows/reference_files/2.4/envs/tabix-0.2.6.yaml
new file mode 120000
index 00000000..bcf351f2
--- /dev/null
+++ b/workflows/reference_files/2.4/envs/tabix-0.2.6.yaml
@@ -0,0 +1 @@
+../../../../envs/tabix/tabix-0.2.6.yaml
\ No newline at end of file

From b62b84b69a39b2f5b260c567a4a1d62898fb9050 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Tue, 11 Jan 2022 14:30:27 -0800
Subject: [PATCH 23/38] revert erroneous error in mutect2

---
 modules/mutect2/2.0/mutect2.smk | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index 48ad27f0..603990ee 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -123,11 +123,12 @@ rule _mutect2_get_sm:
 # The first function loads the wildcard-containing file path and additional args from the config. 
 # The second replaces wildcards with those used in the rule. 
 def _mutect2_get_interval_cli_arg(
-    vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"]
+    vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"],
+    interval_arg_in = config["lcr-modules"]["mutect2"]["options"]["mutect2_interval_rules"]
 ):
     def _mutect2_get_interval_cli_custom(wildcards, input):
         if vcf_in:
-            param = f"-L {input.candidate_positions}"
+            param = f"-L {input.candidate_positions} {interval_arg_in}"
         else:
             param = ""
         return param
@@ -142,7 +143,8 @@ def _mutect_get_capspace(wildcards):
     try:
         # Get the appropriate capture space for this sample
         cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list")
-        return " -L " + cap_space
+        cap_space = reference_files(cap_space)
+        return cap_space
     except NameError:
         # If we are using an older version of the reference workflow, we don't need to do anything
         return []
@@ -186,7 +188,7 @@ rule _mutect2_run_matched_unmatched:
         -I {input.tumour_bam} -I {input.normal_bam}
         -R {input.fasta} -normal "$(cat {input.normal_sm})" -O {output.vcf}
         --germline-resource {input.gnomad} 
-        -L {wildcards.chrom} {params.interval_arg} {input.capture_arg}
+        -L {wildcards.chrom} {params.interval_arg} -L {input.capture_arg}
         -pon {input.pon} --f1r2-tar-gz {output.f1r2}
         > {log.stdout} 2> {log.stderr}
         """)

From 9cb922c81d77696fbeb3289ff53ecef52414d01c Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Tue, 11 Jan 2022 18:56:03 -0800
Subject: [PATCH 24/38] reorder if blocks

---
 modules/lofreq/1.1/lofreq.smk   |  6 +++---
 modules/manta/2.3/manta.smk     |  6 +++---
 modules/mutect2/2.0/mutect2.smk | 13 +++++++------
 modules/sage/1.0/sage.smk       |  6 +++---
 modules/strelka/1.1/strelka.smk |  6 +++---
 5 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk
index 8f120b93..895cc662 100644
--- a/modules/lofreq/1.1/lofreq.smk
+++ b/modules/lofreq/1.1/lofreq.smk
@@ -65,9 +65,6 @@ def _lofreq_get_capspace(wildcards):
         custom_bed = CFG['switches']['regions_bed'][wildcards.seq_type]
     else:
         custom_bed = default_bed
-    # If this is a genome sample, return a BED file listing all chromosomes
-    if wildcards.seq_type != "capture":
-        return custom_bed if custom_bed else default_bed
     try:
         if "tumour_id" in wildcards.keys():
         # Get the appropriate capture space for this sample
@@ -78,6 +75,9 @@ def _lofreq_get_capspace(wildcards):
     except NameError:
         # If we are using an older version of the reference workflow, use the same region file as the genome sample 
         this_bed = custom_bed if custom_bed else default_bed
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        return custom_bed if custom_bed else default_bed
     return this_bed
 
 ##### RULES #####
diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk
index e52fdcc8..7a009701 100644
--- a/modules/manta/2.3/manta.smk
+++ b/modules/manta/2.3/manta.smk
@@ -88,9 +88,6 @@ rule _manta_index_bed:
 
 def _manta_get_capspace(wildcards):
     CFG = config["lcr-modules"]["manta"]
-    # If this is a genome sample, return a BED file listing all chromosomes
-    if wildcards.seq_type != "capture":
-        this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
     try:
         # Get the appropriate capture space for this sample
         this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
@@ -98,6 +95,9 @@ def _manta_get_capspace(wildcards):
     except NameError:
         # If we are using an older version of the reference workflow, use the same region file as the genome sample
         this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
     return this_bed
 
 # Configures the manta workflow with the input BAM files and reference FASTA file.
diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index 603990ee..d5fff996 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -137,17 +137,18 @@ def _mutect2_get_interval_cli_arg(
 
 def _mutect_get_capspace(wildcards):
     CFG = config["lcr-modules"]["mutect2"]
-    # If this isn't a capture sample, we don't have a capture space, so return nothing
-    if wildcards.seq_type != "capture":
-        return []
     try:
         # Get the appropriate capture space for this sample
         cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list")
         cap_space = reference_files(cap_space)
-        return cap_space
+        this_space = cap_space
+    # If we are using an older version of the reference workflow, we don't need to do anything
     except NameError:
-        # If we are using an older version of the reference workflow, we don't need to do anything
-        return []
+        this_space = []
+    # If this isn't a capture sample, we don't have a capture space, so return nothing
+    if wildcards.seq_type != "capture":
+        this_space = []
+    return this_space
 
 
 # Launces Mutect2 in matched and unmatched mode
diff --git a/modules/sage/1.0/sage.smk b/modules/sage/1.0/sage.smk
index f274b8b0..3b101096 100644
--- a/modules/sage/1.0/sage.smk
+++ b/modules/sage/1.0/sage.smk
@@ -123,9 +123,6 @@ def get_chromosomes(wildcards):
 
 def _sage_get_capspace(wildcards):
     CFG = config["lcr-modules"]["sage"]
-    # If this is a genome sample, return a BED file listing all chromosomes
-    if wildcards.seq_type != "capture":
-        this_bed = rules._download_sage_references.output.panel_bed
     try:
         # Get the appropriate capture space for this sample
         this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
@@ -133,6 +130,9 @@ def _sage_get_capspace(wildcards):
     except NameError:
         # If we are using an older version of the reference workflow, use the same region file as the genome sample
         this_bed = rules._download_sage_references.output.panel_bed
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        this_bed = rules._download_sage_references.output.panel_bed
     return this_bed
 
 # Variant calling rule
diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk
index db8b9465..c3cbf117 100755
--- a/modules/strelka/1.1/strelka.smk
+++ b/modules/strelka/1.1/strelka.smk
@@ -121,9 +121,6 @@ def _strelka_get_indel_cli_arg(vcf_in = config["lcr-modules"]["strelka"]["inputs
 
 def _strelka_get_capspace(wildcards):
     CFG = config["lcr-modules"]["strelka"]
-    # If this is a genome sample, return a BED file listing all chromosomes
-    if wildcards.seq_type != "capture":
-        this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
     try:
         # Get the appropriate capture space for this sample
         this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
@@ -131,6 +128,9 @@ def _strelka_get_capspace(wildcards):
     except NameError:
         # If we are using an older version of the reference workflow, use the same region file as the genome sample
         this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        this_bed = str(config["lcr-modules"]["strelka"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
 
     return this_bed
 

From a22a1b87773ba600fb37b71ae1c7e9e883d07697 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Tue, 11 Jan 2022 20:38:23 -0800
Subject: [PATCH 25/38] remove temp fix

---
 modules/starfish/2.0/starfish.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/starfish/2.0/starfish.smk b/modules/starfish/2.0/starfish.smk
index f63d6988..d3213e5a 100644
--- a/modules/starfish/2.0/starfish.smk
+++ b/modules/starfish/2.0/starfish.smk
@@ -112,7 +112,7 @@ rule _starfish_run:
             CFG["dirs"]["inputs"] + "vcf/{{seq_type}}--{{genome_build}}/{{tumour_id}}--{{normal_id}}--{{pair_status}}.{caller}.vcf.gz.tbi", 
             caller = callers
             ),
-        reference = lambda w: ancient(reference_files("genomes/{genome_build}/sdf")) if not w.genome_build in ["hg38-panea", "hg38-nci"] else ancient(reference_files("genomes/hg38/sdf")),
+        reference = ancient(reference_files("genomes/{genome_build}/sdf")),
         starfish_script = CFG["inputs"]["starfish_script"]
     output:
         complete = touch(CFG["dirs"]["starfish"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/starfish.complete"), 

From d5899ef638fd1d10e7e6e8ebf610ba5e26df8ee3 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Wed, 12 Jan 2022 22:07:40 -0800
Subject: [PATCH 26/38] bump oncopipe version

---
 oncopipe/oncopipe/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/oncopipe/oncopipe/__version__.py b/oncopipe/oncopipe/__version__.py
index a5673be8..8c861b0f 100644
--- a/oncopipe/oncopipe/__version__.py
+++ b/oncopipe/oncopipe/__version__.py
@@ -6,7 +6,7 @@
 
 __title__ = "oncopipe"
 __description__ = "Functions for running Snakemake modules"
-__version__ = "1.0.11"
+__version__ = "1.0.12"
 __author__ = "Bruno Grande"
 __author_email__ = "bgrande@sfu.ca"
 __license__ = "MIT"

From 68fefd12d60d3209b7067a37107c922939ff855d Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Wed, 12 Jan 2022 22:09:54 -0800
Subject: [PATCH 27/38] expose oncopipe version

---
 oncopipe/oncopipe/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/oncopipe/oncopipe/__init__.py b/oncopipe/oncopipe/__init__.py
index fe3d828f..c6360904 100755
--- a/oncopipe/oncopipe/__init__.py
+++ b/oncopipe/oncopipe/__init__.py
@@ -9,6 +9,7 @@
 import collections.abc
 from datetime import datetime
 from collections import defaultdict, namedtuple
+from .__version__ import __version__
 
 import yaml
 import pandas as pd

From 57c66b3747bd24f1bdb45da5c686bba2c193e00d Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Wed, 12 Jan 2022 22:12:56 -0800
Subject: [PATCH 28/38] increase min oncopipe version in slms3 submodules

---
 modules/lofreq/1.1/lofreq.smk   | 2 +-
 modules/manta/2.3/manta.smk     | 2 +-
 modules/mutect2/2.0/mutect2.smk | 2 +-
 modules/sage/1.0/sage.smk       | 2 +-
 modules/strelka/1.1/strelka.smk | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk
index 895cc662..4a8e57c2 100644
--- a/modules/lofreq/1.1/lofreq.smk
+++ b/modules/lofreq/1.1/lofreq.smk
@@ -15,7 +15,7 @@
 import oncopipe as op
 
 # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
-min_oncopipe_version="1.0.11"
+min_oncopipe_version="1.0.12"
 import pkg_resources
 try:
     from packaging import version
diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk
index 7a009701..3a84e077 100644
--- a/modules/manta/2.3/manta.smk
+++ b/modules/manta/2.3/manta.smk
@@ -15,7 +15,7 @@
 import oncopipe as op
 
 # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
-min_oncopipe_version="1.0.11"
+min_oncopipe_version="1.0.12"
 import pkg_resources
 try:
     from packaging import version
diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index d5fff996..10c00294 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -17,7 +17,7 @@ import oncopipe as op
 import inspect
 
 # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
-min_oncopipe_version="1.0.11"
+min_oncopipe_version="1.0.12"
 import pkg_resources
 try:
     from packaging import version
diff --git a/modules/sage/1.0/sage.smk b/modules/sage/1.0/sage.smk
index 3b101096..d48343b7 100644
--- a/modules/sage/1.0/sage.smk
+++ b/modules/sage/1.0/sage.smk
@@ -16,7 +16,7 @@
 import oncopipe as op
 
 # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
-min_oncopipe_version="1.0.11"
+min_oncopipe_version="1.0.12"
 import pkg_resources
 try:
     from packaging import version
diff --git a/modules/strelka/1.1/strelka.smk b/modules/strelka/1.1/strelka.smk
index c3cbf117..089fa085 100755
--- a/modules/strelka/1.1/strelka.smk
+++ b/modules/strelka/1.1/strelka.smk
@@ -16,7 +16,7 @@
 import oncopipe as op
 
 # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
-min_oncopipe_version="1.0.11"
+min_oncopipe_version="1.0.12"
 import pkg_resources
 try:
     from packaging import version

From 5af516ab62d8470928e3b21322cb761db9819db8 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Wed, 12 Jan 2022 22:16:55 -0800
Subject: [PATCH 29/38] harmonize mutect2 no_normal rule

---
 modules/mutect2/2.0/mutect2.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index 10c00294..78fac489 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -230,7 +230,7 @@ rule _mutect2_run_no_normal:
         gatk Mutect2 --java-options "-Xmx{params.mem_mb}m" 
         {params.opts} -I {input.tumour_bam} -R {input.fasta} 
         -O {output.vcf} --germline-resource {input.gnomad} 
-        -L {wildcards.chrom} {params.interval_arg} {input.capture_arg}
+        -L {wildcards.chrom} {params.interval_arg} -L {input.capture_arg}
         -pon {input.pon} --f1r2-tar-gz {output.f1r2}
         > {log.stdout} 2> {log.stderr}
         """)

From 7bfe4f6c5346fe7bbaa63efada27709f854c6ab0 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Mon, 17 Jan 2022 19:57:34 -0800
Subject: [PATCH 30/38] make capture_space column not required

---
 oncopipe/oncopipe/__init__.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/oncopipe/oncopipe/__init__.py b/oncopipe/oncopipe/__init__.py
index 83ed041d..f4afd253 100755
--- a/oncopipe/oncopipe/__init__.py
+++ b/oncopipe/oncopipe/__init__.py
@@ -1837,7 +1837,11 @@ def get_capture_space(module_config, sample_id, genome_build, seq_type, return_e
 
     if len(this_sample) != 1:
         raise AssertionError("Found %s matches when examining the sample table for pair \'%s\' \'%s\' \'%s\'" % (len(sample), sample_id, genome_build, seq_type))
-    panel = this_sample.iloc[0]['capture_space']
+
+    if "capture_space" in this_sample.columns:
+        panel = this_sample.iloc[0]['capture_space']
+    else:
+        panel = "none"
 
     # If this panel is "none" (aka not specified) use the default for this reference genome
     if panel.upper() in (name.upper() for name in ['none', "na", "n/a", ""]):

From 3a3ddc3850eb8757cf334794e1c5939a9e697efe Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Mon, 17 Jan 2022 19:58:57 -0800
Subject: [PATCH 31/38] revert sample table

---
 demo/data/samples.tsv | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/demo/data/samples.tsv b/demo/data/samples.tsv
index 5e419226..158f6972 100755
--- a/demo/data/samples.tsv
+++ b/demo/data/samples.tsv
@@ -1,6 +1,6 @@
-sample_id	seq_type	patient_id	tissue_status	genome_build	strand	read_length	capture_space
-TCRBOA7-N-WEX	capture	TCRBOA7	normal	grch37	positive	100	none
-TCRBOA7-T-WEX	capture	TCRBOA7	tumour	grch37	positive	100	none
-TCRBOA7-T-RNA	mrna	TCRBOA7	tumour	grch37	positive	100	none
-TCRBOA7-N-WGS	genome	TCRBOA7	normal	grch37	positive	100	none
-TCRBOA7-T-WGS	genome	TCRBOA7	tumour	grch37	positive	100	none
+sample_id	seq_type	patient_id	tissue_status	genome_build	strand	read_length
+TCRBOA7-N-WEX	capture	TCRBOA7	normal	grch37	positive	100
+TCRBOA7-T-WEX	capture	TCRBOA7	tumour	grch37	positive	100
+TCRBOA7-T-RNA	mrna	TCRBOA7	tumour	grch37	positive	100
+TCRBOA7-N-WGS	genome	TCRBOA7	normal	grch37	positive	100
+TCRBOA7-T-WGS	genome	TCRBOA7	tumour	grch37	positive	100

From bbe0c4b521504d7902a3368f0232d85b7e6f3da6 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Wed, 19 Jan 2022 13:40:46 -0800
Subject: [PATCH 32/38] mutect2 run on chrs with candidate positions only

---
 modules/mutect2/2.0/mutect2.smk | 58 +++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 20 deletions(-)

diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index 78fac489..4377bf4a 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -15,6 +15,7 @@
 # Import package with useful functions for developing analysis modules
 import oncopipe as op
 import inspect
+import pandas as pd
 
 # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
 min_oncopipe_version="1.0.12"
@@ -74,7 +75,20 @@ localrules:
 
 
 ##### RULES #####
-
+def _mutect_get_capspace(wildcards):
+    CFG = config["lcr-modules"]["mutect2"]
+    try:
+        # Get the appropriate capture space for this sample
+        cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list")
+        cap_space = reference_files(cap_space)
+        this_space = cap_space
+    # If we are using an older version of the reference workflow, we don't need to do anything
+    except NameError:
+        this_space = []
+    # If this isn't a capture sample, we don't have a capture space, so return nothing
+    if wildcards.seq_type != "capture":
+        this_space = []
+    return this_space
 
 # Symlinks the input files into the module results directory (under '00-inputs/')
 rule _mutect2_input_bam:
@@ -97,11 +111,31 @@ rule _mutect2_dummy_positions:
 # Symlink chromosomes used for parallelization
 checkpoint _mutect2_input_chrs:
     input:
-        chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt")
+        candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output),
+        chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt"),
+        capture_arg = _mutect_get_capspace
     output:
-        chrs = CFG["dirs"]["inputs"] + "chroms/{genome_build}/main_chromosomes.txt"
+        chrs = CFG["dirs"]["inputs"] + "chroms/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/mutated_chromosomes.txt"
     run:
-        op.absolute_symlink(input.chrs, output.chrs)
+        # obtain list of main chromosomes
+        main_chrs = pd.read_csv(input.chrs, comment='#', sep='\t', header=None)
+        main_chrs = main_chrs.iloc[:, 0].astype(str).unique().tolist()
+        #obtain list of chromosomes in candidate positions
+        candidate_chrs = pd.read_csv(input.candidate_positions, comment='#', sep='\t')
+        candidate_chrs = candidate_chrs.iloc[:, 0].astype(str).unique().tolist()
+        # obtain list of chromosomes in the capture space
+        interval_chrs = pd.read_csv(input.capture_arg, comment='@', sep='\t')
+        interval_chrs = interval_chrs.iloc[:, 0].astype(str).unique().tolist()
+        # as there is no capture space for genomes and to handle chr prefixed/non prefixed cases consistently,
+        # return this list as main chromosomes
+        if not interval_chrs: # if list is empty
+            interval_chrs = main_chrs
+        # intersect the three lists to obtain chromosomes present in all
+        intersect_chrs = list(set(main_chrs) & set(candidate_chrs) & set(interval_chrs))
+        # convert list to single-column df
+        intersect_chrs = pd.DataFrame(intersect_chrs).sort_values(0)
+        # write out the file with mutated chromosomes
+        intersect_chrs.to_csv(output.chrs, index=False, header=False)
 
 
 # Retrieves from SM tag from BAM and writes to file
@@ -135,22 +169,6 @@ def _mutect2_get_interval_cli_arg(
     return _mutect2_get_interval_cli_custom
 
 
-def _mutect_get_capspace(wildcards):
-    CFG = config["lcr-modules"]["mutect2"]
-    try:
-        # Get the appropriate capture space for this sample
-        cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list")
-        cap_space = reference_files(cap_space)
-        this_space = cap_space
-    # If we are using an older version of the reference workflow, we don't need to do anything
-    except NameError:
-        this_space = []
-    # If this isn't a capture sample, we don't have a capture space, so return nothing
-    if wildcards.seq_type != "capture":
-        this_space = []
-    return this_space
-
-
 # Launces Mutect2 in matched and unmatched mode
 rule _mutect2_run_matched_unmatched:
     input:

From 869302f8c98701a7ac5e67f0c61521f1faa92da2 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Wed, 19 Jan 2022 19:54:10 -0800
Subject: [PATCH 33/38] ensure genome compatibility of new updates

---
 modules/mutect2/2.0/mutect2.smk | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index 4377bf4a..cdf0c5af 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -84,10 +84,10 @@ def _mutect_get_capspace(wildcards):
         this_space = cap_space
     # If we are using an older version of the reference workflow, we don't need to do anything
     except NameError:
-        this_space = []
+        this_space = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed")
     # If this isn't a capture sample, we don't have a capture space, so return nothing
     if wildcards.seq_type != "capture":
-        this_space = []
+        this_space = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed")
     return this_space
 
 # Symlinks the input files into the module results directory (under '00-inputs/')
@@ -126,10 +126,6 @@ checkpoint _mutect2_input_chrs:
         # obtain list of chromosomes in the capture space
         interval_chrs = pd.read_csv(input.capture_arg, comment='@', sep='\t')
         interval_chrs = interval_chrs.iloc[:, 0].astype(str).unique().tolist()
-        # as there is no capture space for genomes and to handle chr prefixed/non prefixed cases consistently,
-        # return this list as main chromosomes
-        if not interval_chrs: # if list is empty
-            interval_chrs = main_chrs
         # intersect the three lists to obtain chromosomes present in all
         intersect_chrs = list(set(main_chrs) & set(candidate_chrs) & set(interval_chrs))
         # convert list to single-column df

From 260d53d6cf2fc7fb78e975f50fa7d64e60edfe57 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Wed, 19 Jan 2022 19:55:53 -0800
Subject: [PATCH 34/38] allow to specify local bed file for capture

---
 workflows/reference_files/2.4/reference_files.smk      |  8 ++++++--
 .../reference_files/2.4/reference_files_header.smk     | 10 ++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/workflows/reference_files/2.4/reference_files.smk b/workflows/reference_files/2.4/reference_files.smk
index 7962f0d2..f3e3a831 100644
--- a/workflows/reference_files/2.4/reference_files.smk
+++ b/workflows/reference_files/2.4/reference_files.smk
@@ -474,6 +474,7 @@ rule sort_and_pad_capspace:
        capture_bed = rules.get_capspace_bed_download.output.capture_bed,
        fai = rules.index_genome_fasta.output.fai
     output:
+        intermediate_bed = temp("genomes/{genome_build}/capture_space/{capture_space}.intermediate.bed"),
         padded_bed = "genomes/{genome_build}/capture_space/{capture_space}.padded.bed"
     params:
         padding_size = config['capture_params']['padding_size']  # Default to 200. I would be warry of changing
@@ -481,8 +482,11 @@ rule sort_and_pad_capspace:
         "genomes/{genome_build}/capture_space/{capture_space}.padded.bed.log"
     conda: CONDA_ENVS["bedtools"]
     shell:
-        "bedtools slop -b {params.padding_size} -i {input.capture_bed} -g {input.fai} | bedtools sort | bedtools merge > {output.padded_bed} 2> {log}"
-
+        op.as_one_line("""
+        cat {input.fai} | cut -f 1-2 | perl -ane 'print "$F[0]\\t0\\t$F[1]\\n"' | bedtools intersect -wa -a {input.capture_bed} -b stdin > {output.intermediate_bed}
+            &&
+        bedtools slop -b {params.padding_size} -i {output.intermediate_bed} -g {input.fai} | bedtools sort | bedtools merge > {output.padded_bed} 2> {log}
+        """)
 
 rule check_capspace_contigs:
     input:
diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk
index 9c1581d3..20479bd6 100644
--- a/workflows/reference_files/2.4/reference_files_header.smk
+++ b/workflows/reference_files/2.4/reference_files_header.smk
@@ -530,10 +530,16 @@ rule download_capspace_bed:
     log:
         "downloads/capture_space/{capture_space}.{genome_build}.bed.log"
     params:
-        url = lambda w: config["capture_space"][w.capture_space]["capture_bed_url"],
+        path = lambda w: config["capture_space"][w.capture_space]["capture_bed_file"] if "capture_bed_file" in config["capture_space"][w.capture_space] else config["capture_space"][w.capture_space]["capture_bed_url"],
         provider = lambda w: config["capture_space"][w.capture_space]["provider"]
     shell:
-        "curl -L {params.url} > {output.capture_bed} 2> {log}"
+        op.as_one_line("""
+        if [ -e {params.path} ]; then
+            cat {params.path} > {output.capture_bed} 2> {log};
+        else
+            curl -L {params.path} > {output.capture_bed} 2> {log};
+        fi
+        """)
 
 rule add_remove_chr_prefix_bed:
     input:

From d8cc389b934f8f0c17d001ed19cecc611f2d8c53 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Wed, 19 Jan 2022 20:05:01 -0800
Subject: [PATCH 35/38] add instructions on adding custom capture space

---
 workflows/reference_files/2.4/config/default.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/workflows/reference_files/2.4/config/default.yaml b/workflows/reference_files/2.4/config/default.yaml
index a3232434..1ad75c50 100644
--- a/workflows/reference_files/2.4/config/default.yaml
+++ b/workflows/reference_files/2.4/config/default.yaml
@@ -74,6 +74,12 @@ capture_space:
         provider: "ensembl"
         default: "true"
         capture_bed_url: "https://www.bcgsc.ca/downloads/lcr-modules/genome_fastas/capture_bed/grch37_all_genes.canonical.sort.bed4"
+    # to add custom capture space, follow this example and fill in dictionary keys
+    #name_of_capture_panel:
+        #genome: "grch37" or "hg38" as example
+        #provider: "ensembl" or "ucsc"
+        #capture_bed_url: here provide a link enclosed in "" to download bed file for capture panel if it is available in internet
+        #capture_bed_file: this key is optional and in "" you can specify path to a local bed file with capture panel
 
 capture_params:
     padding_size: "200"

From cfdcf96847c172259c7b6198329d5fd4d800565d Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Fri, 21 Jan 2022 10:52:36 -0800
Subject: [PATCH 36/38] revert name of checkpoint output file

---
 modules/mutect2/2.0/mutect2.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index cdf0c5af..f939c0ef 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -115,7 +115,7 @@ checkpoint _mutect2_input_chrs:
         chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt"),
         capture_arg = _mutect_get_capspace
     output:
-        chrs = CFG["dirs"]["inputs"] + "chroms/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/mutated_chromosomes.txt"
+        chrs = CFG["dirs"]["inputs"] + "chroms/{genome_build}/main_chromosomes.txt"
     run:
         # obtain list of main chromosomes
         main_chrs = pd.read_csv(input.chrs, comment='#', sep='\t', header=None)

From 689b5ee42887f4e044bd8c183c576c34ad90e297 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Fri, 21 Jan 2022 12:56:06 -0800
Subject: [PATCH 37/38] revert the revert back in mutect2 checkpoint

---
 modules/mutect2/2.0/mutect2.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
index f939c0ef..cdf0c5af 100644
--- a/modules/mutect2/2.0/mutect2.smk
+++ b/modules/mutect2/2.0/mutect2.smk
@@ -115,7 +115,7 @@ checkpoint _mutect2_input_chrs:
         chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt"),
         capture_arg = _mutect_get_capspace
     output:
-        chrs = CFG["dirs"]["inputs"] + "chroms/{genome_build}/main_chromosomes.txt"
+        chrs = CFG["dirs"]["inputs"] + "chroms/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/mutated_chromosomes.txt"
     run:
         # obtain list of main chromosomes
         main_chrs = pd.read_csv(input.chrs, comment='#', sep='\t', header=None)

From 1a2b62e5987b02dc13df415b18801214bca2fd12 Mon Sep 17 00:00:00 2001
From: Kdreval <k.dreval@gmail.com>
Date: Fri, 21 Jan 2022 12:57:09 -0800
Subject: [PATCH 38/38] allow using local fasta files in reference_files

---
 .../2.4/reference_files_header.smk               | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/workflows/reference_files/2.4/reference_files_header.smk b/workflows/reference_files/2.4/reference_files_header.smk
index 20479bd6..d2326396 100644
--- a/workflows/reference_files/2.4/reference_files_header.smk
+++ b/workflows/reference_files/2.4/reference_files_header.smk
@@ -173,16 +173,22 @@ for chrom_map_file in CHROM_MAPPINGS_FILES:
 
 
 rule download_genome_fasta:
-    output: 
+    output:
         fasta = "downloads/genome_fasta/{genome_build}.fa"
-    log: 
+    log:
         "downloads/genome_fasta/{genome_build}.fa.log"
     wildcard_constraints:
         genome_build = ".+(?<!masked)"
-    params: 
-        url = lambda w: config["genome_builds"][w.genome_build]["genome_fasta_url"]
+    params:
+        path = lambda w: config["genome_builds"][w.genome_build]["genome_fasta_file"] if "genome_fasta_file" in config["genome_builds"][w.genome_build] else config["genome_builds"][w.genome_build]["genome_fasta_url"],
     shell:
-        "curl -L {params.url} > {output.fasta} 2> {log}"
+        op.as_one_line("""
+        if [ -e {params.path} ]; then
+            cat {params.path} > {output.fasta} 2> {log};
+        else
+            curl -L {params.path} > {output.fasta} 2> {log};
+        fi
+        """)
 
 rule download_masked_genome_fasta:
     output: