Merge pull request #192 from LCR-BCCRC/crushton-targeted

Added support for capture data to reference workflow + SLMS-3
LCR-BCCRC · Jan 21, 2022 · 82439d6 · 82439d6
2 parents a8a112c + 1a2b62e
commit 82439d6
Show file tree

Hide file tree

Showing 14 changed files with 483 additions and 58 deletions.
diff --git a/modules/lofreq/1.1/lofreq.smk b/modules/lofreq/1.1/lofreq.smk
@@ -15,7 +15,7 @@
 import oncopipe as op
 
 # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
-min_oncopipe_version="1.0.11"
+min_oncopipe_version="1.0.12"
 import pkg_resources
 try:
     from packaging import version
@@ -58,6 +58,27 @@ localrules:
     _lofreq_all,
     _lofreq_link_to_preprocessed
 
+def _lofreq_get_capspace(wildcards):
+    CFG=config["lcr-modules"]["lofreq"]
+    default_bed = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed")
+    if str(wildcards.seq_type) in CFG['switches']['regions_bed'].keys():
+        custom_bed = CFG['switches']['regions_bed'][wildcards.seq_type]
+    else:
+        custom_bed = default_bed
+    try:
+        if "tumour_id" in wildcards.keys():
+        # Get the appropriate capture space for this sample
+            this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed")
+        else:
+            this_bed = op.get_capture_space(CFG, wildcards.normal_id, wildcards.genome_build, wildcards.seq_type, "bed")
+        this_bed = reference_files(this_bed)
+    except NameError:
+        # If we are using an older version of the reference workflow, use the same region file as the genome sample 
+        this_bed = custom_bed if custom_bed else default_bed
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        return custom_bed if custom_bed else default_bed
+    return this_bed
 
 ##### RULES #####
 
@@ -84,7 +105,7 @@ rule _lofreq_preprocess_normal:
         normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
         fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
         dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect
-        bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"])
+        bed = _lofreq_get_capspace
     output:
         preprocessing_start = CFG["dirs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/preprocessing.started",
         vcf_relaxed = CFG["dirs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/normal_relaxed.vcf.gz",
@@ -164,7 +185,7 @@ rule _lofreq_run_tumour_unmatched:
         vcf_relaxed = str(rules._lofreq_link_to_preprocessed.output.vcf_relaxed),
         fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
         dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect
-        bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"])
+        bed = _lofreq_get_capspace
     output:
         vcf_snvs_filtered = CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.snvs.vcf.gz",
         vcf_indels_filtered = CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.indels.vcf.gz",
@@ -202,7 +223,7 @@ rule _lofreq_run_tumour_matched:
         normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
         fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
         dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect
-        bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"])
+        bed = _lofreq_get_capspace
     output:
         vcf_relaxed = temp(CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/normal_relaxed.vcf.gz"),
         vcf_snvs_filtered = temp(CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.snvs.vcf.gz"),

diff --git a/modules/manta/2.3/manta.smk b/modules/manta/2.3/manta.smk
@@ -6,7 +6,7 @@
 
 # Original Snakefile Author:    Bruno Grande
 # Module Author:                Bruno Grande
-# Additional Contributors:      N/A
+# Additional Contributors:      Chris Rushton
 
 
 ##### SETUP #####
@@ -15,7 +15,7 @@
 import oncopipe as op
 
 # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
-min_oncopipe_version="1.0.11"
+min_oncopipe_version="1.0.12"
 import pkg_resources
 try:
     from packaging import version
@@ -86,14 +86,28 @@ rule _manta_index_bed:
         tabix {output.bedz}
         """)
 
+def _manta_get_capspace(wildcards):
+    CFG = config["lcr-modules"]["manta"]
+    try:
+        # Get the appropriate capture space for this sample
+        this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
+        this_bed = reference_files(this_bed)
+    except NameError:
+        # If we are using an older version of the reference workflow, use the same region file as the genome sample
+        this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
+    return this_bed
+
 # Configures the manta workflow with the input BAM files and reference FASTA file.
 rule _manta_configure_paired:
     input:
         tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam",
         normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
         fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
         config = op.switch_on_wildcard("seq_type", CFG["switches"]["manta_config"]),
-        bedz = str(rules._manta_index_bed.output.bedz)
+        bedz = _manta_get_capspace
     output:
         runwf = CFG["dirs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py"
     log:
@@ -120,7 +134,7 @@ rule _manta_configure_unpaired:
         tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam",
         fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
         config = op.switch_on_wildcard("seq_type", CFG["switches"]["manta_config"]),
-        bedz = str(rules._manta_index_bed.output.bedz)
+        bedz = _manta_get_capspace
     output:
         runwf = CFG["dirs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py"
     log:
@@ -135,7 +149,7 @@ rule _manta_configure_unpaired:
         CFG["conda_envs"]["manta"]
     shell:
         op.as_one_line("""
-        configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {input.bedz}
+        configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {params.bedz}
         --runDir "$(dirname {output.runwf})" {params.tumour_bam_arg_name} {input.tumour_bam}
         --config {input.config} > {log.stdout} 2> {log.stderr}
         """)
@@ -291,6 +305,7 @@ def _manta_predict_output(wildcards):
     return outputs_with_bedpe + outputs_without_bedpe
 
 
+
 # Generates the target symlinks for each run depending on the Manta output VCF files
 rule _manta_dispatch:
     input:

diff --git a/modules/mutect2/2.0/mutect2.smk b/modules/mutect2/2.0/mutect2.smk
@@ -15,9 +15,10 @@
 # Import package with useful functions for developing analysis modules
 import oncopipe as op
 import inspect
+import pandas as pd
 
 # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
-min_oncopipe_version="1.0.11"
+min_oncopipe_version="1.0.12"
 import pkg_resources
 try:
     from packaging import version
@@ -74,7 +75,20 @@ localrules:
 
 
 ##### RULES #####
-
+def _mutect_get_capspace(wildcards):
+    CFG = config["lcr-modules"]["mutect2"]
+    try:
+        # Get the appropriate capture space for this sample
+        cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list")
+        cap_space = reference_files(cap_space)
+        this_space = cap_space
+    # If we are using an older version of the reference workflow, we don't need to do anything
+    except NameError:
+        this_space = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed")
+    # If this isn't a capture sample, we don't have a capture space, so return nothing
+    if wildcards.seq_type != "capture":
+        this_space = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed")
+    return this_space
 
 # Symlinks the input files into the module results directory (under '00-inputs/')
 rule _mutect2_input_bam:
@@ -97,11 +111,27 @@ rule _mutect2_dummy_positions:
 # Symlink chromosomes used for parallelization
 checkpoint _mutect2_input_chrs:
     input:
-        chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt")
+        candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output),
+        chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt"),
+        capture_arg = _mutect_get_capspace
     output:
-        chrs = CFG["dirs"]["inputs"] + "chroms/{genome_build}/main_chromosomes.txt"
+        chrs = CFG["dirs"]["inputs"] + "chroms/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/mutated_chromosomes.txt"
     run:
-        op.absolute_symlink(input.chrs, output.chrs)
+        # obtain list of main chromosomes
+        main_chrs = pd.read_csv(input.chrs, comment='#', sep='\t', header=None)
+        main_chrs = main_chrs.iloc[:, 0].astype(str).unique().tolist()
+        #obtain list of chromosomes in candidate positions
+        candidate_chrs = pd.read_csv(input.candidate_positions, comment='#', sep='\t')
+        candidate_chrs = candidate_chrs.iloc[:, 0].astype(str).unique().tolist()
+        # obtain list of chromosomes in the capture space
+        interval_chrs = pd.read_csv(input.capture_arg, comment='@', sep='\t')
+        interval_chrs = interval_chrs.iloc[:, 0].astype(str).unique().tolist()
+        # intersect the three lists to obtain chromosomes present in all
+        intersect_chrs = list(set(main_chrs) & set(candidate_chrs) & set(interval_chrs))
+        # convert list to single-column df
+        intersect_chrs = pd.DataFrame(intersect_chrs).sort_values(0)
+        # write out the file with mutated chromosomes
+        intersect_chrs.to_csv(output.chrs, index=False, header=False)
 
 
 # Retrieves from SM tag from BAM and writes to file
@@ -123,13 +153,13 @@ rule _mutect2_get_sm:
 # The first function loads the wildcard-containing file path and additional args from the config. 
 # The second replaces wildcards with those used in the rule. 
 def _mutect2_get_interval_cli_arg(
-    vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"], 
+    vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"],
     interval_arg_in = config["lcr-modules"]["mutect2"]["options"]["mutect2_interval_rules"]
 ):
     def _mutect2_get_interval_cli_custom(wildcards, input):
         if vcf_in:
             param = f"-L {input.candidate_positions} {interval_arg_in}"
-        else: 
+        else:
             param = ""
         return param
     return _mutect2_get_interval_cli_custom
@@ -145,7 +175,8 @@ rule _mutect2_run_matched_unmatched:
         gnomad = reference_files("genomes/{genome_build}/variation/af-only-gnomad.{genome_build}.vcf.gz"),
         normal_sm = CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{normal_id}_sm.txt", 
         pon = reference_files("genomes/{genome_build}/gatk/mutect2_pon.{genome_build}.vcf.gz"), 
-        candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output)
+        candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output),
+        capture_arg = _mutect_get_capspace
     output:
         vcf = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz"),
         tbi = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz.tbi"),
@@ -172,7 +203,7 @@ rule _mutect2_run_matched_unmatched:
         -I {input.tumour_bam} -I {input.normal_bam}
         -R {input.fasta} -normal "$(cat {input.normal_sm})" -O {output.vcf}
         --germline-resource {input.gnomad} 
-        -L {wildcards.chrom} {params.interval_arg} 
+        -L {wildcards.chrom} {params.interval_arg} -L {input.capture_arg}
         -pon {input.pon} --f1r2-tar-gz {output.f1r2}
         > {log.stdout} 2> {log.stderr}
         """)
@@ -186,7 +217,8 @@ rule _mutect2_run_no_normal:
         dict = reference_files("genomes/{genome_build}/genome_fasta/genome.dict"),
         gnomad = reference_files("genomes/{genome_build}/variation/af-only-gnomad.{genome_build}.vcf.gz"),
         pon = reference_files("genomes/{genome_build}/gatk/mutect2_pon.{genome_build}.vcf.gz"), 
-        candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output)
+        candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output),
+        capture_arg = _mutect_get_capspace
     output:
         vcf = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz"),
         tbi = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz.tbi"),
@@ -200,7 +232,7 @@ rule _mutect2_run_no_normal:
     params:
         mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8),
         opts = CFG["options"]["mutect2_run"], 
-        interval_arg = _mutect2_get_interval_cli_arg() 
+        interval_arg = _mutect2_get_interval_cli_arg()
     conda:
         CFG["conda_envs"]["gatk"]
     threads:
@@ -212,7 +244,7 @@ rule _mutect2_run_no_normal:
         gatk Mutect2 --java-options "-Xmx{params.mem_mb}m" 
         {params.opts} -I {input.tumour_bam} -R {input.fasta} 
         -O {output.vcf} --germline-resource {input.gnomad} 
-        -L {wildcards.chrom} {params.interval_arg}
+        -L {wildcards.chrom} {params.interval_arg} -L {input.capture_arg}
         -pon {input.pon} --f1r2-tar-gz {output.f1r2}
         > {log.stdout} 2> {log.stderr}
         """)

diff --git a/modules/sage/1.0/sage.smk b/modules/sage/1.0/sage.smk
@@ -16,7 +16,7 @@
 import oncopipe as op
 
 # Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
-min_oncopipe_version="1.0.11"
+min_oncopipe_version="1.0.12"
 import pkg_resources
 try:
     from packaging import version
@@ -121,15 +121,29 @@ def get_chromosomes(wildcards):
     chromosomes= ",".join(chromosomes)    
     return chromosomes
 
+def _sage_get_capspace(wildcards):
+    CFG = config["lcr-modules"]["sage"]
+    try:
+        # Get the appropriate capture space for this sample
+        this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
+        this_bed = reference_files(this_bed)
+    except NameError:
+        # If we are using an older version of the reference workflow, use the same region file as the genome sample
+        this_bed = rules._download_sage_references.output.panel_bed
+    # If this is a genome sample, return a BED file listing all chromosomes
+    if wildcards.seq_type != "capture":
+        this_bed = rules._download_sage_references.output.panel_bed
+    return this_bed
+
 # Variant calling rule
 rule _run_sage:
     input:
         tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam",
         normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
         fasta = str(rules._input_references.output.genome_fa),
-        hotspots = str(rules._download_sage_references.output.hotspots),
-        panel_bed = str(rules._download_sage_references.output.panel_bed),
-        high_conf_bed = str(rules._download_sage_references.output.high_conf_bed)
+        hotspots = rules._download_sage_references.output.hotspots,
+        high_conf_bed = str(rules._download_sage_references.output.high_conf_bed),
+        panel_bed = _sage_get_capspace
     output:
         vcf = temp(CFG["dirs"]["sage"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.vcf"),
         vcf_gz = temp(CFG["dirs"]["sage"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.vcf.gz")