Skip to content

Commit

Permalink
Merge pull request #192 from LCR-BCCRC/crushton-targeted
Browse files Browse the repository at this point in the history
Added support for capture data to reference workflow + SLMS-3
  • Loading branch information
Kdreval authored Jan 21, 2022
2 parents a8a112c + 1a2b62e commit 82439d6
Show file tree
Hide file tree
Showing 14 changed files with 483 additions and 58 deletions.
29 changes: 25 additions & 4 deletions modules/lofreq/1.1/lofreq.smk
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import oncopipe as op

# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
min_oncopipe_version="1.0.11"
min_oncopipe_version="1.0.12"
import pkg_resources
try:
from packaging import version
Expand Down Expand Up @@ -58,6 +58,27 @@ localrules:
_lofreq_all,
_lofreq_link_to_preprocessed

def _lofreq_get_capspace(wildcards):
CFG=config["lcr-modules"]["lofreq"]
default_bed = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed")
if str(wildcards.seq_type) in CFG['switches']['regions_bed'].keys():
custom_bed = CFG['switches']['regions_bed'][wildcards.seq_type]
else:
custom_bed = default_bed
try:
if "tumour_id" in wildcards.keys():
# Get the appropriate capture space for this sample
this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed")
else:
this_bed = op.get_capture_space(CFG, wildcards.normal_id, wildcards.genome_build, wildcards.seq_type, "bed")
this_bed = reference_files(this_bed)
except NameError:
# If we are using an older version of the reference workflow, use the same region file as the genome sample
this_bed = custom_bed if custom_bed else default_bed
# If this is a genome sample, return a BED file listing all chromosomes
if wildcards.seq_type != "capture":
return custom_bed if custom_bed else default_bed
return this_bed

##### RULES #####

Expand All @@ -84,7 +105,7 @@ rule _lofreq_preprocess_normal:
normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect
bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"])
bed = _lofreq_get_capspace
output:
preprocessing_start = CFG["dirs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/preprocessing.started",
vcf_relaxed = CFG["dirs"]["lofreq_normal"] + "{seq_type}--{genome_build}/{normal_id}/normal_relaxed.vcf.gz",
Expand Down Expand Up @@ -164,7 +185,7 @@ rule _lofreq_run_tumour_unmatched:
vcf_relaxed = str(rules._lofreq_link_to_preprocessed.output.vcf_relaxed),
fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect
bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"])
bed = _lofreq_get_capspace
output:
vcf_snvs_filtered = CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.snvs.vcf.gz",
vcf_indels_filtered = CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.indels.vcf.gz",
Expand Down Expand Up @@ -202,7 +223,7 @@ rule _lofreq_run_tumour_matched:
normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
dbsnp = reference_files("genomes/{genome_build}/variation/dbsnp.common_all-151.vcf.gz"), #in our experience, this filter doesn't remove as many SNPs as one would expect
bed = op.switch_on_wildcard("seq_type", CFG["switches"]["regions_bed"])
bed = _lofreq_get_capspace
output:
vcf_relaxed = temp(CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/normal_relaxed.vcf.gz"),
vcf_snvs_filtered = temp(CFG["dirs"]["lofreq_somatic"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/somatic_final_minus-dbsnp.snvs.vcf.gz"),
Expand Down
25 changes: 20 additions & 5 deletions modules/manta/2.3/manta.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

# Original Snakefile Author: Bruno Grande
# Module Author: Bruno Grande
# Additional Contributors: N/A
# Additional Contributors: Chris Rushton


##### SETUP #####
Expand All @@ -15,7 +15,7 @@
import oncopipe as op

# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
min_oncopipe_version="1.0.11"
min_oncopipe_version="1.0.12"
import pkg_resources
try:
from packaging import version
Expand Down Expand Up @@ -86,14 +86,28 @@ rule _manta_index_bed:
tabix {output.bedz}
""")

def _manta_get_capspace(wildcards):
CFG = config["lcr-modules"]["manta"]
try:
# Get the appropriate capture space for this sample
this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
this_bed = reference_files(this_bed)
except NameError:
# If we are using an older version of the reference workflow, use the same region file as the genome sample
this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
# If this is a genome sample, return a BED file listing all chromosomes
if wildcards.seq_type != "capture":
this_bed = str(config["lcr-modules"]["manta"]["dirs"]["chrom_bed"] + "{genome_build}.main_chroms.bed.gz")
return this_bed

# Configures the manta workflow with the input BAM files and reference FASTA file.
rule _manta_configure_paired:
input:
tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam",
normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
config = op.switch_on_wildcard("seq_type", CFG["switches"]["manta_config"]),
bedz = str(rules._manta_index_bed.output.bedz)
bedz = _manta_get_capspace
output:
runwf = CFG["dirs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py"
log:
Expand All @@ -120,7 +134,7 @@ rule _manta_configure_unpaired:
tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam",
fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"),
config = op.switch_on_wildcard("seq_type", CFG["switches"]["manta_config"]),
bedz = str(rules._manta_index_bed.output.bedz)
bedz = _manta_get_capspace
output:
runwf = CFG["dirs"]["manta"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/runWorkflow.py"
log:
Expand All @@ -135,7 +149,7 @@ rule _manta_configure_unpaired:
CFG["conda_envs"]["manta"]
shell:
op.as_one_line("""
configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {input.bedz}
configManta.py {params.opts} --referenceFasta {input.fasta} --callRegions {params.bedz}
--runDir "$(dirname {output.runwf})" {params.tumour_bam_arg_name} {input.tumour_bam}
--config {input.config} > {log.stdout} 2> {log.stderr}
""")
Expand Down Expand Up @@ -291,6 +305,7 @@ def _manta_predict_output(wildcards):
return outputs_with_bedpe + outputs_without_bedpe



# Generates the target symlinks for each run depending on the Manta output VCF files
rule _manta_dispatch:
input:
Expand Down
56 changes: 44 additions & 12 deletions modules/mutect2/2.0/mutect2.smk
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
# Import package with useful functions for developing analysis modules
import oncopipe as op
import inspect
import pandas as pd

# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
min_oncopipe_version="1.0.11"
min_oncopipe_version="1.0.12"
import pkg_resources
try:
from packaging import version
Expand Down Expand Up @@ -74,7 +75,20 @@ localrules:


##### RULES #####

def _mutect_get_capspace(wildcards):
CFG = config["lcr-modules"]["mutect2"]
try:
# Get the appropriate capture space for this sample
cap_space = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "interval_list")
cap_space = reference_files(cap_space)
this_space = cap_space
# If we are using an older version of the reference workflow, we don't need to do anything
except NameError:
this_space = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed")
# If this isn't a capture sample, we don't have a capture space, so return nothing
if wildcards.seq_type != "capture":
this_space = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.bed")
return this_space

# Symlinks the input files into the module results directory (under '00-inputs/')
rule _mutect2_input_bam:
Expand All @@ -97,11 +111,27 @@ rule _mutect2_dummy_positions:
# Symlink chromosomes used for parallelization
checkpoint _mutect2_input_chrs:
input:
chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt")
candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output),
chrs = reference_files("genomes/{genome_build}/genome_fasta/main_chromosomes.txt"),
capture_arg = _mutect_get_capspace
output:
chrs = CFG["dirs"]["inputs"] + "chroms/{genome_build}/main_chromosomes.txt"
chrs = CFG["dirs"]["inputs"] + "chroms/{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/mutated_chromosomes.txt"
run:
op.absolute_symlink(input.chrs, output.chrs)
# obtain list of main chromosomes
main_chrs = pd.read_csv(input.chrs, comment='#', sep='\t', header=None)
main_chrs = main_chrs.iloc[:, 0].astype(str).unique().tolist()
#obtain list of chromosomes in candidate positions
candidate_chrs = pd.read_csv(input.candidate_positions, comment='#', sep='\t')
candidate_chrs = candidate_chrs.iloc[:, 0].astype(str).unique().tolist()
# obtain list of chromosomes in the capture space
interval_chrs = pd.read_csv(input.capture_arg, comment='@', sep='\t')
interval_chrs = interval_chrs.iloc[:, 0].astype(str).unique().tolist()
# intersect the three lists to obtain chromosomes present in all
intersect_chrs = list(set(main_chrs) & set(candidate_chrs) & set(interval_chrs))
# convert list to single-column df
intersect_chrs = pd.DataFrame(intersect_chrs).sort_values(0)
# write out the file with mutated chromosomes
intersect_chrs.to_csv(output.chrs, index=False, header=False)


# Retrieves from SM tag from BAM and writes to file
Expand All @@ -123,13 +153,13 @@ rule _mutect2_get_sm:
# The first function loads the wildcard-containing file path and additional args from the config.
# The second replaces wildcards with those used in the rule.
def _mutect2_get_interval_cli_arg(
vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"],
vcf_in = config["lcr-modules"]["mutect2"]["inputs"]["candidate_positions"],
interval_arg_in = config["lcr-modules"]["mutect2"]["options"]["mutect2_interval_rules"]
):
def _mutect2_get_interval_cli_custom(wildcards, input):
if vcf_in:
param = f"-L {input.candidate_positions} {interval_arg_in}"
else:
else:
param = ""
return param
return _mutect2_get_interval_cli_custom
Expand All @@ -145,7 +175,8 @@ rule _mutect2_run_matched_unmatched:
gnomad = reference_files("genomes/{genome_build}/variation/af-only-gnomad.{genome_build}.vcf.gz"),
normal_sm = CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{normal_id}_sm.txt",
pon = reference_files("genomes/{genome_build}/gatk/mutect2_pon.{genome_build}.vcf.gz"),
candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output)
candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output),
capture_arg = _mutect_get_capspace
output:
vcf = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz"),
tbi = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz.tbi"),
Expand All @@ -172,7 +203,7 @@ rule _mutect2_run_matched_unmatched:
-I {input.tumour_bam} -I {input.normal_bam}
-R {input.fasta} -normal "$(cat {input.normal_sm})" -O {output.vcf}
--germline-resource {input.gnomad}
-L {wildcards.chrom} {params.interval_arg}
-L {wildcards.chrom} {params.interval_arg} -L {input.capture_arg}
-pon {input.pon} --f1r2-tar-gz {output.f1r2}
> {log.stdout} 2> {log.stderr}
""")
Expand All @@ -186,7 +217,8 @@ rule _mutect2_run_no_normal:
dict = reference_files("genomes/{genome_build}/genome_fasta/genome.dict"),
gnomad = reference_files("genomes/{genome_build}/variation/af-only-gnomad.{genome_build}.vcf.gz"),
pon = reference_files("genomes/{genome_build}/gatk/mutect2_pon.{genome_build}.vcf.gz"),
candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output)
candidate_positions = CFG["inputs"]["candidate_positions"] if CFG["inputs"]["candidate_positions"] else str(rules._mutect2_dummy_positions.output),
capture_arg = _mutect_get_capspace
output:
vcf = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz"),
tbi = temp(CFG["dirs"]["mutect2"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/chromosomes/{chrom}.output.vcf.gz.tbi"),
Expand All @@ -200,7 +232,7 @@ rule _mutect2_run_no_normal:
params:
mem_mb = lambda wildcards, resources: int(resources.mem_mb * 0.8),
opts = CFG["options"]["mutect2_run"],
interval_arg = _mutect2_get_interval_cli_arg()
interval_arg = _mutect2_get_interval_cli_arg()
conda:
CFG["conda_envs"]["gatk"]
threads:
Expand All @@ -212,7 +244,7 @@ rule _mutect2_run_no_normal:
gatk Mutect2 --java-options "-Xmx{params.mem_mb}m"
{params.opts} -I {input.tumour_bam} -R {input.fasta}
-O {output.vcf} --germline-resource {input.gnomad}
-L {wildcards.chrom} {params.interval_arg}
-L {wildcards.chrom} {params.interval_arg} -L {input.capture_arg}
-pon {input.pon} --f1r2-tar-gz {output.f1r2}
> {log.stdout} 2> {log.stderr}
""")
Expand Down
22 changes: 18 additions & 4 deletions modules/sage/1.0/sage.smk
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import oncopipe as op

# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe
min_oncopipe_version="1.0.11"
min_oncopipe_version="1.0.12"
import pkg_resources
try:
from packaging import version
Expand Down Expand Up @@ -121,15 +121,29 @@ def get_chromosomes(wildcards):
chromosomes= ",".join(chromosomes)
return chromosomes

def _sage_get_capspace(wildcards):
CFG = config["lcr-modules"]["sage"]
try:
# Get the appropriate capture space for this sample
this_bed = op.get_capture_space(CFG, wildcards.tumour_id, wildcards.genome_build, wildcards.seq_type, "bed.gz")
this_bed = reference_files(this_bed)
except NameError:
# If we are using an older version of the reference workflow, use the same region file as the genome sample
this_bed = rules._download_sage_references.output.panel_bed
# If this is a genome sample, return a BED file listing all chromosomes
if wildcards.seq_type != "capture":
this_bed = rules._download_sage_references.output.panel_bed
return this_bed

# Variant calling rule
rule _run_sage:
input:
tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam",
normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam",
fasta = str(rules._input_references.output.genome_fa),
hotspots = str(rules._download_sage_references.output.hotspots),
panel_bed = str(rules._download_sage_references.output.panel_bed),
high_conf_bed = str(rules._download_sage_references.output.high_conf_bed)
hotspots = rules._download_sage_references.output.hotspots,
high_conf_bed = str(rules._download_sage_references.output.high_conf_bed),
panel_bed = _sage_get_capspace
output:
vcf = temp(CFG["dirs"]["sage"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.vcf"),
vcf_gz = temp(CFG["dirs"]["sage"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}--{pair_status}/{tumour_id}--{normal_id}--{pair_status}.vcf.gz")
Expand Down
Loading

0 comments on commit 82439d6

Please sign in to comment.