-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #171 from LCR-BCCRC/battenberg_addition
Battenberg module updates
- Loading branch information
Showing
10 changed files
with
804 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,297 @@ | ||
#!/usr/bin/env snakemake | ||
|
||
|
||
##### ATTRIBUTION ##### | ||
|
||
|
||
# Original Author: Ryan Morin | ||
# Module Author: Ryan Morin | ||
# Contributors: N/A | ||
|
||
##### SETUP ##### | ||
|
||
|
||
# Import package with useful functions for developing analysis modules | ||
import oncopipe as op | ||
import glob | ||
|
||
# Check that the oncopipe dependency is up-to-date. Add all the following lines to any module that uses new features in oncopipe | ||
min_oncopipe_version="1.0.11" | ||
import pkg_resources | ||
try: | ||
from packaging import version | ||
except ModuleNotFoundError: | ||
sys.exit("The packaging module dependency is missing. Please install it ('pip install packaging') and ensure you are using the most up-to-date oncopipe version") | ||
|
||
# To avoid this we need to add the "packaging" module as a dependency for LCR-modules or oncopipe | ||
|
||
current_version = pkg_resources.get_distribution("oncopipe").version | ||
if version.parse(current_version) < version.parse(min_oncopipe_version): | ||
logger.warning( | ||
'\x1b[0;31;40m' + f'ERROR: oncopipe version installed: {current_version}' | ||
"\n" f"ERROR: This module requires oncopipe version >= {min_oncopipe_version}. Please update oncopipe in your environment" + '\x1b[0m' | ||
) | ||
sys.exit("Instructions for updating to the current version of oncopipe are available at https://lcr-modules.readthedocs.io/en/latest/ (use option 2)") | ||
|
||
# End of dependency checking section | ||
|
||
# Setup module and store module-specific configuration in `CFG` | ||
# `CFG` is a shortcut to `config["lcr-modules"]["battenberg"]` | ||
CFG = op.setup_module( | ||
name = "battenberg", | ||
version = "1.2", | ||
subdirectories = ["inputs", "infer_sex","battenberg", "outputs"], | ||
) | ||
|
||
#set variable for prepending to PATH based on config | ||
SCRIPT_PATH = CFG['inputs']['src_dir'] | ||
#this is used in place of the shell.prefix() because that was not working consistently. This is not ideal. | ||
|
||
#this preserves the variable when using lambda functions | ||
_battenberg_CFG = CFG | ||
|
||
# Define rules to be run locally when using a compute cluster | ||
localrules: | ||
_battenberg_all | ||
|
||
VERSION_MAP = { | ||
"hg19": "grch37", | ||
"grch37": "grch37", | ||
"hs37d5": "grch37", | ||
"hg38": "hg38", | ||
"grch38": "hg38", | ||
"grch38-legacy": "hg38" | ||
|
||
} | ||
|
||
##### RULES ##### | ||
|
||
# Downloads the reference files into the module results directory (under '00-inputs/') from https://www.bcgsc.ca/downloads/morinlab/reference/ . | ||
rule _battenberg_get_reference: | ||
output: | ||
battenberg_impute = directory(CFG["dirs"]["inputs"] + "reference/{genome_build}/battenberg_impute_v3"), | ||
impute_info = CFG["dirs"]["inputs"] + "reference/{genome_build}/impute_info.txt", | ||
probloci = CFG["dirs"]["inputs"] + "reference/{genome_build}/probloci.txt.gz", | ||
battenberg_wgs_replic_correction = directory(CFG["dirs"]["inputs"] + "reference/{genome_build}/battenberg_wgs_replic_correction_1000g_v3"), | ||
battenberg_gc_correction = directory(CFG["dirs"]["inputs"] + "reference/{genome_build}/battenberg_wgs_gc_correction_1000g_v3"), | ||
genomesloci = directory(CFG["dirs"]["inputs"] + "reference/{genome_build}/battenberg_1000genomesloci2012_v3") | ||
params: | ||
url = "https://www.bcgsc.ca/downloads/morinlab/reference", | ||
alt_build = lambda w: VERSION_MAP[w.genome_build], | ||
folder = CFG["dirs"]["inputs"] + "reference/{genome_build}", | ||
build = "{genome_build}", | ||
PATH = CFG['inputs']['src_dir'] | ||
resources: | ||
**CFG["resources"]["reference"] | ||
threads: | ||
CFG["threads"]["reference"] | ||
shell: | ||
op.as_one_line(""" | ||
wget -qO- {params.url}/battenberg_impute_{params.alt_build}.tar.gz | | ||
tar -xvz > {output.battenberg_impute} -C {params.folder} | ||
&& | ||
wget -qO- {params.url}/battenberg_{params.alt_build}_gc_correction.tar.gz | | ||
tar -xvz > {output.battenberg_gc_correction} -C {params.folder} | ||
&& | ||
wget -qO- {params.url}/battenberg_1000genomesloci_{params.alt_build}.tar.gz | | ||
tar -xvz > {output.genomesloci} -C {params.folder} | ||
&& | ||
wget -O {output.impute_info} {params.url}/impute_info_{params.alt_build}.txt | ||
&& | ||
python {params.PATH}/reference_correction.py {params.build} | ||
&& | ||
wget -qO- {params.url}/battenberg_{params.alt_build}_replic_correction.tar.gz | | ||
tar -xvz > {output.battenberg_wgs_replic_correction} -C {params.folder} | ||
&& | ||
wget -O {output.probloci} {params.url}/probloci_{params.alt_build}.txt.gz | ||
""") | ||
|
||
|
||
# Symlinks the input files into the module results directory (under '00-inputs/') | ||
rule _battenberg_input_bam: | ||
input: | ||
bam = CFG["inputs"]["sample_bam"] | ||
output: | ||
bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam", | ||
bai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.bai", | ||
crai = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{sample_id}.bam.crai" | ||
group: "setup_run" | ||
run: | ||
op.absolute_symlink(input.bam, output.bam) | ||
op.absolute_symlink(input.bam + ".bai", output.bai) | ||
op.absolute_symlink(input.bam + ".bai", output.crai) | ||
|
||
# Installs the Battenberg R dependencies and associated software (impute2, alleleCounter) | ||
# Currently I think this rule has to be run twice for it to work properly because the conda environment is created here. | ||
# I am open to suggestions for how to get around this. | ||
rule _install_battenberg: | ||
output: | ||
complete = CFG["dirs"]["inputs"] + "battenberg_dependencies_installed.success" | ||
conda: | ||
CFG["conda_envs"]["battenberg"] | ||
log: | ||
input = CFG["logs"]["inputs"] + "input.log" | ||
shell: | ||
""" | ||
R -q -e 'devtools::install_github("Crick-CancerGenomics/ascat/ASCAT")' >> {log.input} && ##move some of this to config? | ||
R -q -e 'devtools::install_github("morinlab/battenberg")' >> {log.input} && ##move some of this to config? | ||
touch {output.complete}""" | ||
|
||
# this process is very fast on bam files and painfully slow on cram files. | ||
# The result of calc_sex_status.sh is stored in a file to avoid having to rerun it unnecessarily | ||
rule _infer_patient_sex: | ||
input: | ||
normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", | ||
fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa") | ||
output: sex_result = CFG["dirs"]["infer_sex"] + "{seq_type}--{genome_build}/{normal_id}.sex" | ||
resources: | ||
**CFG["resources"]["infer_sex"] | ||
log: | ||
stderr = CFG["logs"]["infer_sex"] + "{seq_type}--{genome_build}/{normal_id}_infer_sex_stderr.log" | ||
conda: | ||
CFG["conda_envs"]["samtools"] | ||
group: "setup_run" | ||
threads: 8 | ||
shell: | ||
op.as_one_line(""" | ||
PATH={SCRIPT_PATH}:$PATH; | ||
echo "running {rule} for {wildcards.normal_id} on $(hostname) at $(date)" > {log.stderr} ; | ||
calc_sex_status.sh {input.normal_bam} {input.fasta} {wildcards.normal_id} > {output.sex_result} 2>> {log.stderr} && | ||
echo "DONE running {rule} for {wildcards.normal_id} on $(hostname) at $(date)" >> {log.stderr} | ||
""") | ||
|
||
|
||
# This rule runs the entire Battenberg pipeline. Eventually we may want to set this rule up to allow re-starting | ||
# of partially completed jobs (e.g. if they run out of RAM and are killed by the cluster, they can automatically retry) | ||
rule _run_battenberg: | ||
input: | ||
tumour_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{tumour_id}.bam", | ||
normal_bam = CFG["dirs"]["inputs"] + "bam/{seq_type}--{genome_build}/{normal_id}.bam", | ||
installed = CFG["dirs"]["inputs"] + "battenberg_dependencies_installed.success", | ||
sex_result = CFG["dirs"]["infer_sex"] + "{seq_type}--{genome_build}/{normal_id}.sex", | ||
fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), | ||
impute_info = CFG["dirs"]["inputs"] + "reference/{genome_build}/impute_info.txt" | ||
|
||
output: | ||
refit=CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_refit_suggestion.txt", | ||
sub=CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_subclones.txt", | ||
ac=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_alleleCounts.tab"), | ||
mb=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_mutantBAF.tab"), | ||
mlrg=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_mutantLogR_gcCorrected.tab"), | ||
mlr=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_mutantLogR.tab"), | ||
nlr=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_normalLogR.tab"), | ||
nb=temp(CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_normalBAF.tab"), | ||
cp=CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_cellularity_ploidy.txt" | ||
log: | ||
stdout = CFG["logs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_battenberg.stdout.log", | ||
stderr = CFG["logs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_battenberg.stderr.log" | ||
params: | ||
fasta = reference_files("genomes/{genome_build}/genome_fasta/genome.fa"), | ||
script = CFG["inputs"]["battenberg_script"], | ||
out_dir = CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}", | ||
ref = CFG["dirs"]["inputs"] + "reference/{genome_build}" | ||
conda: | ||
CFG["conda_envs"]["battenberg"] | ||
resources: | ||
**CFG["resources"]["battenberg"] | ||
threads: | ||
CFG["threads"]["battenberg"] | ||
shell: | ||
op.as_one_line(""" | ||
if [[ $(head -c 4 {params.fasta}) == ">chr" ]]; then chr_prefixed='true'; else chr_prefixed='false'; fi; | ||
echo "$chr_prefixed" | ||
echo "running {rule} for {wildcards.tumour_id}--{wildcards.normal_id} on $(hostname) at $(date)" > {log.stdout}; | ||
sex=$(cut -f 4 {input.sex_result}| tail -n 1); | ||
echo "setting sex as $sex"; | ||
Rscript {params.script} -t {wildcards.tumour_id} | ||
-n {wildcards.normal_id} --tb {input.tumour_bam} --nb {input.normal_bam} -f {input.fasta} --reference {params.ref} | ||
-o {params.out_dir} --chr_prefixed_genome $chr_prefixed --sex $sex --cpu {threads} >> {log.stdout} 2>> {log.stderr} && | ||
echo "DONE {rule} for {wildcards.tumour_id}--{wildcards.normal_id} on $(hostname) at $(date)" >> {log.stdout}; | ||
""") | ||
|
||
|
||
# Convert the subclones.txt (best fit) to igv-friendly SEG files. | ||
rule _battenberg_to_igv_seg: | ||
input: | ||
sub = rules._run_battenberg.output.sub, | ||
cnv2igv = CFG["inputs"]["cnv2igv"] | ||
output: | ||
seg = CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_subclones.igv.seg" | ||
log: | ||
stderr = CFG["logs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_seg2igv.stderr.log" | ||
threads: 1 | ||
group: "post_process" | ||
shell: | ||
op.as_one_line(""" | ||
echo "running {rule} for {wildcards.tumour_id}--{wildcards.normal_id} on $(hostname) at $(date)" > {log.stderr}; | ||
python {input.cnv2igv} --mode battenberg --sample {wildcards.tumour_id} | ||
{input.sub} > {output.seg} 2>> {log.stderr} | ||
""") | ||
|
||
|
||
#due to the large number of files (several per chromosome) that are not explicit outputs, do some glob-based cleaning in the output directory | ||
rule _battenberg_cleanup: | ||
input: | ||
rules._battenberg_to_igv_seg.output.seg | ||
output: | ||
complete = CFG["dirs"]["battenberg"] + "{seq_type}--{genome_build}/{tumour_id}--{normal_id}/{tumour_id}_cleanup_complete.txt" | ||
group: "post_process" | ||
shell: | ||
op.as_one_line(""" | ||
d=$(dirname {output}); | ||
rm -f $d/*impute_input* && | ||
rm -f $d/*alleleFrequencies* && | ||
rm -f $d/*aplotype* && | ||
rm -f $d/*BAFsegmented* && | ||
touch {output.complete} | ||
""") | ||
|
||
# Symlinks the final output files into the module results directory (under '99-outputs/') | ||
# All plots generated by Battenberg are symlinked using a glob for convenience | ||
|
||
rule _battenberg_output_seg: | ||
input: | ||
seg = rules._battenberg_to_igv_seg.output.seg, | ||
sub = rules._run_battenberg.output.sub, | ||
cp = rules._run_battenberg.output.cp | ||
output: | ||
seg = CFG["dirs"]["outputs"] + "seg/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.igv.seg", | ||
sub = CFG["dirs"]["outputs"] + "txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_subclones.txt", | ||
cp = CFG["dirs"]["outputs"] + "txt/{seq_type}--{genome_build}/{tumour_id}--{normal_id}_cellularity_ploidy.txt" | ||
params: | ||
batt_dir = CFG["dirs"]["battenberg"] + "/{seq_type}--{genome_build}/{tumour_id}--{normal_id}", | ||
png_dir = CFG["dirs"]["outputs"] + "png/{seq_type}--{genome_build}" | ||
group: "post_process" | ||
run: | ||
plots = glob.glob(params.batt_dir + "/*.png") | ||
for png in plots: | ||
bn = os.path.basename(png) | ||
op.relative_symlink(png, params.png_dir + "/" + bn,in_module=True) | ||
op.relative_symlink(input.seg, output.seg,in_module=True) | ||
op.relative_symlink(input.sub, output.sub,in_module=True) | ||
op.relative_symlink(input.cp, output.cp,in_module=True) | ||
|
||
# Generates the target sentinels for each run, which generate the symlinks | ||
rule _battenberg_all: | ||
input: | ||
expand( | ||
[ | ||
rules._run_battenberg.output.sub, | ||
rules._battenberg_output_seg.output.seg, | ||
rules._battenberg_cleanup.output.complete | ||
], | ||
zip, # Run expand() with zip(), not product() | ||
seq_type=CFG["runs"]["tumour_seq_type"], | ||
genome_build=CFG["runs"]["tumour_genome_build"], | ||
tumour_id=CFG["runs"]["tumour_sample_id"], | ||
normal_id=CFG["runs"]["normal_sample_id"], | ||
pair_status=CFG["runs"]["pair_status"]) | ||
|
||
|
||
##### CLEANUP ##### | ||
|
||
|
||
# Perform some clean-up tasks, including storing the module-specific | ||
# configuration on disk and deleting the `CFG` variable | ||
op.cleanup_module(CFG) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
lcr-modules: | ||
battenberg: | ||
inputs: | ||
# Available wildcards: {seq_type} {genome_build} {sample_id} | ||
sample_bam: "__UPDATE__" | ||
battenberg_script: "{MODSDIR}/src/battenberg_wgs_hg38.R" | ||
cnv2igv: "{SCRIPTSDIR}/cnv2igv/1.4/cnv2igv.py" | ||
src_dir: "{MODSDIR}/src/" | ||
|
||
scratch_subdirectories: [] | ||
|
||
conda_envs: | ||
battenberg: "{MODSDIR}/envs/battenberg-1.1.yaml" | ||
wget: "{MODSDIR}/envs/wget-1.20.1.yaml" | ||
samtools: "{MODSDIR}/envs/samtools-1.9.yaml" | ||
|
||
resources: | ||
battenberg: | ||
mem_mb: 200000 | ||
bam: 1 | ||
infer_sex: | ||
mem_mb: 20000 | ||
bam: 1 | ||
reference: | ||
mem_mb: 8000 | ||
|
||
threads: | ||
battenberg: 24 | ||
reference: 2 | ||
#ideal for processing all chromosomes at once | ||
|
||
pairing_config: | ||
genome: | ||
run_paired_tumours: True | ||
run_unpaired_tumours_with: null | ||
run_paired_tumours_as_unpaired: False | ||
capture: | ||
run_paired_tumours: True | ||
run_unpaired_tumours_with: null | ||
run_paired_tumours_as_unpaired: False |
Oops, something went wrong.