From 25e73d4c79846a8f9150bdb59cd36d508900f0a9 Mon Sep 17 00:00:00 2001
From: Eric Blanc
Date: Mon, 4 Nov 2024 11:54:27 +0100
Subject: [PATCH 01/46] refactor: use abstract cnvkit wrapper

---
 .../wrappers/cnvkit/call/wrapper.py           | 120 ++++++++----------
 .../wrappers/cnvkit/fix/wrapper.py            |  76 +++--------
 .../wrappers/cnvkit/segment/wrapper.py        |  86 ++++---------
 3 files changed, 95 insertions(+), 187 deletions(-)

diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
index 987a502ef..c77d8863b 100644
--- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
@@ -1,72 +1,60 @@
 # -*- coding: utf-8 -*-
 """Wrapper for cnvkit.py call"""
-from snakemake.shell import shell
-
-__author__ = "Manuel Holtgrewe"
-__email__ = "manuel.holtgrewe@bih-charite.de"
-
-step = snakemake.config["pipeline_step"]["name"]
-config = snakemake.config["step_config"][step]["cnvkit"]
-
-center = config["center"]
-if center:
-    if center in set("mean", "median", "mode", "biweight"):
-        center = " --center " + center
-    else:
-        center = " --center-at" + center
-
-gender = " --gender {}".format(config["gender"]) if config["gender"] else ""
-male = " --male-reference" if config["male_reference"] else ""
-
-shell(
-    r"""
-# Also pipe everything to log file
-if [[ -n "{snakemake.log.log}" ]]; then
-    if [[ "$(set +e; tty; set -e)" != "" ]]; then
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        exec &> >(tee -a "{snakemake.log.log}" >&2)
-    else
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        echo "No tty, logging disabled" >"{snakemake.log.log}"
-    fi
-fi
-
-# Write out information about conda installation.
-conda list >{snakemake.log.conda_list}
-conda info >{snakemake.log.conda_info}
-md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
-md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
-
-set -x
-
-# -----------------------------------------------------------------------------
-
+import re
+
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
+
+class CnvkitWrapperCall(CnvkitWrapper):
+    PURITY_PATTERN = re.compile(r"^Purity: +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) *$")
+    PLOIDY_PATTERN = re.compile(r"^Ploidy: +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) *$")
+
+    def preamble(self):
+        self.purity = self.snakemake.params.purity if "purity" in self.snakemake.params else None
+        self.ploidy = self.snakemake.params.ploidy if "ploidy" in self.snakemake.params else None
+        if "purity" in self.snakemake.input:
+            with open(self.snakemake.input.purity, "rt") as f:
+                for line in f:
+                    m = CnvkitWrapperCall.PURITY_PATTERN.match(line.strip())
+                    if m:
+                        self.purity = float(m.group(1))
+                    else:
+                        m = CnvkitWrapperCall.PLOIDY_PATTERN.match(line.strip())
+                        if m:
+                            self.ploidy = float(m.group(1))
+
+        self.command = self.command.format(
+            purity=f"--purity {self.purity}" if self.purity is not None else "",
+            ploidy=f"--ploidy {self.ploidy}" if self.ploidy is not None else "",
+        )
+
+if "variants" in snakemake.input:
+    variants = r"""
+    --vcf {snakemake.input.variants} \
+    {snakemake.params.sample_id} {snakemake.params.normal_id} \
+    {snakemake.params.min_variant_depth} {snakemake.params.zygocity_freq}
+    """.format(
+        snakemake=snakemake,
+    )
+else:
+    variants = ""
+
+cmd = r"""
 cnvkit.py call \
-    --output {snakemake.output.calls} \
-    --method {config[calling_method]} \
-    --thresholds={config[call_thresholds]} \
-    $(if [[ "{config[filter]}" ]]; then \
-        echo --filter {config[filter]}
-    fi) \
-    {center} {gender} {male} \
-    --ploidy {config[ploidy]} \
-    $(if [[ {config[purity]} -gt 0 ]]; then \
-        echo --purity {config[purity]}
-    fi) \
-    {snakemake.input}
-
-d=$(dirname "{snakemake.output.calls}")
-pushd $d
-fn=$(basename "{snakemake.output.calls}")
-md5sum $fn > $fn.md5
-popd
-"""
+    -o {snakemake.output.calls} \
+    --method {snakemake.params.method} --thresholds={snakemake.params.thresholds} \
+    --filter {snakemake.params.filter} \
+    {center} \
+    {drop_low_coverage} \
+    {sample_sex} {male_reference} \
+    {variants} \
+    {{purity}} {{ploidy}} \
+    {snakemake.input.segments}
+""".format(
+    snakemake=snakemake,
+    center=f"--center-at {snakemake.params.center_at}" if "center_at" in snakemake.params else f"--center {snakemake.params.center}",
+    drop_low_coverage="--drop-low-coverage" if snakemake.params.drop_low_coverage else "",
+    sample_sex=f"--sample-sex {snakemake.params.sample_sex}" if "sample_sex" in snakemake.params else "",
+    male_reference="--male-reference" if snakemake.params.male_reference else "",
+    variants=variants,
 )
-# Compute MD5 sums of logs.
-shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) +CnvkitWrapperCall(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py index 537ff5f77..97387dbcb 100644 --- a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py @@ -1,68 +1,24 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py fix""" -from snakemake.shell import shell +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step]["cnvkit"] - -if "ref" in snakemake.input.keys(): - ref = snakemake.input.target -elif "path_panel_of_normals" in config.keys(): - ref = config["path_panel_of_normals"] -else: - raise Exception("Unsupported naming") - -gender = " --gender {}".format(config["gender"]) if config["gender"] else "" -male = " --male-reference" if config["male_reference"] else "" -no_gc = " --no-gc" if not config["gc_correction"] else "" -no_edge = " --no-edge" if not config["edge_correction"] else "" -no_rmask = " --no-rmask" if not config["rmask_correction"] else "" - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. -conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} - -set -x - -# ----------------------------------------------------------------------------- +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" +cmd = r""" cnvkit.py fix \ - --output {snakemake.output.ratios} \ - {gender} {male} {no_gc} {no_edge} {no_rmask} \ - {snakemake.input.target} \ - {snakemake.input.antitarget} \ - {ref} - -d=$(dirname "{snakemake.output.ratios}") -pushd $d -fn=$(basename "{snakemake.output.ratios}") -md5sum $fn > $fn.md5 -popd -""" + -o {snakemake.output.coverage} \ + {cluster} {snakemake.params.sample_id} \ + {no_gc} {no_edge} {no_rmask} \ + {snakemake.input.target} {antitarget} {snakemake.input.reference} +""".format( + snakemake=snakemake, + cluster="--cluster" if snakemake.params.cluster else "", + no_gc="--no-gc" if snakemake.params.no_gc else "", + no_edge="--no-edge" if snakemake.params.no_edge else "", + no_rmask="--no-rmask" if snakemake.params.no_rmask else "", + antitarget=f"{snakemake.input.antitarget}" if "antitarget" in snakemake.input else "", ) -# Compute MD5 sums of logs. 
-shell(
-    r"""
-md5sum {snakemake.log.log} >{snakemake.log.log_md5}
-"""
-)
+CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
index 5a02054e2..648e14a2a 100644
--- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
@@ -1,67 +1,31 @@
 # -*- coding: utf-8 -*-
 """Wrapper for cnvkit.py segment"""
-from snakemake.shell import shell
-
-__author__ = "Manuel Holtgrewe"
-__email__ = "manuel.holtgrewe@bih-charite.de"
-
-step = snakemake.config["pipeline_step"]["name"]
-config = snakemake.config["step_config"][step]["cnvkit"]
-
-method = config["segmentation_method"]
-if method == "cbs" and config["smooth_cbs"]:
-    method += " --smooth-cbs"
-
-if float(config["segmentation_threshold"]) > 0:
-    threshold = " --threshold " + str(config["segmentation_threshold"])
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
+
+if "variants" in snakemake.input:
+    variants = r"""
+    --vcf {snakemake.input.variants} \
+    {snakemake.params.sample_id} {snakemake.params.normal_id} \
+    {snakemake.params.min_variant_depth} {snakemake.params.zygocity_freq}
+    """.format(
+        snakemake=snakemake,
+    )
 else:
-    threshold = ""
-
-shell(
-    r"""
-# Also pipe everything to log file
-if [[ -n "{snakemake.log.log}" ]]; then
-    if [[ "$(set +e; tty; set -e)" != "" ]]; then
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        exec &> >(tee -a "{snakemake.log.log}" >&2)
-    else
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        echo "No tty, logging disabled" >"{snakemake.log.log}"
-    fi
-fi
-
-# Write out information about conda installation.
-conda list >{snakemake.log.conda_list}
-conda info >{snakemake.log.conda_info}
-md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
-md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
-
-set -x
-
-# -----------------------------------------------------------------------------
-
-cnvkit.py segment \
-    --output {snakemake.output.segments} \
-    --method {method} \
-    $(if [[ "{config[drop_low_coverage]}" = "True" ]]; then \
-        echo --drop-low-coverage
-    fi) \
-    {threshold} \
-    --drop-outliers {config[drop_outliers]} \
-    {snakemake.input}
-
-d=$(dirname "{snakemake.output.segments}")
-pushd $d
-fn=$(basename "{snakemake.output.segments}")
-md5sum $fn > $fn.md5
-popd
-"""
+    variants = ""
+
+cmd = r"""
+cnvkit.py segment --processes {snakemake.params.processes} \
+    -o {snakemake.output.segments} --dataframe {snakemake.output.dataframe} \
+    --method {snakemake.params.method} --threshold {snakemake.params.threshold} {smooth_cbs} \
+    {drop_low_coverage} --drop-outliers {snakemake.params.drop_outliers} \
+    {variants} \
+    {snakemake.input.coverage}
+""".format(
+    snakemake=snakemake,
+    smooth_cbs="--smooth-cbs" if snakemake.params.smooth_cbs else "",
+    drop_low_coverage="--drop-low-coverage" if snakemake.params.drop_low_coverage else "",
+    variants=variants,
 )
-# Compute MD5 sums of logs.
-shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) +CnvkitWrapper(snakemake, cmd).run() From e2dac8f656d7dfe75a2fb927ccdfd625d584d20d Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 11:56:17 +0100 Subject: [PATCH 02/46] refactor: mbcs vs somatic combined tool naming --- .../workflows/ngs_mapping/model.py | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py index 30968a324..56f00a7bc 100644 --- a/snappy_pipeline/workflows/ngs_mapping/model.py +++ b/snappy_pipeline/workflows/ngs_mapping/model.py @@ -1,4 +1,5 @@ import enum +import itertools import os from enum import Enum from typing import Annotated @@ -21,8 +22,24 @@ class RnaMapper(Enum): STAR = "star" +class MetaTool(Enum): + MBCS = "mbcs" + + +CombinedDnaTool = enum.Enum( + "CombinedDnaTool", + { + (name, member.value) + for name, member in itertools.chain( + DnaMapper.__members__.items(), MetaTool.__members__.items() + ) + }, +) +"""DNA mappers or (mbcs) meta-tool""" + + class Tools(SnappyModel): - dna: Annotated[list[DnaMapper], EnumField(DnaMapper, [])] + dna: Annotated[list[CombinedDnaTool], EnumField(CombinedDnaTool, [])] """Required if DNA analysis; otherwise, leave empty.""" rna: Annotated[list[RnaMapper], EnumField(RnaMapper, [])] @@ -133,17 +150,6 @@ class BarcodeTool(Enum): AGENT = "agent" -class Somatic(SnappyModel): - mapping_tool: DnaMapper - """Either bwa of bwa_mem2. The indices & other parameters are taken from mapper config""" - - barcode_tool: BarcodeTool = BarcodeTool.AGENT - """Only agent currently implemented""" - - use_barcodes: bool = False - recalibrate: bool = True - - class Bqsr(SnappyModel): common_variants: str """Common germline variants (see /fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK)""" @@ -261,8 +267,13 @@ class Minimap2(SnappyModel): class Mbcs(SnappyModel): mapping_tool: DnaMapper - use_barcodes: bool - recalibrate: bool + """Either bwa of bwa_mem2. 
The indices & other parameters are taken from mapper config""" + + barcode_tool: BarcodeTool = BarcodeTool.AGENT + """Only agent currently implemented""" + + use_barcodes: bool = False + recalibrate: bool = True class NgsMapping(SnappyStepModel): @@ -287,7 +298,7 @@ class NgsMapping(SnappyStepModel): bwa_mem2: BwaMem2 | None = None """Configuration for BWA-MEM2""" - somatic: Somatic | None = None + mbcs: Mbcs | None = None """ Configuration for somatic ngs_calling (separate read groups, molecular barcodes & base quality recalibration) @@ -304,8 +315,6 @@ class NgsMapping(SnappyStepModel): minimap2: Minimap2 | None = None - mbcs: Mbcs | None = None - @model_validator(mode="after") def ensure_tools_are_configured(self): for data_type in ("dna", "rna", "dna_long"): From 82aff6317e9044587cf221bfbaa496addf7fb4b7 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 11:58:11 +0100 Subject: [PATCH 03/46] featu: Adding the somatic cnv calling step --- snappy_pipeline/apps/snappy_snake.py | 2 ++ snappy_pipeline/workflow_model.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py index f5f8bf3c2..9e99f28f0 100644 --- a/snappy_pipeline/apps/snappy_snake.py +++ b/snappy_pipeline/apps/snappy_snake.py @@ -31,6 +31,7 @@ ngs_mapping, panel_of_normals, repeat_expansion, + somatic_cnv_calling, somatic_cnv_checking, somatic_gene_fusion_calling, somatic_hla_loh_calling, @@ -82,6 +83,7 @@ "ngs_data_qc": ngs_data_qc, "panel_of_normals": panel_of_normals, "repeat_analysis": repeat_expansion, + "somatic_cnv_calling": somatic_cnv_calling, "somatic_cnv_checking": somatic_cnv_checking, "somatic_gene_fusion_calling": somatic_gene_fusion_calling, "somatic_hla_loh_calling": somatic_hla_loh_calling, diff --git a/snappy_pipeline/workflow_model.py b/snappy_pipeline/workflow_model.py index aeda7682e..498bb9f93 100644 --- a/snappy_pipeline/workflow_model.py +++ b/snappy_pipeline/workflow_model.py @@ -21,6 +21,7 @@ from snappy_pipeline.workflows.ngs_mapping.model import NgsMapping from snappy_pipeline.workflows.panel_of_normals.model import PanelOfNormals from snappy_pipeline.workflows.repeat_expansion.model import RepeatExpansion +from snappy_pipeline.workflows.somatic_cnv_calling.model import SomaticCnvCalling from snappy_pipeline.workflows.somatic_cnv_checking.model import SomaticCnvChecking from snappy_pipeline.workflows.somatic_gene_fusion_calling.model import SomaticGeneFusionCalling from snappy_pipeline.workflows.somatic_hla_loh_calling.model import SomaticHlaLohCalling @@ -109,6 +110,7 @@ class StepConfig(TypedDict, total=False): ngs_mapping: NgsMapping panel_of_normals: PanelOfNormals repeat_expansion: RepeatExpansion + somatic_cnv_calling: SomaticCnvCalling somatic_cnv_checking: SomaticCnvChecking somatic_gene_fusion_calling: SomaticGeneFusionCalling somatic_hla_loh_calling: SomaticHlaLohCalling From 4534cc8bcea2551c238ee7b2d3370107a352b6aa Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 12:00:24 +0100 Subject: [PATCH 04/46] refactor: parallel wrapper configuration model --- snappy_pipeline/models/__init__.py | 40 ++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/snappy_pipeline/models/__init__.py b/snappy_pipeline/models/__init__.py index 282a05ab9..438145235 100644 --- a/snappy_pipeline/models/__init__.py +++ b/snappy_pipeline/models/__init__.py @@ -79,6 +79,46 @@ def keys(self): return self.model_fields.keys() +class Parallel(SnappyModel): + num_cores: int = 2 + """number of cores to 
use locally""" + + window_length: int = 3500000 + """split input into windows of this size, each triggers a job""" + + num_jobs: int = 500 + """number of windows to process in parallel""" + + use_profile: bool = True + """use Snakemake profile for parallel processing""" + + restart_times: int = 5 + """number of times to re-launch jobs in case of failure""" + + max_jobs_per_second: int = 2 + """throttling of job creation""" + + max_status_checks_per_second: int = 10 + """throttling of status checks""" + + debug_trunc_tokens: int = 0 + """truncation to first N tokens (0 for none)""" + + keep_tmpdir: KeepTmpdir = KeepTmpdir.never + """keep temporary directory, {always, never, onerror}""" + + job_mult_memory: float = 1 + """memory multiplier""" + + job_mult_time: float = 1 + """running time multiplier""" + + merge_mult_memory: float = 1 + """memory multiplier for merging""" + + merge_mult_time: float = 1 + + # This exists to distinguish workflow step_config models from other snappy specific models # It also provides a default_config_yaml_string method that includes the step_config section # by default. From 2281092e4a984dbd33b98facbcfcd84f792d471b Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 12:02:51 +0100 Subject: [PATCH 05/46] refactor: use abstract cnvkit wrapper --- snappy_wrappers/wrappers/cnvkit/__init__.py | 0 .../wrappers/cnvkit/cnvkit_wrapper.py | 85 +++++++++++++++++++ .../wrappers/cnvkit/environment.yaml | 6 +- 3 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 snappy_wrappers/wrappers/cnvkit/__init__.py create mode 100644 snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py diff --git a/snappy_wrappers/wrappers/cnvkit/__init__.py b/snappy_wrappers/wrappers/cnvkit/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py b/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py new file mode 100644 index 000000000..a6a8accdf --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py @@ -0,0 +1,85 @@ +"""Abstract wrapper for cnvkit.py""" + +import textwrap + +from snakemake.shell import shell + +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" + + +class CnvkitWrapper: + header = r""" + # Also pipe everything to log file + if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec &> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi + fi + + compute_md5() {{ + fn=$1 + f=$(basename $fn) + d=$(dirname $fn) + pushd $d 1> /dev/null 2>&1 + md5sum $f > $f.md5 + popd 1> /dev/null 2>&1 + }} + + # Write out information about conda installation. 
+ conda list >{snakemake.log.conda_list} + conda info >{snakemake.log.conda_info} + compute_md5 {snakemake.log.conda_list} + compute_md5 {snakemake.log.conda_info} + + # Create temp directory + TMPDIR=$(mktemp -d) + + set -x + + # --------------------------------- Start command ----------------------------------------- + """ + + footer = r""" + # --------------------------------- End command ------------------------------------------- + + for fn in {snakemake.output} + do + compute_md5 $fn + done + compute_md5 {snakemake.log.sh} + """ + + md5_log = r""" + f=$(basename {log}) + d=$(dirname {log}) + pushd $d 1> /dev/null 2>&1 + md5sum $f > $f.md5 + popd 1> /dev/null 2>&1 + """ + + def __init__(self, snakemake, command) -> None: + self.snakemake = snakemake + self.command = command + + def preamble(self): + pass + + def run(self) -> None: + self.preamble() + + with open(self.snakemake.log.sh, "wt") as f: + print( + textwrap.dedent( + "\n".join((CnvkitWrapper.header, self.command, CnvkitWrapper.footer)) + ), + file=f, + ) + + shell(self.snakemake.log.sh) + + shell(CnvkitWrapper.md5_log.format(log=str(self.snakemake.log.log))) diff --git a/snappy_wrappers/wrappers/cnvkit/environment.yaml b/snappy_wrappers/wrappers/cnvkit/environment.yaml index 346a15842..2def14828 100644 --- a/snappy_wrappers/wrappers/cnvkit/environment.yaml +++ b/snappy_wrappers/wrappers/cnvkit/environment.yaml @@ -2,6 +2,6 @@ channels: - conda-forge - bioconda dependencies: - - python==3.10.14 - - cnvkit==0.9.10 - - htslib==1.19.1 + - python=3.12 + - cnvkit==0.9.8 + - htslib=1.21 From 13b676f86b9ff372a4eaaddba7d25e10dfc41544 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 12:04:52 +0100 Subject: [PATCH 06/46] refactor: cnvkit panel of normals with improved logic (WIP) --- .../workflows/panel_of_normals/Snakefile | 48 +- .../workflows/panel_of_normals/__init__.py | 461 ++++++++++++------ .../workflows/panel_of_normals/model.py | 30 +- .../wrappers/cnvkit/access/wrapper.py | 62 +-- .../wrappers/cnvkit/antitarget/wrapper.py | 78 +-- .../wrappers/cnvkit/coverage/wrapper.py | 99 +--- .../wrappers/cnvkit/reference/wrapper.py | 112 +---- .../wrappers/cnvkit/target/wrapper.py | 128 ++--- .../test_workflows_panel_of_normals.py | 253 ++++++---- 9 files changed, 637 insertions(+), 634 deletions(-) diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index 88e132382..5acd85d39 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -113,6 +113,8 @@ rule panel_of_normals_access_run: partition=wf.get_resource("access", "run", "partition"), log: **wf.get_log_file("access", "run"), + params: + **wf.get_params("access", "run"), wrapper: wf.wrapper_path("cnvkit/access") @@ -120,6 +122,42 @@ rule panel_of_normals_access_run: # Write out the normals-only results for the normals -------------------------- +rule panel_of_normals_cnvkit_access: + input: + unpack(wf.get_input_files("cnvkit", "access")), + output: + **wf.get_output_files("cnvkit", "access"), + threads: wf.get_resource("cnvkit", "access", "threads") + resources: + time=wf.get_resource("cnvkit", "access", "time"), + memory=wf.get_resource("cnvkit", "access", "memory"), + partition=wf.get_resource("cnvkit", "access", "partition"), + log: + **wf.get_log_file("cnvkit", "access"), + params: + **{"args": wf.get_params("cnvkit", "access")}, + wrapper: + wf.wrapper_path("cnvkit/access") + + +rule panel_of_normals_cnvkit_autobin: + input: + 
unpack(wf.get_input_files("cnvkit", "autobin")),
+    output:
+        **wf.get_output_files("cnvkit", "autobin"),
+    threads: wf.get_resource("cnvkit", "autobin", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "autobin", "time"),
+        memory=wf.get_resource("cnvkit", "autobin", "memory"),
+        partition=wf.get_resource("cnvkit", "autobin", "partition"),
+    log:
+        **wf.get_log_file("cnvkit", "autobin"),
+    params:
+        **{"args": wf.get_params("cnvkit", "autobin")},
+    wrapper:
+        wf.wrapper_path("cnvkit/autobin")
+
+
 rule panel_of_normals_cnvkit_target:
     input:
         unpack(wf.get_input_files("cnvkit", "target")),
     output:
        **wf.get_output_files("cnvkit", "target"),
    threads: wf.get_resource("cnvkit", "target", "threads")
    resources:
        time=wf.get_resource("cnvkit", "target", "time"),
        memory=wf.get_resource("cnvkit", "target", "memory"),
        partition=wf.get_resource("cnvkit", "target", "partition"),
    log:
        **wf.get_log_file("cnvkit", "target"),
    params:
-        args=wf.substep_dispatch("cnvkit", "get_args", "target"),
+        **{"args": wf.get_params("cnvkit", "target")},
    wrapper:
        wf.wrapper_path("cnvkit/target")


rule panel_of_normals_cnvkit_antitarget:
    input:
        unpack(wf.get_input_files("cnvkit", "antitarget")),
    output:
        **wf.get_output_files("cnvkit", "antitarget"),
    threads: wf.get_resource("cnvkit", "antitarget", "threads")
    resources:
        time=wf.get_resource("cnvkit", "antitarget", "time"),
        memory=wf.get_resource("cnvkit", "antitarget", "memory"),
        partition=wf.get_resource("cnvkit", "antitarget", "partition"),
    log:
        **wf.get_log_file("cnvkit", "antitarget"),
    params:
-        args=wf.substep_dispatch("cnvkit", "get_args", "antitarget"),
+        **{"args": wf.get_params("cnvkit", "antitarget")},
    wrapper:
        wf.wrapper_path("cnvkit/antitarget")


rule panel_of_normals_cnvkit_coverage:
    input:
        unpack(wf.get_input_files("cnvkit", "coverage")),
    output:
        **wf.get_output_files("cnvkit", "coverage"),
    threads: wf.get_resource("cnvkit", "coverage", "threads")
    resources:
        time=wf.get_resource("cnvkit", "coverage", "time"),
        memory=wf.get_resource("cnvkit", "coverage", "memory"),
        partition=wf.get_resource("cnvkit", "coverage", "partition"),
    log:
        **wf.get_log_file("cnvkit", "coverage"),
    params:
-        args=wf.substep_dispatch("cnvkit", "get_args", "coverage"),
+        **{"args": wf.get_params("cnvkit", "coverage")},
    wrapper:
        wf.wrapper_path("cnvkit/coverage")


rule panel_of_normals_cnvkit_create_panel:
    input:
        unpack(wf.get_input_files("cnvkit", "create_panel")),
    output:
        **wf.get_output_files("cnvkit", "create_panel"),
    threads: wf.get_resource("cnvkit", "create_panel", "threads")
    resources:
        time=wf.get_resource("cnvkit", "create_panel", "time"),
        memory=wf.get_resource("cnvkit", "create_panel", "memory"),
        partition=wf.get_resource("cnvkit", "create_panel", "partition"),
    log:
        **wf.get_log_file("cnvkit", "create_panel"),
    params:
-        args=wf.substep_dispatch("cnvkit", "get_args", "create_panel"),
+        **{"args": wf.get_params("cnvkit", "create_panel")},
    wrapper:
        wf.wrapper_path("cnvkit/reference")


rule panel_of_normals_cnvkit_report:
    input:
        unpack(wf.get_input_files("cnvkit", "report")),
    output:
        **wf.get_output_files("cnvkit", "report"),
    threads: wf.get_resource("cnvkit", "report", "threads")
    resources:
        time=wf.get_resource("cnvkit", "report", "time"),
        memory=wf.get_resource("cnvkit", "report", "memory"),
        partition=wf.get_resource("cnvkit", "report", "partition"),
    log:
        **wf.get_log_file("cnvkit", "report"),
    params:
-        args=wf.substep_dispatch("cnvkit", "get_args", "report"),
+        **{"args": wf.get_params("cnvkit", "report")},
    wrapper:
        wf.wrapper_path("cnvkit/report")

diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py
index d99755214..77d76abc9 100644
--- a/snappy_pipeline/workflows/panel_of_normals/__init__.py
+++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py
@@ -68,12 +68,10 @@
 .. note::

     In a nutshell, for exome data, the accessibility file is only used to create antitarget regions.
-    For genome data, it is used by the ``autobin`` tool to compute the average target size used during target regions creation.
-    If it is present, the target size is computed in amplicon mode, and when it is absent,
-    an accessibility file is created with default settings, which value is used by ``autobin`` is whole genome mode.
+    These regions are essentially the accessible regions minus the target regions (with edge effect correction).

-To generate the access file from a bed file containing regions to exclude from further coverage computations,
-the user must proceed in two steps:
+Access files can be generated from the genome reference ``fasta`` file, and optionally ``bed`` file(s) containing regions to exclude from further computations.
+In this case, the user must proceed in two steps:

 First, she needs to run the ``access`` tool to create the desired access file

 This will create ``output/cnvkit.access/out/cnvkit.access.bed`` from the genomic sequence & excluded regions.

+When there are no exclusion regions, the access file is automatically created from the reference genome alone, with masked regions removed.
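+
+In that case, the generated access file is equivalent to running the ``access`` command on the
+reference sequence without an exclusion list (a minimal sketch; the file names are illustrative,
+the actual paths are chosen by the pipeline):
+
+.. code-block:: bash
+
+    # No -x/--exclude option: only runs of N bases are removed from the accessible regions
+    cnvkit.py access reference.fa -o access.bed
+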
+ ------------------------ Panel of normal creation ------------------------ @@ -119,6 +119,28 @@ The cnvkit authors recommend to check these reports to ensure that all data is suitable for panel of normal creation. +---------------------- +Notes on the algorithm +---------------------- + +The choice of steps depends on the library type: whole exome sequencing is different from whole genome sequencing (panel not implemented yet). + +The reference is assembled on coverage computed for all normal samples. +The coverage is always computed on target regions, and separately on antitarget regions only for exome data, not for whole genome or panel data. + +For exome and panel data, target regions are obtained from the baits bed file, adding gene information & edge effects correction in the case of exome data. +For WGS data, the target regions are the full accessible regions in the genome. The user can define those accessible region (using ``access``). +But when she has left this option empty, the accessible regions are automatically defined based on the reference genome. + +To create the target regions from the baits (or from the accessible regions), the target average bin size must be set. +There is a reasonable default value for exome data, but an additional ``autobin`` step is required for the whole genome data. +In ``batch`` mode, this value is computed from the coverage over the full genome +.. note:: + + The ``cnvkit batch`` command also allows the creation of a flat reference, when there are no normal samples. + This is not implemented in the ``panel_of_normals`` step, for obvious reasons. + Using a flat reference for CNV computations is nevertheless possible, it is implemented in the ``somatic_cnv_calling`` step. + ================ Notes ``purecn`` ================ @@ -147,6 +169,8 @@ """ +from enum import StrEnum + from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions from snappy_pipeline.utils import dictify, listify @@ -169,6 +193,13 @@ DEFAULT_CONFIG = PanelOfNormalsConfigModel.default_config_yaml_string() +#: Known library types +class LibraryType(StrEnum): + WES = "WES" + WGS = "WGS" + Panel = "Panel" + + class PanelOfNormalsStepPart(BaseStepPart): """Base class for panel of normals step parts @@ -182,29 +213,65 @@ class PanelOfNormalsStepPart(BaseStepPart): def __init__(self, parent): super().__init__(parent) # Build shortcut from cancer bio sample name to matched cancer sample - self.normal_libraries = list(self._get_normal_libraries()) + known_libraries = self._get_normal_libraries() + self.normal_libraries = list(known_libraries.keys()) if self.name and (cfg := self.config.get(self.name)): if path := cfg.get("path_normals_list"): self.normal_libraries = [] with open(path, "rt") as f: for line in f: + if line.startswith("#"): + continue self.normal_libraries.append(line.strip()) + self.libraryType, self.libraryKit = self._validate_normal_libraries(known_libraries) def _get_normal_libraries(self): + normal_libraries = {} for sheet in self.parent.shortcut_sheets: for donor in sheet.donors: - for _, bio_sample in donor.bio_samples.items(): + for bio_sample in donor.bio_samples.values(): if bio_sample.is_tumor: continue - for _, test_sample in bio_sample.test_samples.items(): + for test_sample in bio_sample.test_samples.values(): extraction_type = test_sample.extra_infos.get("extractionType", "DNA") if extraction_type.lower() == "dna": - for _, ngs_library in test_sample.ngs_libraries.items(): - yield ngs_library.name + for library in 
test_sample.ngs_libraries.values(): + normal_libraries[library.name] = self._get_extra_info(library) + return normal_libraries + + def _validate_normal_libraries(self, known_libraries): + libraryType = None + libraryKit = None + for library in self.normal_libraries: + assert ( + library in known_libraries + ), f"Unknown normal library {library} requested to build pon" + assert ( + libraryType is None or libraryType == known_libraries[library]["libraryType"] + ), "Panel of normal cannot be built from multiple library types" + libraryType = known_libraries[library]["libraryType"] + if libraryType == LibraryType.WES: + assert ( + libraryKit is None or libraryKit == known_libraries[library]["libraryKit"] + ), "Panel of normal cannot be built from multiple library kits" + libraryKit = known_libraries[library]["libraryKit"] + return (libraryType, libraryKit) + + @staticmethod + def _get_extra_info(library): + extra_info = {} + assert "libraryType" in library.extra_infos, f"Undefined type of library {library.name}" + extra_info["libraryType"] = library.extra_infos.get("libraryType", "Illumina") + if extra_info["libraryType"] == LibraryType.WES: + assert ( + "libraryKit" in library.extra_infos + ), f"Undefined exome kit for library {library.name}" + extra_info["libraryKit"] = library.extra_infos.get("libraryKit", "__default__") + return extra_info @staticmethod @dictify - def _get_log_file(tpl): + def _get_log_file(tpl, has_sh=False): """Return all log files files""" ext_dict = { "conda_list": "conda_list.txt", @@ -214,6 +281,9 @@ def _get_log_file(tpl): "log": "log", "log_md5": "log.md5", } + if has_sh: + ext_dict["sh"] = "sh" + ext_dict["sh_md5"] = ext_dict["sh"] + ".md5" for key, ext in ext_dict.items(): yield key, tpl + "." + ext @@ -288,6 +358,8 @@ def _get_input_files_create(self, wildcards): def get_output_files(self, action): self._validate_action(action) + if self.name not in self.config.tools: + return {} if action == "install": return {"container": "work/containers/out/purecn.simg"} @@ -320,6 +392,9 @@ def get_output_files(self, action): } def get_log_file(self, action): + if self.name not in self.config.tools: + return {} + tpls = { "install": "work/containers/log/purecn", "prepare": "work/purecn/log/{}_{}".format( @@ -422,13 +497,15 @@ def get_log_file(cls, action): class CnvkitStepPart(PanelOfNormalsStepPart): - """Somatic variant calling with MuTect 2""" + """Build reference covergage for cnvkit""" #: Step name name = "cnvkit" #: Class available actions actions = ( + "access", + "autobin", "target", "antitarget", "coverage", @@ -467,7 +544,6 @@ class CnvkitStepPart(PanelOfNormalsStepPart): def __init__(self, parent): super().__init__(parent) - self.is_wgs = self.config.cnvkit.path_target_regions == "" def check_config(self): if self.name not in self.config.tools: @@ -477,48 +553,173 @@ def check_config(self): "Path to reference FASTA not configured but required for %s" % (self.name,), ) - def get_args(self, action): - self._validate_action(action) - if self.is_wgs: - method = "wgs" - else: - method = "hybrid" - return {"method": method, "flat": (len(self.normal_libraries) == 0)} - def get_input_files(self, action): """Return input files for cnvkit panel of normals creation""" # Validate action self._validate_action(action) mapping = { + "access": self._get_input_files_access, + "autobin": self._get_input_files_autobin, "target": self._get_input_files_target, "antitarget": self._get_input_files_antitarget, "coverage": self._get_input_files_coverage, "create_panel": 
self._get_input_files_create_panel, "report": self._get_input_files_report, - "access": None, } return mapping[action] - def _get_input_files_target(self, wildcards): - """Helper wrapper function to estimate target average size in wgs mode""" - if not self.is_wgs: - return {} + def get_params(self, action): + """Return panel of normal files""" + if action == "access": + return self._get_params_access + elif action == "autobin": + return self._get_params_autobin + elif action == "target": + return self._get_params_target + elif action == "antitarget": + return self._get_params_antitarget + elif action == "coverage": + return self._get_params_coverage + elif action == "create_panel": + return self._get_params_create_panel + elif action == "report": + return self._get_params_report + else: + self._validate_action(action) + + def get_output_files(self, action): + """Return panel of normal files""" + if action == "access": + return self._get_output_files_access() + elif action == "autobin": + return self._get_output_files_autobin() + elif action == "target": + return self._get_output_files_target() + elif action == "antitarget": + return self._get_output_files_antitarget() + elif action == "coverage": + return self._get_output_files_coverage() + elif action == "create_panel": + return self._get_output_files_create_panel() + elif action == "report": + return self._get_output_files_report() + else: + self._validate_action(action) + + @classmethod + def get_log_file(cls, action): + """Return panel of normal files""" + tpls = { + "access": "work/{mapper}.cnvkit/log/cnvkit.access", + "autobin": "work/{mapper}.cnvkit/log/cnvkit.autobin", + "target": "work/{mapper}.cnvkit/log/cnvkit.target", + "antitarget": "work/{mapper}.cnvkit/log/cnvkit.antitarget", + "coverage": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage", + "create_panel": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals", + "report": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report", + } + assert action in cls.actions + return cls._get_log_file(tpls[action], has_sh=True) + + def _get_input_files_access(self, wildcards): + return {} + + def _get_params_access(self, wildcards): + return {"reference": self.w_config.static_data_config.reference.path} + + def _get_output_files_access(self): + return {"access": "work/{mapper}.cnvkit/out/cnvkit.access.bed"} + + def _get_input_files_autobin(self, wildcards): + assert ( + self.libraryType == LibraryType.WGS + ), "Trying to estimate average target size for non-WGS samples" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" bams = [ ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) for x in self.normal_libraries ] - bais = [x + ".bai" for x in bams] - input_files = {"bams": bams, "bais": bais} + input_files = {"bams": bams} + if self.config.cnvkit.get("access", "") == "": + input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) + return input_files + + def _get_params_autobin(self, wildcards): + assert ( + self.libraryType == LibraryType.WGS + ), "Trying to estimate average target size for non-WGS samples" + params = {} + if self.name in self.config.tools and self.config.cnvkit: + if self.config.cnvkit.get("access", "") == "": + params["method"] = "wgs" + else: + params["method"] = "amplicon" + params["target"] = self.config.cnvkit.get("access") + return params + + def _get_output_files_autobin(self): + return { + "result": 
"work/{mapper}.cnvkit/out/cnvkit.autobin.txt", + "target": "$TMPDIR/{mapper}.targets.bed", + "antitarget": "$TMPDIR/{mapper}.antitarget.bed", + } + + def _get_input_files_target(self, wildcards): + """Helper wrapper function to estimate target average size in wgs mode""" + input_files = {} + if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": + input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) + if self.config.cnvkit.get("target_avg_size", None) is None: + input_files["avg_size"] = "work/{mapper}.cnvkit/out/cnvkit.autobin.txt".format( + **wildcards + ) return input_files + def _get_params_target(self, wildcards): + params = {} + if self.name in self.config.tools: + if self.libraryType == LibraryType.WES: + params["target"] = self.config.cnvkit.path_target_regions + if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": + params["target"] = self.config.cnvkit.get("access") + if "features" in self.w_config.static_data_config: + params["annotate"] = self.w_config.static_data_config.features.path + if self.config.cnvkit.get("split", False): + params["split"] = True + if self.config.cnvkit.get("target_avg_size", None): + params["avg_size"] = self.config.cnvkit.get("target_avg_size") + return params + + def _get_output_files_target(self): + return { + "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed", + "target_md5": "work/{mapper}.cnvkit/out/cnvkit.target.bed.md5", + } + def _get_input_files_antitarget(self, wildcards): """Helper wrapper function for computing antitarget locations""" - if self.is_wgs: + if self.libraryType == LibraryType.WGS: return {} return { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards), + "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed".format(**wildcards), + } + + def _get_params_antitarget(self, widlcards): + params = {} + if self.name in self.config.tools: + params = { + "avg_size": self.config.cnvkit.antitarget_avg_size, + "min_size": self.config.cnvkit.min_size, + } + if self.config.cnvkit.get("access", "") == "": + params["access"] = self.config.cnvkit.get("access") + return params + + def _get_output_files_antitarget(self): + return { + "antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed", + "antitarget_md5": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed.md5", } def _get_input_files_coverage(self, wildcards): @@ -527,42 +728,73 @@ def _get_input_files_coverage(self, wildcards): tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" bam = ngs_mapping(tpl.format(**wildcards)) return { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards), - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed".format( - **wildcards - ), + "intervals": "work/{mapper}.cnvkit/out/cnvkit.{interval}.bed".format(**wildcards), "bam": bam, "bai": bam + ".bai", } + def _get_params_coverage(self, wildcards): + params = {} + if self.name in self.config.tools: + params = { + "reference": self.w_config.static_data_config.reference.path, + "min_mapq": self.config.cnvkit.min_mapq, + } + if self.config.cnvkit.get("count", False): + params["count"] = True + return params + + def _get_output_files_coverage(self): + return { + "coverage": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn", + "coverage_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn.md5", + } + def _get_input_files_create_panel(self, 
wildcards):
-        """Helper wrapper function for computing panel of normals"""
         tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn"
         targets = [
             tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries
         ]
-        tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn"
-        antitargets = [
-            tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries
-        ]
-        tpl = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage.{ext}"
-        logs = [
-            tpl.format(mapper=wildcards["mapper"], normal_library=x, ext=ext)
-            for x in self.normal_libraries
-            for ext in ("log", "conda_list.txt", "conda_info.txt")
-        ]
+        if self.libraryType == LibraryType.WES:
+            tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn"
+            antitargets = [
+                tpl.format(mapper=wildcards["mapper"], normal_library=x)
+                for x in self.normal_libraries
+            ]
+        else:
+            antitargets = []
+        return {"references": targets + antitargets}
+
+    def _get_params_create_panel(self, wildcards):
+        params = {}
+        if self.name in self.config.tools:
+            params = {
+                "reference": self.w_config.static_data_config.reference.path,
+            }
+            if self.config.cnvkit.get("cluster", False):
+                params["cluster"] = True
+                params["min_cluster_size"] = self.config.cnvkit.min_cluster_size
+            if self.config.cnvkit.get("sample_sex", None):
+                params["sample_sex"] = self.config.cnvkit.sample_sex
+            if self.config.cnvkit.get("male_reference", False):
+                params["male_reference"] = True
+            if self.config.cnvkit.get("diploid_parx_genome", None):
+                params["diploid_parx_genome"] = self.config.cnvkit.get("diploid_parx_genome")
+            if not self.config.cnvkit.get("gc_correction", True):
+                params["no_gc"] = True
+            if not self.config.cnvkit.get("rmask_correction", True):
+                params["no_rmask"] = True
+            if self.config.cnvkit.get("edge_correction", None) is None:
+                if self.libraryType != LibraryType.WES:
+                    params["no_edge"] = True
+            elif not self.config.cnvkit.get("edge_correction"):
+                params["no_edge"] = True
+        return params
+
+    def _get_output_files_create_panel(self):
         return {
-            "target": (
-                targets
-                if targets
-                else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards)
-            ),
-            "antitarget": (
-                antitargets
-                if antitargets
-                else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed".format(**wildcards)
-            ),
-            "logs": logs if targets or antitargets else [],
+            "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn",
+            "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5",
         }

     def _get_input_files_report(self, wildcards):
@@ -580,51 +812,6 @@ def _get_input_files_report(self, wildcards):
             "antitarget": antitargets,
         }

-    def get_output_files(self, action):
-        """Return panel of normal files"""
-        if action == "target":
-            return self._get_output_files_target()
-        elif action == "antitarget":
-            return self._get_output_files_antitarget()
-        elif action == "coverage":
-            return self._get_output_files_coverage()
-        elif action == "create_panel":
-            return self._get_output_files_create_panel()
-        elif action == "report":
-            return self._get_output_files_report()
-        elif action == "access":
-            return self._get_output_files_access()
-        else:
-            self._validate_action(action)
-
-    def _get_output_files_target(self):
-        return {
-            "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed",
-            "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5",
-        }
-
-    def _get_output_files_antitarget(self):
-        return {
"antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed", - "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5", - } - - def _get_output_files_coverage(self): - return { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", - "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", - "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", - } - - def _get_output_files_create_panel(self): - return { - "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", - "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", - "log": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz", - "log_md5": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz.md5", - } - def _get_output_files_report(self): return { "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", @@ -633,26 +820,6 @@ def _get_output_files_report(self): "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", } - def _get_output_files_access(self): - return { - "access": "work/cnvkit.access/out/cnvkit.access.bed", - "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", - } - - @classmethod - def get_log_file(cls, action): - """Return panel of normal files""" - tpls = { - "target": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.target", - "antitarget": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget", - "coverage": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage", - "create_panel": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals", - "report": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report", - "access": "work/cnvkit.access/log/cnvkit.access", - } - assert action in cls.actions - return cls._get_log_file(tpls[action]) - class AccessStepPart(PanelOfNormalsStepPart): """Utility to create access file for cnvkit""" @@ -677,14 +844,25 @@ def get_input_files(self, action): def get_output_files(self, action): # Validate action self._validate_action(action) - tpl = "work/cnvkit.access/out/cnvkit.access.bed" + tpl = "work/access/out/access.bed" return {"access": tpl, "access_md5": tpl + ".md5"} + def get_params(self, action): + # Validate action + self._validate_action(action) + if self.name in self.config.tools: + return { + "reference": self.w_config.static_data_config.reference.path, + "min_gap_size": self.config.access.min_gap_size, + "exclude": self.config.access.exclude, + } + return {} + @classmethod def get_log_file(cls, action): """Return log files""" assert action in cls.actions - return cls._get_log_file("work/cnvkit.access/log/cnvkit.access") + return cls._get_log_file("work/access/log/access", has_sh=True) class PanelOfNormalsWorkflow(BaseStep): @@ -757,39 +935,38 @@ def get_result_files(self): if "cnvkit" in set(self.config.tools) & set(TOOLS): tpls = [ - ("output/{mapper}.cnvkit/out/{mapper}.cnvkit.target.{ext}", ("bed", "bed.md5")), - ("output/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.{ext}", ("bed", "bed.md5")), + ("output/{mapper}.cnvkit/out/cnvkit.target.{ext}", ("bed", "bed.md5")), + ("output/{mapper}.cnvkit/out/cnvkit.antitarget.{ext}", ("bed", "bed.md5")), ( "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}", ("cnn", "cnn.md5"), ), - ( - "output/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.{ext}", - ("tsv", "tsv.md5"), - ), 
- ( - "output/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.{ext}", - ("tsv", "tsv.md5"), - ), + # ( + # "output/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.{ext}", + # ("tsv", "tsv.md5"), + # ), + # ( + # "output/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.{ext}", + # ("tsv", "tsv.md5"), + # ), ] for tpl, ext_list in tpls: result_files.extend(self._expand_result_files(tpl, ext_list)) tpls = [ - "output/{mapper}.cnvkit/log/{mapper}.cnvkit.target.{ext}", - "output/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget.{ext}", + "output/{mapper}.cnvkit/log/cnvkit.target.{ext}", + "output/{mapper}.cnvkit/log/cnvkit.antitarget.{ext}", "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.{ext}", - "output/{mapper}.cnvkit/log/{mapper}.cnvkit.report.{ext}", ] for tpl in tpls: - result_files.extend(self._expand_result_files(tpl, log_ext_list)) - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{ext}" - result_files.extend(self._expand_result_files(tpl, ("", ".md5"))) + result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh", "sh.md5"])) + # tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{ext}" + # result_files.extend(self._expand_result_files(tpl, ("", ".md5"))) if "access" in set(self.config.tools) & set(TOOLS): - tpl = "output/cnvkit.access/out/cnvkit.access.bed" + tpl = "output/access/out/access.bed" result_files.extend([tpl + md5 for md5 in ("", ".md5")]) - tpl = "output/cnvkit.access/log/cnvkit.access.{ext}" - result_files.extend(self._expand_result_files(tpl, log_ext_list)) + tpl = "output/access/log/access.{ext}" + result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh", "sh.md5"])) if "purecn" in set(self.config.tools) & set(TOOLS): tpl = "output/{mapper}.purecn/out/{mapper}.purecn.panel_of_normals.{ext}" diff --git a/snappy_pipeline/workflows/panel_of_normals/model.py b/snappy_pipeline/workflows/panel_of_normals/model.py index ec68233e3..b7995e32c 100644 --- a/snappy_pipeline/workflows/panel_of_normals/model.py +++ b/snappy_pipeline/workflows/panel_of_normals/model.py @@ -60,6 +60,11 @@ class Mutect2(SnappyModel): """running time multiplier for merging""" +class CnvkitSex(enum.StrEnum): + MALE = "male" + FEMALE = "female" + + class CnvKit(SnappyModel): path_normals_list: str = "" """Optional file listing libraries to include in panel""" @@ -70,22 +75,19 @@ class CnvKit(SnappyModel): access: str = "" """Access bed file (output/cnvkit.access/out/cnvkit.access.bed when create_cnvkit_acces was run)""" - annotate: str = "" - """[target] Optional targets annotations""" + target_avg_size: float | None = None + """[target] Average size of split target bins (None: use default value, or use autobin for wgs)""" - target_avg_size: int = 0 - """[target] Average size of split target bins (0: use default value)""" + split: bool = False + """[target] Split large intervals into smaller ones""" - bp_per_bin: int = 50000 + bp_per_bin: float = 50000 """[autobin] Expected base per bin""" - split: bool = True - """[target] Split large intervals into smaller ones""" - - antitarget_avg_size: int = 0 + antitarget_avg_size: float = 0 """[antitarget] Average size of antitarget bins (0: use default value)""" - min_size: int = 0 + min_size: float = 0 """[antitarget] Min size of antitarget bins (0: use default value)""" min_mapq: int = 0 @@ -97,7 +99,7 @@ class CnvKit(SnappyModel): min_cluster_size: int = 0 """[reference] Minimum cluster size to keep in reference profiles. 
0 for no clustering""" - gender: str = "" + sample_sex: CnvkitSex | None = None """[reference] Specify the chromosomal sex of all given samples as male or female. Guess when missing""" male_reference: bool = False @@ -106,8 +108,8 @@ class CnvKit(SnappyModel): gc_correction: bool = True """[reference] Use GC correction""" - edge_correction: bool = True - """[reference] Use edge correction""" + edge_correction: bool | None = None + """[reference] Use edge correction (automatic when None, edge correction for WES only)""" rmask_correction: bool = True """[reference] Use rmask correction""" @@ -207,6 +209,6 @@ class PanelOfNormals(SnappyStepModel, validators.ToolsMixin): cnvkit: CnvKit | None = None - access: Access = Access() + access: Access | None = None purecn: PureCn | None = None diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index fa483d41f..5954500e9 100644 --- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -1,57 +1,29 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py access""" -from snakemake.shell import shell +import os +import sys -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) -config = snakemake.config["step_config"][snakemake.config["pipeline_step"]["name"]]["access"] +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper -exclude = " --exclude " + " -x ".join(config["exclude"]) if config["exclude"] else "" - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. -conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} - -set -x - -# ----------------------------------------------------------------------------- +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" +cmd = r""" cnvkit.py access \ -o {snakemake.output.access} \ - $(if [[ {config[min_gap_size]} -gt 0 ]]; then \ - echo --min-gap-size {config[min_gap_size]} - fi) \ + --min-gap-size {snakemake.params.min_gap_size} \ {exclude} \ - {snakemake.config[static_data_config][reference][path]} - -fn=$(basename "{snakemake.output.access}") -d=$(dirname "{snakemake.output.access}") -pushd $d -md5sum $fn > $fn.md5 -popd -""" + {snakemake.params.reference} +""".format( + snakemake=snakemake, + exclude=" ".join([f"--exclude {x}" for x in snakemake.params.exclude]) if snakemake.params.exclude else "", ) -# Compute MD5 sums of logs. 
-shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index e79639a3b..da3b440f0 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -1,68 +1,20 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py antitarget""" -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step]["cnvkit"] - -target = snakemake.input.get("target", "") - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. -conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} - -set -x - -# ----------------------------------------------------------------------------- - -if [[ -n "{config[path_target_regions]}" ]] -then - cnvkit.py antitarget \ - --output {snakemake.output.antitarget} \ - $(if [[ -n "{config[access]}" ]]; then \ - echo --access {config[access]} - fi) \ - $(if [[ {config[antitarget_avg_size]} -gt 0 ]]; then \ - echo --avg-size {config[antitarget_avg_size]} - fi) \ - $(if [[ {config[min_size]} -gt 0 ]]; then \ - echo --min-size {config[min_size]} - fi) \ - {target} -else - touch {snakemake.output.antitarget} -fi - -fn=$(basename "{snakemake.output.antitarget}") -d=$(dirname "{snakemake.output.antitarget}") -pushd $d -md5sum $fn > $fn.md5 -popd -""" +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" + +cmd = r""" +cnvkit.py antitarget \ + -o {snakemake.output.region} \ + --avg-size {snakemake.params.avg_size} --min-size {snakemake.params.min_size} \ + {access} \ + {snakemake.input.target} +""".format( + snakemake=snakemake, + access=f"--access {snakemake.params.access}" if snakemake.params.access else "", ) -# Compute MD5 sums of logs. -shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py index b07d4536a..678a7e348 100644 --- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py @@ -1,89 +1,20 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py coverage""" -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step]["cnvkit"] - -# During panel_of_normals step, the target regions are created by the target substep. 
-# During somatic CNV calling (both exome & wgs), the target regions are obtained from the configuration
-if "target" in snakemake.input.keys():
-    target = snakemake.input.target
-elif "path_target" in config.keys():
-    target = config["path_target"]
-else:
-    raise Exception("Unsupported naming")
-
-# Same for antitarget regions
-if "antitarget" in snakemake.input.keys():
-    antitarget = snakemake.input.antitarget
-elif "path_antitarget" in config.keys():
-    antitarget = config["path_antitarget"]
-else:
-    raise Exception("Unsupported naming")
-
-shell(
-    r"""
-# Also pipe everything to log file
-if [[ -n "{snakemake.log.log}" ]]; then
-    if [[ "$(set +e; tty; set -e)" != "" ]]; then
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        exec &> >(tee -a "{snakemake.log.log}" >&2)
-    else
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        echo "No tty, logging disabled" >"{snakemake.log.log}"
-    fi
-fi
-
-# Write out information about conda installation.
-conda list >{snakemake.log.conda_list}
-conda info >{snakemake.log.conda_info}
-md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
-md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
-
-set -x
-
-# Function definitions ---------------------------------------------------------
-
-coverage()
-{{
-    cnvkit.py coverage \
-        --fasta {snakemake.config[static_data_config][reference][path]} \
-        --min-mapq {config[min_mapq]} \
-        --processes {snakemake.threads} \
-        {snakemake.input.bam} \
-        --output $2 $1
-}}
-
-md5()
-{{
-    set -x
-
-    fn=$1
-    f=$(basename $fn)
-    d=$(dirname $fn)
-    pushd $d
-    md5sum $f > $f.md5
-    popd
-}}
-
-# -----------------------------------------------------------------------------
-
-coverage {target} {snakemake.output.target}
-md5 {snakemake.output.target}
-
-coverage {antitarget} {snakemake.output.antitarget}
-md5 {snakemake.output.antitarget}
-"""
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
+
+__author__ = "Eric Blanc"
+__email__ = "eric.blanc@bih-charite.de"
+
+cmd = r"""
+cnvkit.py coverage --processes {snakemake.params.processes} \
+    -o {snakemake.output.coverage} \
+    --fasta {snakemake.params.reference} \
+    --min-mapq {snakemake.params.min_mapq} {count} \
+    {snakemake.input.bam} {snakemake.input.intervals}
+""".format(
+    snakemake=snakemake,
+    count="--count" if snakemake.params.count else "",
)

-# Compute MD5 sums of logs.
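Because the new wrappers are plain string templates, the command construction can be checked without running Snakemake at all. A hypothetical smoke test, with a SimpleNamespace standing in for the snakemake object (attribute names follow the wrapper above, file names are made up):

# Stand-in for the snakemake object injected by Snakemake into wrappers
from types import SimpleNamespace

snakemake = SimpleNamespace(
    params=SimpleNamespace(processes=2, reference="ref.fa", min_mapq=0, count=False),
    output=SimpleNamespace(coverage="out/sample.targetcoverage.cnn"),
    input=SimpleNamespace(bam="sample.bam", intervals="target.bed"),
)

cmd = r"""
cnvkit.py coverage --processes {snakemake.params.processes} \
    -o {snakemake.output.coverage} \
    --fasta {snakemake.params.reference} \
    --min-mapq {snakemake.params.min_mapq} {count} \
    {snakemake.input.bam} {snakemake.input.intervals}
""".format(snakemake=snakemake, count="--count" if snakemake.params.count else "")

print(cmd)  # the rendered command, with the optional --count flag omitted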
-shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index 57b8ee02e..4d53e9508 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -1,90 +1,32 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py reference""" -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step]["cnvkit"] - -# NOTE: snakemake.input.target and snakemake.input.antitarget contain -# the output of target & antitarget substeps when there is no bam files -# the bam files lists when the list of normals is not empty - -cluster = ( - " --cluster --min-cluster-size {}".format(config["min_cluster_size"]) - if config["min_cluster_size"] > 0 - else "" +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" + +cmd = r""" +cnvkit.py reference \ + -o {snakemake.output.reference} \ + --fasta {snakemake.params.reference} \ + {cluster} {min_cluster_size} \ + {sample_sex} {male_reference} {diploid_parx_genome} \ + {no_gc} {no_edge} {no_rmask} \ + {target} {antitarget} {normals} +""".format( + snakemake=snakemake, + cluster="--cluster" if snakemake.params.cluster else "", + min_cluster_size=f"--min-cluster-size {snakemake.params.min_cluster_size}" if snakemake.params.cluster and "min_cluster_size" in snakemake.params else "", + no_gc="--no-gc" if snakemake.params.no_gc else "", + no_edge="--no-edge" if snakemake.params.no_edge else "", + no_rmask="--no-rmask" if snakemake.params.no_rmask else "", + sample_sex=f"--sample-sex {snakemake.params.sample_sex}" if "sample_sex" in snakemake.params else "", + male_reference="--male-reference" if snakemake.params.male_reference else "", + diploid_parx_genome=f"--diploid_parx_genome {snakemake.params.diploid_parx_genome}" if "diploid_parx_genome" in snakemake.params else "", + target=f"--target {snakemake.input.target}" if "target" in snakemake.input else "", + antitarget=f"--antitarget {snakemake.input.antitarget}" if "antitarget" in snakemake.input else "", + normals=" ".join(snakemake.input.normals) if "normals" in snakemake.input else "", ) -gender = " --gender {}".format(config["gender"]) if config["gender"] else "" -male = " --male-reference" if config["male_reference"] else "" -no_gc = " --no-gc" if not config["gc_correction"] else "" -no_edge = " --no-edge" if not config["edge_correction"] or not config["path_target_regions"] else "" -no_rmask = " --no-rmask" if not config["rmask_correction"] else "" - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. 
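The --no-edge flag above connects to the edge_correction: bool | None change in the model earlier in this patch ("automatic when None, edge correction for WES only"). One way the workflow could resolve the automatic setting, sketched here as a guess rather than the actual implementation:

# Sketch only: resolve edge_correction=None to a concrete value, following the
# model docstring "automatic when None, edge correction for WES only".
def resolve_edge_correction(edge_correction: bool | None, is_wgs: bool) -> bool:
    if edge_correction is None:
        return not is_wgs  # WES gets edge correction, WGS does not
    return edge_correction


assert resolve_edge_correction(None, is_wgs=True) is False
assert resolve_edge_correction(None, is_wgs=False) is True
assert resolve_edge_correction(False, is_wgs=False) is False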
-conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} - -set -x -# ----------------------------------------------------------------------------- - -if [[ "{snakemake.params.args[flat]}" = "True" ]] -then - cnvkit.py reference \ - --output {snakemake.output.panel} \ - --fasta {snakemake.config[static_data_config][reference][path]} \ - {cluster} {gender} {male} {no_gc} {no_edge} {no_rmask} \ - --targets {snakemake.input.target} --antitargets {snakemake.input.antitarget} -else - cnvkit.py reference \ - --output {snakemake.output.panel} \ - --fasta {snakemake.config[static_data_config][reference][path]} \ - {cluster} {gender} {male} {no_gc} {no_edge} {no_rmask} \ - {snakemake.input.target} {snakemake.input.antitarget} -fi - -if [[ -n "{snakemake.input.logs}" ]] -then - tar -zcvf {snakemake.output.log} {snakemake.input.logs} -else - touch {snakemake.output.log} -fi - -fn=$(basename "{snakemake.output.panel}") -d=$(dirname "{snakemake.output.panel}") -pushd $d -md5sum $fn > $fn.md5 -popd -fn=$(basename "{snakemake.output.log}") -d=$(dirname "{snakemake.output.log}") -pushd $d -md5sum $fn > $fn.md5 -popd -""" -) - -# Compute MD5 sums of logs. -shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index acfc41ff6..457718684 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -1,108 +1,46 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py target""" -from snakemake.shell import shell +import os +import re +import sys -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step]["cnvkit"] +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper -bams = " ".join(snakemake.input.get("bams", [""])) +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi +# WGS: targets are all accessible regions, WES: targets are baits +interval = snakemake.input.access if "access" in snakemake.input else snakemake.params.target -# Write out information about conda installation. 
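The sys.path manipulation at the top of the new target wrapper (and of the access wrapper earlier in this patch) climbs four directory levels from the wrapper file to the repository root, so that snappy_wrappers becomes importable inside the inner conda environment. The arithmetic can be verified in isolation; the /repo prefix below is of course made up:

import os

# Hypothetical location of the wrapper inside a checkout rooted at /repo
wrapper_file = "/repo/snappy_wrappers/wrappers/cnvkit/target/wrapper.py"

# .../cnvkit/target -> cnvkit -> wrappers -> snappy_wrappers -> /repo
base_dir = os.path.normpath(
    os.path.join(os.path.dirname(wrapper_file), "..", "..", "..", "..")
)
assert base_dir == "/repo"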
-conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +if "avg_size" in snakemake.input: + pattern = re.compile("^Target:[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)$") + with open(snakemake.input.avg_size) as f: + for line in f: + m = pattern.match(line) + if m: + avg_size = float(m.groups()[4]) + break -set -x - -# ----------------------------------------------------------------------------- - -access() -{{ - cnvkit.py access \ - -o $tmpdir/access.bed \ - {snakemake.config[static_data_config][reference][path]} -}} - -# ----------------------------------------------------------------------------- - -target="{config[path_target_regions]}" -target_avg_size={config[target_avg_size]} - -if [[ -z "$target" ]] && [[ $target_avg_size -eq 0 ]] -then - tmpdir=$(mktemp -d) - - if [[ -n "{bams}" ]] - then - access - cnvkit.py autobin --method wgs \ - --fasta {snakemake.config[static_data_config][reference][path]} \ - --access $tmpdir/access.bed \ - --bp-per-bin {config[bp_per_bin]} \ - --target-output-bed $tmpdir/target.bed --antitarget-output-bed $tmpdir/antitarget.bed \ - {bams} > $tmpdir/autobin.txt - target_avg_size=$(cat $tmpdir/autobin.txt | grep "Target:" | cut -f 3) - - if [[ -z "{config[access]}" ]] - then - target=$tmpdir/access.bed - else - target="{config[access]}" - fi - else - if [[ -z "{config[access]}" ]] - then - access - target=$tmpdir/access.bed - else - target="{config[access]}" - fi - target_avg_size=5000 - fi -fi +else: + avg_size = snakemake.params.avg_size +cmd = r""" cnvkit.py target \ - --output {snakemake.output.target} \ - $(if [[ -n "{config[annotate]}" ]]; then \ - echo --short-names --annotate {config[annotate]} - fi) \ - $(if [[ "{config[split]}" = "True" ]]; then \ - echo --split - fi) \ - $(if [[ $target_avg_size -gt 0 ]]; then \ - echo --avg-size $target_avg_size - fi) \ - $target - -fn=$(basename "{snakemake.output.target}") -d=$(dirname "{snakemake.output.target}") -pushd $d -md5sum $fn > $fn.md5 -popd -""" + -o {snakemake.output.target} \ + {avg_size} {split} \ + {interval} +""".format( + snakemake=snakemake, + interval=interval, + avg_size=f"--avg-size {avg_size}", + split=f"--split" if snakemake.params.split else "", ) -# Compute MD5 sums of logs. 
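The average bin size is recovered from the text report of cnvkit.py autobin (the old shell code extracted the same value with grep "Target:" | cut -f 3). A quick check of the regular expression used above; the sample line mimics the two-column "Target:" row but is not verbatim cnvkit output:

import re

# Same pattern as in the wrapper, written as raw strings to avoid escape warnings
pattern = re.compile(
    r"^Target:[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)"
    r"[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)$"
)

m = pattern.match("Target:\t500\t1523.5")
assert m is not None
assert float(m.groups()[4]) == 1523.5  # group 5 is the second field, the bin size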
-shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) +CnvkitWrapper(snakemake, cmd).run() diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index 12cf27c90..c596a734b 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -27,6 +27,8 @@ def minimal_config(): path: /path/to/cosmic.vcf.gz dbsnp: path: /path/to/dbsnp.vcf.gz + features: + path: /path/to/annotations.gtf step_config: ngs_mapping: @@ -42,13 +44,15 @@ def minimal_config(): germline_resource: /path/to/germline_resource.vcf path_normals_list: "" cnvkit: - path_target_regions: /path/to/regions.bed # WES mode + path_target_regions: "" path_normals_list: "" purecn: path_normals_list: "" path_bait_regions: /path/to/baits/regions.bed path_genomicsDB: /path/to/mutect2/genomicsDB genome_name: "unknown" + access: + exclude: [/path/to/exclude.bed] data_sets: first_batch: @@ -198,6 +202,34 @@ def test_mutect2_step_part_get_resource_usage(panel_of_normals_workflow): # Tests for CnvkitStepPart ------------------------------------------------------------------------ +def test_cnvkit_step_part_get_input_files_access(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_access()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + actual = panel_of_normals_workflow.get_input_files("cnvkit", "access")(wildcards) + assert actual == {} + + +def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_access()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = { + "bams": [ + "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", + ], + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards) + assert actual == expected + + def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow): """Tests CnvkitStepPart._get_input_files_target()""" wildcards = Wildcards( @@ -205,8 +237,12 @@ def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow): "mapper": "bwa", } ) + expected = { + "access": "work/bwa.cnvkit/out/cnvkit.access.bed", + "avg_size": "work/bwa.cnvkit/out/cnvkit.autobin.txt", + } actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) - assert actual == {} + assert actual == expected def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow): @@ -216,11 +252,8 @@ def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow): "mapper": "bwa", } ) - expected = { - "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed", - } actual = panel_of_normals_workflow.get_input_files("cnvkit", "antitarget")(wildcards) - assert actual == expected + assert actual == {} def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): @@ -229,13 +262,13 @@ def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): fromdict={ "mapper": "bwa", "normal_library": "P001-N1-DNA1-WGS1", + "interval": "target", } ) expected = { "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed", - "antitarget": 
"work/bwa.cnvkit/out/bwa.cnvkit.antitarget.bed", + "intervals": "work/bwa.cnvkit/out/cnvkit.target.bed", } actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) assert actual == expected @@ -249,53 +282,92 @@ def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow } ) expected = { - "target": [ + "references": [ "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", ], - "antitarget": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", - ], - "logs": [ - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.log", - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.conda_list.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.conda_info.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.log", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.conda_list.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.conda_info.txt", - ], } actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_input_files_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_input_files_report()""" +def test_cnvkit_step_part_get_params_access(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_access()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) - expected = { - "target": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", - ], - "antitarget": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", - ], - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "report")(wildcards) + expected = {"reference": "/path/to/ref.fa"} + actual = panel_of_normals_workflow.get_params("cnvkit", "access")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_params_autobin(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_autobin()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = {"method": "wgs"} + actual = panel_of_normals_workflow.get_params("cnvkit", "autobin")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_params_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_target()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = {"annotate": "/path/to/annotations.gtf"} + actual = panel_of_normals_workflow.get_params("cnvkit", "target")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_params_antitarget(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_antitarget()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = {"avg_size": 0, "min_size": 0} + actual = panel_of_normals_workflow.get_params("cnvkit", "antitarget")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_params_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_coverage()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = {"reference": "/path/to/ref.fa", "min_mapq": 0} + actual = panel_of_normals_workflow.get_params("cnvkit", 
"coverage")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_params_create_panel(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_create_panel()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = {"reference": "/path/to/ref.fa", "no_edge": True} + actual = panel_of_normals_workflow.get_params("cnvkit", "create_panel")(wildcards) assert actual == expected def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_target()""" expected = { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed", - "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5", + "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed", + "target_md5": "work/{mapper}.cnvkit/out/cnvkit.target.bed.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "target") assert actual == expected @@ -304,8 +376,8 @@ def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_antitarget()""" expected = { - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed", - "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5", + "antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed", + "antitarget_md5": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "antitarget") assert actual == expected @@ -314,10 +386,8 @@ def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow) def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_coverage()""" expected = { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", - "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", - "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", + "coverage": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn", + "coverage_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "coverage") assert actual == expected @@ -328,45 +398,37 @@ def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflo expected = { "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", - "log": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz", - "log_md5": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "create_panel") assert actual == expected -def test_cnvkit_step_part_get_output_files_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_report()""" - expected = { - "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", - "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5", - "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv", - "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", - } - actual = 
panel_of_normals_workflow.get_output_files("cnvkit", "report") - assert actual == expected - - def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_target()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.target" + base_name_out = "work/{mapper}.cnvkit/log/cnvkit.target" expected = get_expected_log_files_dict(base_out=base_name_out) + expected["sh"] = "work/{mapper}.cnvkit/log/cnvkit.target.sh" + expected["sh_md5"] = expected["sh"] + ".md5" actual = panel_of_normals_workflow.get_log_file("cnvkit", "target") assert actual == expected def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_antitarget()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget" + base_name_out = "work/{mapper}.cnvkit/log/cnvkit.antitarget" expected = get_expected_log_files_dict(base_out=base_name_out) + expected["sh"] = "work/{mapper}.cnvkit/log/cnvkit.antitarget.sh" + expected["sh_md5"] = expected["sh"] + ".md5" actual = panel_of_normals_workflow.get_log_file("cnvkit", "antitarget") assert actual == expected def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_coverage()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage" expected = get_expected_log_files_dict(base_out=base_name_out) + expected["sh"] = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage.sh" + expected["sh_md5"] = expected["sh"] + ".md5" actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage") assert actual == expected @@ -375,18 +437,12 @@ def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_create_panel()""" base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" expected = get_expected_log_files_dict(base_out=base_name_out) + expected["sh"] = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.sh" + expected["sh_md5"] = expected["sh"] + ".md5" actual = panel_of_normals_workflow.get_log_file("cnvkit", "create_panel") assert actual == expected -def test_cnvkit_step_part_get_log_file_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_report()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "report") - assert actual == expected - - def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): """Tests CvnkitStepPart.get_resource_usage()""" # Define expected: default defined workflow.abstract @@ -414,12 +470,6 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): "memory": "16G", "partition": "medium", } - report_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } # Evaluate action `target` for resource, expected in target_expected_dict.items(): @@ -445,12 +495,6 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource)() assert actual == expected, msg_error - # Evaluate action `report` - for resource, expected in report_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 
'report'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource)() - assert actual == expected, msg_error - # Tests for AccessStepPart ------------------------------------------------------------------------- @@ -460,11 +504,22 @@ def test_access_step_part_get_input_files_run(panel_of_normals_workflow): assert panel_of_normals_workflow.get_input_files("access", "run") is None +def test_access_step_part_get_params_run(panel_of_normals_workflow): + """Tests AccessStepPart._get_params_run()""" + expected = { + "reference": "/path/to/ref.fa", + "exclude": ["/path/to/exclude.bed"], + "min_gap_size": 0 + } + actual = panel_of_normals_workflow.get_params("access", "run") + assert actual == expected + + def test_access_step_part_get_output_files_run(panel_of_normals_workflow): """Tests AccessStepPart._get_output_files_run()""" expected = { - "access": "work/cnvkit.access/out/cnvkit.access.bed", - "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", + "access": "work/access/out/access.bed", + "access_md5": "work/access/out/access.bed.md5", } actual = panel_of_normals_workflow.get_output_files("access", "run") assert actual == expected @@ -472,7 +527,9 @@ def test_access_step_part_get_output_files_run(panel_of_normals_workflow): def test_access_step_part_get_log_file_run(panel_of_normals_workflow): """Tests AccessStepPart._get_log_file_run()""" - expected = get_expected_log_files_dict(base_out="work/cnvkit.access/log/cnvkit.access") + expected = get_expected_log_files_dict(base_out="work/access/log/access") + expected["sh"] = "work/access/log/access.sh" + expected["sh_md5"] = expected["sh"] + ".md5" actual = panel_of_normals_workflow.get_log_file("access", "run") assert actual == expected @@ -657,39 +714,33 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): expected += [ tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) ] - tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{substep}.{ext}" + tpl = "output/{mapper}.cnvkit/out/cnvkit.{substep}.{ext}" for substep in ("target", "antitarget"): expected += [ tpl.format(substep=substep, mapper=mapper, ext=ext) for ext in ("bed", "bed.md5") for mapper in ("bwa",) ] - tpl = "output/{mapper}.cnvkit/report/{mapper}.cnvkit.{substep}.{ext}" - for substep in ("sex", "metrics"): - expected += [ - tpl.format(substep=substep, mapper=mapper, ext=ext) - for ext in ("tsv", "tsv.md5") - for mapper in ("bwa",) - ] # add log files - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" - for substep in ("target", "antitarget", "panel_of_normals", "report"): + tpl = "output/{mapper}.cnvkit/log/cnvkit.{substep}" + for substep in ("target", "antitarget"): for mapper in ("bwa",): - expected += get_expected_log_files_dict( - base_out=tpl.format(mapper=mapper, substep=substep) - ).values() - # add merged log - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{chksum}" + base_out = tpl.format(mapper=mapper, substep=substep) + expected += get_expected_log_files_dict(base_out=base_out).values() + expected += [base_out + ".sh", base_out + ".sh.md5"] + tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" for mapper in ("bwa",): - for chksum in ("", ".md5"): - expected += [tpl.format(mapper=mapper, chksum=chksum)] + base_out = tpl.format(mapper=mapper, substep=substep) + expected += get_expected_log_files_dict(base_out=base_out).values() + expected += [base_out + ".sh", base_out + ".sh.md5"] # Access - tpl = "output/cnvkit.access/out/cnvkit.access.{ext}" + tpl = 
"output/access/out/access.{ext}" expected += [tpl.format(ext=ext) for ext in ("bed", "bed.md5")] expected += get_expected_log_files_dict( - base_out="output/cnvkit.access/log/cnvkit.access" + base_out="output/access/log/access" ).values() + expected += ["output/access/log/access.sh", "output/access/log/access.sh.md5"] # PureCN tpl = "output/{mapper}.purecn/out/{mapper}.purecn.panel_of_normals.rds{chksum}" From b153e2ac02293318d1ba7d059177614b11a119f6 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 12:22:51 +0100 Subject: [PATCH 07/46] feat: CNV calling step (WIP) --- .../workflows/somatic_cnv_calling/Snakefile | 61 + .../workflows/somatic_cnv_calling/__init__.py | 1005 +++++++++++++++++ .../somatic_cnv_calling/cnvkit.rules | 279 +++++ .../workflows/somatic_cnv_calling/model.py | 508 +++++++++ 4 files changed, 1853 insertions(+) create mode 100644 snappy_pipeline/workflows/somatic_cnv_calling/Snakefile create mode 100644 snappy_pipeline/workflows/somatic_cnv_calling/__init__.py create mode 100644 snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules create mode 100644 snappy_pipeline/workflows/somatic_cnv_calling/model.py diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/Snakefile b/snappy_pipeline/workflows/somatic_cnv_calling/Snakefile new file mode 100644 index 000000000..23e608e34 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_cnv_calling/Snakefile @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +"""CUBI Pipeline somatic_cnv_calling step Snakefile""" + +import os + +from snappy_pipeline import expand_ref +from snappy_pipeline.workflows.somatic_cnv_calling import ( + SomaticCnvCallingWorkflow, +) + +__author__ = "Eric Blanc " + + +# Configuration =============================================================== + + +configfile: "config.yaml" + + +# Expand "$ref" JSON pointers in configuration (also works for YAML) +config, lookup_paths, config_paths = expand_ref("config.yaml", config) + +# WorkflowImpl Object Setup =================================================== + +wf = SomaticCnvCallingWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) + +# Rules ======================================================================= + + +localrules: + # Linking files from work/ to output/ should be done locally + somatic_cnv_calling_link_out_run, + + +rule all: + input: + wf.get_result_files(), + + +# House-Keeping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Generic linking out --------------------------------------------------------- + + +rule somatic_cnv_calling_link_out_run: + input: + wf.get_input_files("link_out", "run"), + output: + wf.get_output_files("link_out", "run"), + run: + shell(wf.get_shell_cmd("link_out", "run", wildcards)) + + +# Somatic Targeted Sequencing CNV Calling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# cnvkit pipeline ------------------------------------------------------------- + + +# cnvkit requires a large number of rules, thus externalized +include: "cnvkit.rules" + diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py new file mode 100644 index 000000000..70bd2c447 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py @@ -0,0 +1,1005 @@ +# -*- coding: utf-8 -*- +"""Implementation of the ``somatic_cnv_calling`` step + +This step allows for the detection of CNV events for cancer samples from targeted sequenced (e.g., +exomes or large panels) or whole genome sequencing. 
+The wrapped tools start from the aligned reads (thus off ``ngs_mapping``) and generate CNV calls for somatic variants.
+
+The wrapped tools implement different strategies. Some work "reference free" and just use the
+somatic BAM files for their input, some work in "matched cancer normal mode" and need the cancer
+and normal BAM files, and others again need the cancer BAM files and additionally a
+set of non-cancer BAM files for their background (the panel of normals).
+
+Some tools may also use germline & somatic variants to estimate allele-specific copy number changes,
+and resolve loss-of-heterozygosity. In this case, the small variants need to be computed separately from the ``somatic_variant_calling`` step.
+
+Finally, some tools can use external estimations of tumor purity and ploidy.
+These estimations can either be provided in the sample sheet, or computed from the sequencing data by dedicated tools.
+
+==========
+Step Input
+==========
+
+Somatic CNV calling for targeted sequencing starts off the aligned reads, i.e.,
+the output of ``ngs_mapping``.
+
+Tools that use a panel of normals can obtain their input in two different ways:
+
+- A static file, from another cohort or from public datasets.
+  In this case, the user is responsible for making sure that the data & methods used to create the panel are compatible with the cohort's.
+- The ``panel_of_normals`` step.
+  The panel will be created if necessary, using the same conditions as for the cohort (genome release, exome kit assignment, ...)
+
+When requested, the optional germline and somatic small variant calls are created using a modified version of the ``somatic_variant_calling`` step.
+The ``somatic_cnv_calling`` step generates the small variants (TODO: how exactly) and stores them (TODO: where exactly).
+
+Likewise, purity estimations can be automatically computed by the ``somatic_cnv_calling`` step,
+to supplement or replace the estimations that may be provided in the samplesheet.
+
+===========
+Step Output
+===========
+
+TODO: The whole output section needs revision. The main question is: what is the best format to encode CNAs?
+
+There is no widely used standard for reporting copy number alterations.
+In the absence of a better solution, all CNV tools implemented in the somatic pipeline output the segmentation table loosely following the `DNAcopy format `_.
+The copy number call may or may not be present, and the chromosome number is replaced by its name.
+The segmentation output is in file ``output/../out/.._dnacopy.seg``.
+
+::
+
+    output/
+    +-- bwa.cnvkit.P001-N1-DNA1-WES1
+    |   |-- out
+    |   |   |-- bwa.cnvkit.P001-N1-DNA1-WES1_dnacopy.seg
+    [...]
+
+Note that tool ``cnvetti`` doesn't follow the snappy convention above:
+the tool name is followed by an underscore & the action, where the action is one of ``coverage``, ``segment`` and ``postprocess``.
+For example, the output directory would contain a directory named ``bwa.cnvetti_coverage.P002-T1-DNA1-WES1``.
+
+.. note:: Tool-Specific Output
+
+    Each tool produces its own set of outputs, generally not in standard format.
+    Some of these files are linked from ``work`` to ``output``, but not necessarily all of them.
+    Some tools (for example ``cnvkit``) also produce a report, with tables and figures.
+
+
+=====================
+Default Configuration
+=====================
+
+The default configuration is as follows.
+
+.. include:: DEFAULT_CONFIG_somatic_targeted_seq_cnv_calling.rst
+
+=============================
+Available Somatic CNV Callers
+=============================
+
+- ``cnvkit`` (for both WGS & WES)
+- ``sequenza`` (only WES)
+- ``purecn`` (only WES)
+- ``Control-FREEC`` (only WGS - this tool might not be supported)
+
+================================
+Logic of the step for ``cnvkit``
+================================
+
+--------
+Overview
+--------
+
+``cnvkit`` was designed to call CNVs on whole exome data. It has the concept of _targets_ (the regions enriched by the exome kit),
+and the _antitargets_ (those regions outside of enrichment).
+The coverage of _targets_ and _antitargets_ is expected to be very different,
+but there is still information to be gained in the _antitarget_ regions,
+albeit at a much lower resolution than for _target_ regions.
+
+``cnvkit`` was later used with some success on whole genome data.
+WGS data was defined as _target_ regions covering the whole genome, with empty _antitarget_ regions.
+
+------------------------
+Sample-independent files
+------------------------
+
+``cnvkit`` allows the user to define _accessible_ regions (_via_ the ``access`` bed file).
+This excludes repeats, low complexity or PAR regions, which cannot be properly mapped and therefore cannot be used for CNV calling.
+
+For exome data, the _target_ regions are supposed to be well curated, so they are not affected by the _access_ regions.
+The _antitarget_ regions, however, are only defined within _accessible_ regions.
+For WGS data, the _antitarget_ regions are empty, and the _target_ regions are set to the _accessible_ regions, when present.
+Even in the absence of user-defined _accessible_ regions, the _target_ and _antitarget_ regions will not contain long ``N`` sequences.
+
+Finally, the pipeline builds separate ``bed`` files for the _target_ and _antitarget_ regions, for each exome kit present in the cohort,
+and for WGS data if there is any.
+
+---------
+Reference
+---------
+
+The ``cnvkit`` authors recommend using a panel of normals to normalize the coverage over bins.
+This is usually created by running the ``panel_of_normals`` step.
+The ``somatic_cnv_calling`` step will create a reference (panel of normals) if requested.
+Otherwise, it is possible to use references created for different cohorts, but the user
+must ensure that the data & methods used for the current cohort and to create the reference are compatible.
+In particular, the exome enrichment kit must be identical, and the sex of the donors should be
+similar (for example, a female-only reference should not be used for a male cohort).
+
+If there are not enough normal samples to create such a reference, the corresponding normal sample
+can be used, in a normal/tumor pair setting similar to the somatic small variant calling situation.
+
+In case no normals are available at all, a flat prior can be used (the order of precedence
+between these options is sketched below).
+
+------------
+Calling CNVs
+------------
+
+The _target_ and _antitarget_ ``bed`` files created in the earlier sub-steps are used as input,
+based on the exome kit (or WGS status).
+
+The coverage is computed for the tumor sample, and normalised using the reference.
+As seen previously, the reference can be either exome kit-based, or sample-specific.
+
+The normalised coverage is then segmented, and copy numbers are called, optionally using
+small variants and/or purity estimates.
+
+If B-allele fractions are used, the pipeline will create the small variants, but only for samples
+with a corresponding normal.
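As announced in the "Reference" section above, the precedence between the three possible references can be summarised in a few lines. This is a schematic of the decision (implemented for real in the step's _get_input_files_fix method further down), not code from the pipeline:

# Schematic only: order of precedence for the copy-number reference.
def select_reference(
    panel_of_normals: str | None, paired_normal: str | None, flat_reference: str
) -> str:
    if panel_of_normals is not None:  # from the panel_of_normals step or a static file
        return panel_of_normals
    if paired_normal is not None:  # reference built from the matched normal
        return f"reference built from {paired_normal}"
    return flat_reference  # flat prior, when no normals are available at all


assert select_reference(None, None, "flat.cnn") == "flat.cnn"
assert select_reference("pon.cnn", "P001-N1", "flat.cnn") == "pon.cnn"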
+If purity is used, the user can choose to override the values in the sample sheet (when present) +with the output of the tool of her choice. +""" + +import os +import os.path +import re +import typing + +from biomedsheets.models import BioEntity, BioSample, TestSample, NGSLibrary +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background +from snakemake.io import OutputFiles, Wildcards + +from snappy_pipeline.utils import dictify +from snappy_pipeline.workflows.abstract import ( + BaseStep, + BaseStepPart, + LinkOutStepPart, + ResourceUsage, +) +from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow + +from .model import SomaticCnvCalling as SomaticCnvCallingConfigModel +from .model import Sex, LibraryKitDefinition, PanelOfNormalsOrigin + +__author__ = "Eric Blanc " + +#: Default configuration for the somatic_targeted_seq_cnv_calling step +DEFAULT_CONFIG = SomaticCnvCallingConfigModel.default_config_yaml_string() + +#: JSON key for "isCancer" +KEY_IS_CANCER = "isCancer" + +#: Value for "libraryType" is whole exome sequencing +VALUE_WES = "WES" + +#: Value for "libraryType" is panel sequencing +VALUE_PANEL = "Panel-seq" + +#: Values for targeted sequencing +VALUES_TARGETED_SEQ = (VALUE_WES, VALUE_PANEL) + +#: Standard key/extension values for BCF files +BCF_KEY_EXTS = ( + ("bcf", ".bcf"), + ("bcf_md5", ".bcf.md5"), + ("bcf_csi", ".bcf.csi"), + ("bcf_csi_md5", ".bcf.csi.md5"), +) + + +class SomaticCnvCallingStepPart(BaseStepPart): + """Shared code for all caller classes in somatic_targeted_seq_cnv_calling""" + + def __init__(self, parent: "SomaticCnvCallingWorkflow"): + super().__init__(parent) + + def _get_sample_sex(self, library_name: str) -> Sex: + if self.config.sex == Sex.MALE or self.config.sex == Sex.FEMALE: + sample_sex = self.config.sex + elif self.config.sex == Sex.SAMPLESHEET and library_name in self.parent.sex: + sample_sex = self.parent.sex[library_name] + else: + sample_sex = Sex.UNKNOWN + return sample_sex + + @staticmethod + @dictify + def _get_log_file_from_prefix(prefix: str) -> typing.Iterator[typing.Dict[str, str]]: + key_ext = ( + ("log", ".log"), + ("sh", ".sh"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + +class CnvKitStepPart(SomaticCnvCallingStepPart): + """Perform somatic targeted CNV calling using cnvkit""" + + #: Step name + name = "cnvkit" + + #: Class available actions + actions = ( + "access", + "target", + "antitarget", + "coverage", + "reference", + "flat_reference_panel", + "flat_reference_wgs", + "fix", + "segment", + "call", + "bintest", + "plot/diagram", + "plot/scatter", + "report/metrics", + "report/segmetrics", + ) + + # Overwrite defaults + default_resource_usage = ResourceUsage(threads=1, time="03:59:59", memory="7680M") # 4h + + def __init__(self, parent: SomaticCnvCallingStepPart): + super().__init__(parent) + + def get_input_files(self, action: str) -> typing.Callable: + """Return input paths input function, dependent on rule""" + # Validate action + self._validate_action(action) + return getattr(self, "_get_input_files_{}".format(action.replace("/", "_"))) + + def get_params(self, action: str) -> typing.Callable: + """Return parameters input function, dependent on rule""" + # Validate action + self._validate_action(action) + return getattr(self, "_get_params_{}".format(action.replace("/", "_"))) + + def get_output_files(self, action: str) -> typing.Callable: + 
"""Return input paths input function, dependent on rule""" + # Validate action + self._validate_action(action) + f = getattr(self, "_get_output_files_{}".format(action.replace("/", "_"))) + return f() + + def get_log_file(self, action: str) -> typing.Dict[str, str]: + """Return log files, dependent on rule""" + # Validate action + self._validate_action(action) + base_name = os.path.join("work", f"{{mapper}}.{self.name}.{{library_name}}", "log") + # Access, target & antitarget steps are cohort-wide, the others are library-dependent + if action in ("access",): + prefix = f"work/{self.name}/log/{action}" + elif action in ("target", "antitarget"): + prefix = f"work/{self.name}/log/{action}" + ".{panel_name}" + elif action in ("coverage",): + prefix = os.path.join(base_name, action + ".{region}") + elif action in ( + "reference", + "fix", + "segment", + "call", + "bintest", + "report/metrics", + "report/segmetrics", + ): + prefix = os.path.join(base_name, action.replace("/", "_")) + elif action in ("plot/diagram", "plot/scatter"): + prefix = os.path.join(base_name, action.replace("/", "_") + ".{contig_name}") + elif action == "flat_reference_panel": + prefix = f"work/{{mapper}}.{self.name}/log/reference.{{panel_name}}" + elif action == "flat_reference_wgs": + prefix = f"work/{{mapper}}.{self.name}/log/reference" + return SomaticCnvCallingStepPart._get_log_file_from_prefix(prefix) + + def get_result_files(self, library_name: str, mapper: str) -> typing.List[str]: + """Files to symlink to output""" + base_name = f"{mapper}.{self.name}.{library_name}" + result_files = [] + # Tumor samples + if library_name in self.parent.normal_library: + # Main results + prefix = os.path.join("output", base_name, "out", base_name) + for suffix in ("cnr", "segments.cns", "cns", "bintest.cnr"): + result_files.append(prefix + "." + suffix) + # Log files + prefix = os.path.join("output", base_name, "log") + for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): + result_files.append(os.path.join(prefix, f"coverage.target.{ext}")) + result_files.append(os.path.join(prefix, f"coverage.antitarget.{ext}")) + for suffix in ("fix", "segment", "call", "bintest"): + for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): + result_files.append(prefix + "/" + suffix + "." + ext) + # Log of reference is no panel of normals + # if not self.config[self.name]["panel_of_normals"]["enabled"]: + # normal_library = self.parent.normal_library[library_name] + # prefix = os.path.join("output", f"{mapper}.{self.name}.{normal_library}", "log", f"{mapper}.{self.name}.{normal_library}.reference") + # for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): + # result_files.append(prefix + "." + ext) + # Reports + if "reports" in self.config[self.name]: + prefix = os.path.join("output", base_name, "report", base_name) + for report in ("metrics", "segmetrics"): + if report in self.config[self.name]["reports"]: + result_files.append(prefix + "." + report + ".tsv") + # Plots (per chromosome) + if "plots" in self.config[self.name]: + prefix = os.path.join("output", base_name, "plot") + for plot in ("diagram", "scatter"): + if plot in self.config[self.name]["plots"]: + for contig in self.parent.contigs: + result_files.append(os.path.join(prefix, plot, contig + ".png")) + # else: # Normal samples + # prefix = os.path.join("output", base_name, "log", "reference") + # for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): + # result_files.append(prefix + "." 
+ ext)
+        return result_files
+
+    # ----- Access --------------------------------------------------------------------------------
+
+    def _get_input_files_access(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return {}  # access has no file inputs, everything comes from the configuration
+
+    def _get_params_access(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        params = {"reference": self.w_config.static_data_config.reference.path}
+        params["min_gap_size"] = self.config[self.name]["access"]["min_gap_size"]
+        access = self.config[self.name]["access"].get("exclude", None)
+        if access:
+            params["access"] = access
+        return params
+
+    def _get_output_files_access(self) -> typing.Dict[str, str]:
+        return {"access": f"work/{self.name}/out/access.bed"}
+
+    # ----- Target --------------------------------------------------------------------------------
+
+    def _get_input_files_target(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        for panel in self.config.path_target_interval_list_mapping:
+            if panel.name == wildcards.panel_name:
+                return {"region": panel.path}
+
+    def _get_params_target(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return {
+            "split": self.config[self.name]["target"]["split"],
+            "avg_size": self.config[self.name]["target"]["avg_size"],
+        }
+
+    def _get_output_files_target(self) -> typing.Dict[str, str]:
+        return {"region": f"work/{self.name}/out/{{panel_name}}_target.bed"}
+
+    # ----- Antitarget ----------------------------------------------------------------------------
+
+    def _get_input_files_antitarget(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        # No antitarget for WGS
+        return {
+            "target": f"work/{self.name}/out/{wildcards.panel_name}_target.bed",
+            "access": f"work/{self.name}/out/access.bed",
+        }
+
+    def _get_params_antitarget(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return {
+            "avg_size": self.config[self.name]["antitarget"]["avg_size"],
+            "min_size": self.config[self.name]["antitarget"]["min_size"],
+        }
+
+    def _get_output_files_antitarget(self) -> typing.Dict[str, str]:
+        return {"region": f"work/{self.name}/out/{{panel_name}}_antitarget.bed"}
+
+    # ----- Coverage ------------------------------------------------------------------------------
+
+    def _get_input_files_coverage(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        # BAM/BAI file
+        ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
+        base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards)
+        input_files = {
+            "bam": ngs_mapping(base_path + ".bam"),
+            "bai": ngs_mapping(base_path + ".bam.bai"),
+        }
+
+        # Region (target or antitarget) file
+        panel = self.parent.libraryKit.get(wildcards.library_name, None)
+        if panel is None:
+            input_files["region"] = f"work/{self.name}/out/access.bed"
+        else:
+            input_files["region"] = f"work/{self.name}/out/{panel.name}_{wildcards.region}.bed"
+        return input_files
+
+    def _get_params_coverage(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return {
+            "fasta": self.w_config.static_data_config.reference.path,
+            "count": self.config[self.name]["coverage"]["count"],
+            "min_mapq": self.config[self.name]["coverage"]["min_mapq"],
+            "processes": self.default_resource_usage.threads,
+        }
+
+    def _get_output_files_coverage(self) -> typing.Dict[str, str]:
+        return {"coverage": f"work/{{mapper}}.{self.name}.{{library_name}}/out/{{region}}.cnn"}
+
+    # ----- Reference -----------------------------------------------------------------------------
+
+    def _get_input_files_reference(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        """Builds reference from the paired normal, or flat prior in absence of normal"""
+        input_files = {}
+        normal_library = self.parent.normal_library.get(wildcards.library_name, None)
+        input_files["normals"] = [
+            f"work/{wildcards.mapper}.{self.name}.{normal_library}/out/target.cnn",
+            f"work/{wildcards.mapper}.{self.name}.{normal_library}/out/antitarget.cnn",
+        ]
+        return input_files
+
+    def _get_params_reference(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        params = {
+            "fasta": self.w_config.static_data_config.reference.path,
+            "cluster": self.config[self.name]["reference"]["cluster"],
+            "min_cluster_size": self.config[self.name]["reference"]["min_cluster_size"],
+            "male_reference": self.config[self.name]["use_male_reference"],
+            "no_gc": self.config[self.name]["reference"]["no_gc"],
+            "no_edge": self.config[self.name]["reference"]["no_edge"],
+            "no_rmask": self.config[self.name]["reference"]["no_rmask"],
+        }
+        sample_sex = self._get_sample_sex(wildcards.library_name)
+        if sample_sex == Sex.MALE or sample_sex == Sex.FEMALE:
+            params["sample_sex"] = str(sample_sex)
+        return params
+
+    def _get_output_files_reference(self) -> typing.Dict[str, str]:
+        """TODO: flat prior reference should be library-independent"""
+        return {"reference": f"work/{{mapper}}.{self.name}.{{library_name}}/out/reference.cnn"}
+
+    def _get_input_files_flat_reference_panel(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        """Input regions (target & antitarget) for a flat reference, from the library's exome kit"""
+        input_files = {}
+        panel = self.parent.libraryKit.get(wildcards.library_name, None)
+        if panel is None:  # WGS, target is access, no antitarget
+            input_files["target"] = f"work/{self.name}/out/access.bed"
+        else:  # WES, both target & antitarget
+            input_files["target"] = f"work/{self.name}/out/{panel.name}_target.bed"
+            input_files["antitarget"] = f"work/{self.name}/out/{panel.name}_antitarget.bed"
+        return input_files
+
+    def _get_params_flat_reference_panel(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return self._get_params_reference(wildcards)
+
+    def _get_output_files_flat_reference_panel(self) -> typing.Dict[str, str]:
+        """TODO: flat prior reference should be library-independent"""
+        return {"reference": f"work/{{mapper}}.{self.name}/out/reference.{{panel_name}}.cnn"}
+
+    def _get_input_files_flat_reference_wgs(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return self._get_input_files_flat_reference_panel(wildcards)
+
+    def _get_params_flat_reference_wgs(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return self._get_params_reference(wildcards)
+
+    def _get_output_files_flat_reference_wgs(self) -> typing.Dict[str, str]:
+        """TODO: flat prior reference should be library-independent"""
+        return {"reference": f"work/{{mapper}}.{self.name}/out/reference.cnn"}
+
+    # ----- Fix -----------------------------------------------------------------------------------
+
+    def _get_input_files_fix(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        # Coverage on targets
+        input_files = {
+            "target": f"work/{wildcards.mapper}.{self.name}.{wildcards.library_name}/out/target.cnn"
+        }
+        # Coverage on antitargets when present (absent for WGS)
+        panel = self.parent.libraryKit.get(wildcards.library_name, None)
+        if panel is not None:  # WES: antitarget coverage is present (absent for WGS)
+            input_files["antitarget"] = (
+            
f"work/{wildcards.mapper}.{self.name}.{wildcards.library_name}/out/antitarget.cnn" + ) + # Get reference from panel of normals if available, otherwise from normal or flat when no normal + if not self.config[self.name]["panel_of_normals"]["enabled"]: # Paired normal or flat + normal_library = self.parent.normal_library.get(wildcards.library_name, None) + if normal_library: + input_files["reference"] = ( + f"work/{{mapper}}.{self.name}.{normal_library}/out/reference.cnn" + ) + else: + if panel: + input_files["reference"] = ( + f"work/{{mapper}}.{self.name}/out/reference.{panel.name}.cnn" + ) + else: + input_files["reference"] = f"work/{{mapper}}.{self.name}/out/reference.cnn" + elif ( + self.config[self.name]["panel_of_normals"]["origin"] + == PanelOfNormalsOrigin.PREVIOUS_STEP + ): # Panel_of_normals step + input_files["reference"] = self.parent._get_panel_of_normals_path(self.name, panel) + else: + input_files["reference"] = self.config[self.name]["panel_of_normals"][ + "path_panel_of_normals" + ] + return input_files + + def _get_params_fix(self, wildcards: Wildcards) -> typing.Dict[str, str]: + return { + "sample_id": wildcards.library_name, + "cluster": self.config[self.name]["fix"]["cluster"], + "no_gc": self.config[self.name]["fix"]["no_gc"], + "no_edge": self.config[self.name]["fix"]["no_edge"], + "no_rmask": self.config[self.name]["fix"]["no_rmask"], + } + + def _get_output_files_fix(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"coverage": os.path.join("work", base_name, "out", base_name + ".cnr")} + + # ----- Segment ------------------------------------------------------------------------------- + + def _get_input_files_segment(self, wildcards: Wildcards) -> typing.Dict[str, str]: + # Coverage + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + input_files = {"coverage": f"work/{base_name}/out/{base_name}.cnr"} + # Segmentation using SNVs if requested and available (normal must be present) + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + return input_files + + def _get_params_segment(self, wildcards: Wildcards) -> typing.Dict[str, str]: + # Segmentation parameters + params = { + "method": self.config[self.name]["segment"]["method"], + "threshold": self.config[self.name]["segment"]["threshold"], + "drop_low_coverage": self.config[self.name]["segment"]["drop_low_coverage"], + "drop_outliers": self.config[self.name]["segment"]["drop_outliers"], + } + if self.config[self.name]["segment"]["method"] == "cbs": + params["smooth_cbs"] = self.config[self.name]["segment"]["smooth_cbs"] + params["processes"] = self.default_resource_usage.threads + # Normal & tumor sample ids if SNVs + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + params["sample_id"] = wildcards.library_name + params["normal_id"] = self.normal_library[wildcards.library_name] + params["min_variant_depth"] = self.config[self.name]["segment"]["min_variant_depth"] + params["zygocity_freq"] = self.config[self.name]["segment"]["zygocity_freq"] + return params + + def _get_output_files_segment(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return { + "segments": os.path.join("work", base_name, "out", base_name + ".segments.cns"), + "dataframe": os.path.join("work", base_name, "out", 
"dataframe.rds"), + } + + # ----- Call ---------------------------------------------------------------------------------- + + def _get_input_files_call(self, wildcards: Wildcards) -> typing.Dict[str, str]: + # Segmentation + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + input_files = {"segments": f"work/{base_name}/out/{base_name}.segments.cns"} + # SNVs if requested and available (normal must be present) + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + # Purity from the tool if requested and not from the samplesheet + if ( + self.config[self.name]["purity"]["enabled"] and self.config[self.name]["purity"]["tool"] + ): # Need purity, and can use tool to obain it + if ( + self.config[self.name]["purity"]["ignore_samplesheet"] + or wildcards.library_name not in self.parent.purity + ): + # Don't use samplesheet + input_files["purity"] = ( + f"work/{base_name}/out/{wildcards.mapper}.{self.config.purity.tool}.txt" + ) + return input_files + + def _get_params_call(self, wildcards: Wildcards) -> typing.Dict[str, str]: + # Call parameters + params = { + "method": self.config[self.name]["call"]["method"], + "thresholds": self.config[self.name]["call"]["thresholds"], + "filter": self.config[self.name]["call"]["filter"], + "drop_low_coverage": self.config[self.name]["call"]["drop_low_coverage"], + "male_reference": self.config[self.name]["use_male_reference"], + } + # If center_at defined, use it, otherwise use the center method + center = self.config[self.name]["call"].get("center_at", None) + if center is not None: + params["center_at"] = center + else: + params["center"] = self.config[self.name]["call"].get("center", "None") + # Normal & tumor sample ids if SNVs + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + params["sample_id"] = wildcards.library_name + params["normal_id"] = self.normal_library[wildcards.library_name] + # Sample sex if known, otherwise guessed by the tool + sample_sex = self._get_sample_sex(wildcards.library_name) + if sample_sex == Sex.MALE or sample_sex == Sex.FEMALE: + params["sample_sex"] = sample_sex + # If requested, purity from samplesheet or from default if no tool + if self.config[self.name]["purity"]["enabled"]: + purity = self.parent.purity.get( + wildcards.library_name, self.config.purity.default_purity + ) + if purity is not None and not self.config[self.name]["purity"]["ignore_samplesheet"]: + params["purity"] = purity + if self.config.default_ploidy: + params["ploidy"] = self.config.default_ploidy + return params + + def _get_output_files_call(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"calls": os.path.join("work", base_name, "out", base_name + ".cns")} + + # ----- Bintest ------------------------------------------------------------------------------- + + def _get_input_files_bintest(self, wildcards: Wildcards) -> typing.Dict[str, str]: + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + return { + "coverage": f"work/{base_name}/out/{base_name}.cnr", + "segments": f"work/{base_name}/out/{base_name}.segments.cns", + } + + def _get_params_bintest(self, wildcards: Wildcards) -> typing.Dict[str, str]: + return { + "alpha": self.config[self.name]["bintest"]["alpha"], + "target": self.config[self.name]["bintest"]["target"], + } + + def 
_get_output_files_bintest(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"coverage": os.path.join("work", base_name, "out", base_name + ".bintest.cnr")} + + # ----- Plots -------------------------------------------------------------------------------- + + def _get_input_files_plot_diagram(self, wildcards: Wildcards) -> typing.Dict[str, str]: + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + input_files = { + "coverage": f"work/{base_name}/out/{base_name}.cnr", + "segments": f"work/{base_name}/out/{base_name}.segments.cns", + } + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + return input_files + + def _get_params_plot_diagram(self, wildcards: Wildcards) -> typing.Dict[str, str]: + return { + "threshold": self.config[self.name]["plots"]["diagram"]["threshold"], + "min_probes": self.config[self.name]["plots"]["diagram"]["min_probes"], + "no_shift_xy": self.config[self.name]["plots"]["diagram"]["no_shift_xy"], + } + + def _get_output_files_plot_diagram(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"figure": os.path.join("work", base_name, "plot", "diagram", "{contig_name}.pdf")} + + def _get_input_files_plot_scatter(self, wildcards: Wildcards) -> typing.Dict[str, str]: + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + input_files = { + "coverage": f"work/{base_name}/out/{base_name}.cnr", + "segments": f"work/{base_name}/out/{base_name}.segments.cns", + } + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + return input_files + + def _get_params_plot_scatter(self, wildcards: Wildcards) -> typing.Dict[str, str]: + params = { + "chromosome": wildcards.contig_name, + "antitarget_marker": self.config[self.name]["plots"]["scatter"]["antitarget_marker"], + "by_bin": self.config[self.name]["plots"]["scatter"]["by_bin"], + "segment_color": self.config[self.name]["plots"]["scatter"]["segment_color"], + "trend": self.config[self.name]["plots"]["scatter"]["trend"], + "y_max": self.config[self.name]["plots"]["scatter"]["y_max"], + "y_min": self.config[self.name]["plots"]["scatter"]["y_min"], + "fig_size": self.config[self.name]["plots"]["scatter"]["fig_size"], + "sample_id": wildcards.library_name, + } + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + params["normal_id"] = self.normal_library[wildcards.library_name] + params["min_variant_depth"] = self.config[self.name]["plots"]["scatter"][ + "min_variant_depth" + ] + params["zygocity_freq"] = self.config[self.name]["plots"]["scatter"]["zygocity_freq"] + return params + + def _get_output_files_plot_scatter(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"figure": os.path.join("work", base_name, "plot", "scatter", "{contig_name}.pdf")} + + # ----- Metrics (metrics & segmetrics) -------------------------------------------------------- + + def _get_input_files_report_metrics(self, wildcards: Wildcards) -> typing.Dict[str, str]: + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + return { + "coverage": f"work/{base_name}/out/{base_name}.cnr", + "segments": 
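+            # Both report types (metrics & segmetrics) reuse the bin-level ratios (.cnr)
+            # from the fix step and the segments (.cns) from the segment step.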
f"work/{base_name}/out/{base_name}.segments.cns", + } + + def _get_params_report_metrics(self, wildcards: Wildcards) -> typing.Dict[str, str]: + return {"drop_low_coverage": self.config[self.name]["reports"]["drop_low_coverage"]} + + def _get_output_files_report_metrics(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"report": os.path.join("work", base_name, "report", base_name + ".metrics.tsv")} + + def _get_input_files_report_segmetrics(self, wildcards: Wildcards) -> typing.Dict[str, str]: + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + return { + "coverage": f"work/{base_name}/out/{base_name}.cnr", + "segments": f"work/{base_name}/out/{base_name}.segments.cns", + } + + def _get_params_report_segmetrics(self, wildcards: Wildcards) -> typing.Dict[str, str]: + return { + "drop_low_coverage": self.config[self.name]["reports"]["drop_low_coverage"], + "stats": ( + "mean", + "median", + "mode", + "t-test", + "stdev", + "sem", + "mad", + "mse", + "iqr", + "bivar", + "ci", + "pi", + ), + "alpha": self.config[self.name]["reports"]["alpha"], + "bootstrap": self.config[self.name]["reports"]["bootstrap"], + } + + def _get_output_files_report_segmetrics(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"report": os.path.join("work", base_name, "report", base_name + ".segmetrics.tsv")} + + +class SomaticCnvCallingWorkflow(BaseStep): + """Perform somatic targeted sequencing CNV calling""" + + #: Workflow name + name = "somatic_cnv_calling" + + #: Default biomed sheet class + sheet_shortcut_class = CancerCaseSheet + + sheet_shortcut_kwargs = { + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=False) + } + + @classmethod + def default_config_yaml(cls): + """Return default config YAML, to be overwritten by project-specific one""" + return DEFAULT_CONFIG + + def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + config_model_class=SomaticCnvCallingConfigModel, + previous_steps=(NgsMappingWorkflow,), + ) + # Register sub step classes so the sub steps are available + self.register_sub_step_classes( + ( + CnvKitStepPart, + # ControlfreecStepPart, + # SequenzaStepPart, + # PureCNStepPart, + LinkOutStepPart, + ) + ) + # Initialize sub-workflows + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) + self.registered_pons = self._optionally_register_pon() + + # Collect extra information per library + self.normal_library = self._get_normal_library() + self.libraryKit = self._get_panel_information() + self.sex = self._get_sex() + self.purity = self._get_purity() + + def get_result_files(self) -> OutputFiles: + fns = [] + for seq_type, tools in self.config.tools: + for library in self._get_libraries(): + if library.extra_infos.get("libraryType", "").lower() != seq_type: + continue + test_sample = library.test_sample + if test_sample.extra_infos.get("extractionType", "") != "DNA": + continue + bio_sample = test_sample.bio_sample + is_tumor = bio_sample.extra_infos.get("isTumor", True) + if is_tumor: + for tool in tools: + f = self.substep_getattr(tool, "get_result_files") + for mapper in self.w_config.step_config["ngs_mapping"]["tools"]["dna"]: + for fn in f(library.name, mapper): + fns.append(fn) + return OutputFiles(fns) + + def _get_libraries(self) -> typing.Iterator[NGSLibrary]: + for sheet in self.shortcut_sheets: + 
for donor in sheet.sheet.bio_entities.values(): + for bio_sample in donor.bio_samples.values(): + for test_sample in bio_sample.test_samples.values(): + for library in test_sample.ngs_libraries.values(): + yield library + + def _get_normal_library(self) -> typing.Dict[str, str]: + normal_for_donor = {} + for library in self._get_libraries(): + test_sample = library.test_sample + if test_sample.extra_infos.get("extractionType", "") != "DNA": + continue + bio_sample = test_sample.bio_sample + is_tumor = bio_sample.extra_infos.get("isTumor", None) + if is_tumor is None: + raise ValueError(f"Missing 'isTumor' value for library '{library.name}'") + if is_tumor: + continue + donor = bio_sample.bio_entity + if donor.name in normal_for_donor: + raise ValueError(f"Multiple normals for donor '{donor.name}'") + normal_for_donor[donor.name] = library.name + + normal_library = {} + for library in self._get_libraries(): + test_sample = library.test_sample + if test_sample.extra_infos.get("extractionType", "") != "DNA": + continue + bio_sample = test_sample.bio_sample + donor = bio_sample.bio_entity + if bio_sample.extra_infos.get("isTumor", True): + normal_library[library.name] = normal_for_donor[donor.name] + return normal_library + + def _optionally_register_pon(self) -> typing.Dict[str, str]: + """ + Register all possible combination of panel of normals: + - WGS PON for all configured WGS tools which require/can use it + - WES PON for all configured WES tools which require/can use it, one for each enrichment kit + + Note that there is no need to specify the genome release, + because the panel_of_normals step used here MUST be in the same project, + so it has the same configuration, and only one genome release is allowed per configuration. + """ + registered_pons = list() + for tool in self.config.tools.wgs: + pon_name = f"wgs.{tool}" + if pon_name in registered_pons: + continue + if self.config[tool].get("panel_of_normals", None) and self.config[ + tool + ].panel_of_normals.get("path_panel_of_normals_step", None): + self.register_sub_workflow( + "panel_of_normals", + self.config[tool].panel_of_normals.path_panel_of_normals_step, + pon_name, + ) + registered_pons.append(pon_name) + for tool in self.config.tools.wes: + for panel in self.config.path_target_interval_list_mapping: + pon_name = f"wes.{tool}.{panel.name}" + if pon_name in registered_pons: + continue + if self.config[tool].get("panel_of_normals", None) and self.config[ + tool + ].panel_of_normals.get("path_panel_of_normals_step", None): + self.register_sub_workflow( + "panel_of_normals", + self.config[tool].panel_of_normals.path_panel_of_normals_step, + pon_name, + ) + registered_pons.append(pon_name) + return registered_pons + + def _get_panel_information(self) -> typing.Dict[str, str]: + # Set default panel + default = None + for panel in self.config.path_target_interval_list_mapping: + if panel.name == "__default__": + default = panel + break + + # Extract library pattern (the "libraryKit" column in samplesheet) + # On output: + # - the panel name and panel path if libraryKit is present & known + # - the default panel path if libraryKit is undefined or not found + # - None for WGS + # - ValueError if libraryType is missing or unknown (not WES nor WGS) + libraryKit = {} + for library in self._get_libraries(): + test_sample = library.test_sample + if test_sample.extra_infos.get("extractionType", "") != "DNA": + continue + + libraryType = library.extra_infos.get("libraryType", None) + if libraryType is None: + raise ValueError(f"Missing 
library type for library '{library.name}'")
+            elif libraryType == "WES":
+                if library.extra_infos.get("libraryKit", None):
+                    for panel in self.config.path_target_interval_list_mapping:
+                        if re.match(panel.pattern, library.extra_infos.get("libraryKit")):
+                            libraryKit[library.name] = panel
+                            break
+                    if library.name not in libraryKit:
+                        libraryKit[library.name] = default
+                else:
+                    libraryKit[library.name] = default
+                if libraryKit[library.name] is None:
+                    raise ValueError(f"Undefined panel for library '{library.name}'")
+            elif libraryType == "WGS":
+                libraryKit[library.name] = None
+            else:
+                raise ValueError(
+                    f"Unknown library type '{libraryType}' for library '{library.name}'"
+                )
+
+        return libraryKit
+
+    def _get_purity(self) -> typing.Dict[str, float]:
+        """Returns the purity value from the 'purity' library extra_infos. Missing otherwise"""
+        purity = {}
+        for library in self._get_libraries():
+            p = library.extra_infos.get("purity", None)
+            if p:
+                try:
+                    p = float(p)
+                    if 0 <= p <= 1:
+                        purity[library.name] = p
+                except (ValueError, TypeError):
+                    pass
+        return purity
+
+    def _get_sex(self) -> typing.Dict[str, Sex]:
+        sex = {}
+        for library in self._get_libraries():
+            donor = library.test_sample.bio_sample.bio_entity
+            donor_sex = donor.extra_infos.get("sex", None)
+            if donor_sex == "male":
+                donor_sex = Sex.MALE
+            elif donor_sex == "female":
+                donor_sex = Sex.FEMALE
+            else:
+                donor_sex = Sex.UNKNOWN
+            sex[library.name] = donor_sex
+        return sex
+
+    def _get_panel_of_normals_path(self, tool: str, panel: LibraryKitDefinition | None) -> str:
+        pon_path = None
+        assert self.config[tool]["panel_of_normals"][
+            "enabled"
+        ], f"Panel of normals not enabled for '{tool}'"
+        assert (
+            self.config[tool]["panel_of_normals"]["origin"] == PanelOfNormalsOrigin.PREVIOUS_STEP
+        ), f"'{tool}' panel of normals not from previous step"
+        if panel is None:
+            pon_id = f"wgs.{tool}"
+        else:
+            pon_id = f"wes.{tool}.{panel.name}"
+        assert pon_id in self.registered_pons, f"Requested panel '{pon_id}' not registered"
+        pon = self.sub_workflows[pon_id]
+        pon_path = pon(f"output/{{mapper}}.{tool}/out/{panel.name}.ext")
+        return pon_path
diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules
new file mode 100644
index 000000000..8ec5d13fe
--- /dev/null
+++ b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules
@@ -0,0 +1,279 @@
+rule somatic_targeted_seq_cnv_calling_cnvkit_access:
+    output:
+        **wf.get_output_files("cnvkit", "access"),
+    params:
+        wf.get_params("cnvkit", "access"),
+    log:
+        **wf.get_log_file("cnvkit", "access"),
+    threads: wf.get_resource("cnvkit", "access", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "access", "time"),
+        memory=wf.get_resource("cnvkit", "access", "memory"),
+        partition=wf.get_resource("cnvkit", "access", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "access", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/access")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_target:
+    input:
+        unpack(wf.get_input_files("cnvkit", "target")),
+    params:
+        wf.get_params("cnvkit", "target"),
+    output:
+        **wf.get_output_files("cnvkit", "target"),
+    log:
+        **wf.get_log_file("cnvkit", "target"),
+    threads: wf.get_resource("cnvkit", "target", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "target", "time"),
+        memory=wf.get_resource("cnvkit", "target", "memory"),
+        partition=wf.get_resource("cnvkit", "target", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "target", "tmpdir"),
+    wrapper:
wf.wrapper_path("cnvkit/target") + + +rule somatic_targeted_seq_cnv_calling_cnvkit_antitarget: + input: + unpack(wf.get_input_files("cnvkit", "antitarget")), + params: + wf.get_params("cnvkit", "antitarget"), + output: + **wf.get_output_files("cnvkit", "antitarget"), + log: + **wf.get_log_file("cnvkit", "antitarget"), + threads: wf.get_resource("cnvkit", "antitarget", "threads") + resources: + time=wf.get_resource("cnvkit", "antitarget", "time"), + memory=wf.get_resource("cnvkit", "antitarget", "memory"), + partition=wf.get_resource("cnvkit", "antitarget", "partition"), + tmpdir=wf.get_resource("cnvkit", "antitarget", "tmpdir"), + wrapper: + wf.wrapper_path("cnvkit/antitarget") + + +rule somatic_targeted_seq_cnv_calling_cnvkit_coverage: + input: + unpack(wf.get_input_files("cnvkit", "coverage")), + params: + wf.get_params("cnvkit", "coverage"), + output: + **wf.get_output_files("cnvkit", "coverage"), + log: + **wf.get_log_file("cnvkit", "coverage"), + threads: wf.get_resource("cnvkit", "coverage", "threads") + resources: + time=wf.get_resource("cnvkit", "coverage", "time"), + memory=wf.get_resource("cnvkit", "coverage", "memory"), + partition=wf.get_resource("cnvkit", "coverage", "partition"), + tmpdir=wf.get_resource("cnvkit", "coverage", "tmpdir"), + wrapper: + wf.wrapper_path("cnvkit/coverage") + + +rule somatic_targeted_seq_cnv_calling_cnvkit_reference: + input: + unpack(wf.get_input_files("cnvkit", "reference")), + params: + wf.get_params("cnvkit", "reference"), + output: + **wf.get_output_files("cnvkit", "reference"), + log: + **wf.get_log_file("cnvkit", "reference"), + threads: wf.get_resource("cnvkit", "reference", "threads") + resources: + time=wf.get_resource("cnvkit", "reference", "time"), + memory=wf.get_resource("cnvkit", "reference", "memory"), + partition=wf.get_resource("cnvkit", "reference", "partition"), + tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), + wrapper: + wf.wrapper_path("cnvkit/reference") + + +# rule somatic_targeted_seq_cnv_calling_cnvkit_flat_reference_panel: +# input: +# unpack(wf.get_input_files("cnvkit", "flat_reference_panel")), +# params: +# wf.get_params("cnvkit", "reference"), +# output: +# **wf.get_output_files("cnvkit", "flat_reference_panel"), +# log: +# **wf.get_log_file("cnvkit", "reference"), +# threads: wf.get_resource("cnvkit", "reference", "threads") +# resources: +# time=wf.get_resource("cnvkit", "reference", "time"), +# memory=wf.get_resource("cnvkit", "reference", "memory"), +# partition=wf.get_resource("cnvkit", "reference", "partition"), +# tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), +# wrapper: +# wf.wrapper_path("cnvkit/reference") + + +# rule somatic_targeted_seq_cnv_calling_cnvkit_flat_reference_wgs: +# input: +# unpack(wf.get_input_files("cnvkit", "flat_reference_wgs")), +# params: +# wf.get_params("cnvkit", "reference"), +# output: +# **wf.get_output_files("cnvkit", "flat_reference_wgs"), +# log: +# **wf.get_log_file("cnvkit", "reference"), +# threads: wf.get_resource("cnvkit", "reference", "threads") +# resources: +# time=wf.get_resource("cnvkit", "reference", "time"), +# memory=wf.get_resource("cnvkit", "reference", "memory"), +# partition=wf.get_resource("cnvkit", "reference", "partition"), +# tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), +# wrapper: +# wf.wrapper_path("cnvkit/reference") + + +rule somatic_targeted_seq_cnv_calling_cnvkit_fix: + input: + unpack(wf.get_input_files("cnvkit", "fix")), + params: + wf.get_params("cnvkit", "fix"), + output: + **wf.get_output_files("cnvkit", "fix"), + 
log:
+        **wf.get_log_file("cnvkit", "fix"),
+    threads: wf.get_resource("cnvkit", "fix", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "fix", "time"),
+        memory=wf.get_resource("cnvkit", "fix", "memory"),
+        partition=wf.get_resource("cnvkit", "fix", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "fix", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/fix")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_segment:
+    input:
+        unpack(wf.get_input_files("cnvkit", "segment")),
+    params:
+        wf.get_params("cnvkit", "segment"),
+    output:
+        **wf.get_output_files("cnvkit", "segment"),
+    log:
+        **wf.get_log_file("cnvkit", "segment"),
+    threads: wf.get_resource("cnvkit", "segment", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "segment", "time"),
+        memory=wf.get_resource("cnvkit", "segment", "memory"),
+        partition=wf.get_resource("cnvkit", "segment", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "segment", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/segment")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_call:
+    input:
+        unpack(wf.get_input_files("cnvkit", "call")),
+    params:
+        wf.get_params("cnvkit", "call"),
+    output:
+        **wf.get_output_files("cnvkit", "call"),
+    log:
+        **wf.get_log_file("cnvkit", "call"),
+    threads: wf.get_resource("cnvkit", "call", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "call", "time"),
+        memory=wf.get_resource("cnvkit", "call", "memory"),
+        partition=wf.get_resource("cnvkit", "call", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "call", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/call")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_bintest:
+    input:
+        unpack(wf.get_input_files("cnvkit", "bintest")),
+    output:
+        **wf.get_output_files("cnvkit", "bintest"),
+    params:
+        wf.get_params("cnvkit", "bintest"),
+    log:
+        **wf.get_log_file("cnvkit", "bintest"),
+    threads: wf.get_resource("cnvkit", "bintest", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "bintest", "time"),
+        memory=wf.get_resource("cnvkit", "bintest", "memory"),
+        partition=wf.get_resource("cnvkit", "bintest", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "bintest", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/bintest")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_plot_diagram:
+    input:
+        unpack(wf.get_input_files("cnvkit", "plot/diagram")),
+    params:
+        wf.get_params("cnvkit", "plot/diagram"),
+    output:
+        **wf.get_output_files("cnvkit", "plot/diagram"),
+    log:
+        **wf.get_log_file("cnvkit", "plot/diagram"),
+    threads: wf.get_resource("cnvkit", "plot/diagram", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "plot/diagram", "time"),
+        memory=wf.get_resource("cnvkit", "plot/diagram", "memory"),
+        partition=wf.get_resource("cnvkit", "plot/diagram", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "plot/diagram", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/plot/diagram")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_plot_scatter:
+    input:
+        unpack(wf.get_input_files("cnvkit", "plot/scatter")),
+    params:
+        wf.get_params("cnvkit", "plot/scatter"),
+    output:
+        **wf.get_output_files("cnvkit", "plot/scatter"),
+    log:
+        **wf.get_log_file("cnvkit", "plot/scatter"),
+    threads: wf.get_resource("cnvkit", "plot/scatter", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "plot/scatter", "time"),
+        memory=wf.get_resource("cnvkit", "plot/scatter", "memory"),
+        partition=wf.get_resource("cnvkit", "plot/scatter", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "plot/scatter", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/plot/scatter")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_report_metrics:
+    input:
+        unpack(wf.get_input_files("cnvkit", "report/metrics")),
+    params:
+        wf.get_params("cnvkit", "report/metrics"),
+    output:
+        **wf.get_output_files("cnvkit", "report/metrics"),
+    log:
+        **wf.get_log_file("cnvkit", "report/metrics"),
+    threads: wf.get_resource("cnvkit", "report/metrics", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "report/metrics", "time"),
+        memory=wf.get_resource("cnvkit", "report/metrics", "memory"),
+        partition=wf.get_resource("cnvkit", "report/metrics", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "report/metrics", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/report/metrics")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_report_segmetrics:
+    input:
+        unpack(wf.get_input_files("cnvkit", "report/segmetrics")),
+    params:
+        wf.get_params("cnvkit", "report/segmetrics"),
+    output:
+        **wf.get_output_files("cnvkit", "report/segmetrics"),
+    log:
+        **wf.get_log_file("cnvkit", "report/segmetrics"),
+    threads: wf.get_resource("cnvkit", "report/segmetrics", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "report/segmetrics", "time"),
+        memory=wf.get_resource("cnvkit", "report/segmetrics", "memory"),
+        partition=wf.get_resource("cnvkit", "report/segmetrics", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "report/segmetrics", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/report/segmetrics")
diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_cnv_calling/model.py
new file mode 100644
index 000000000..ef10a9983
--- /dev/null
+++ b/snappy_pipeline/workflows/somatic_cnv_calling/model.py
@@ -0,0 +1,508 @@
+import enum
+import typing
+from typing import Annotated
+
+from pydantic import Field, model_validator  # , validator
+
+from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, Parallel
+
+
+class WgsCaller(enum.StrEnum):
+    CNVKIT = "cnvkit"
+    CONTROL_FREEC = "control_freec"
+
+
+class WesCaller(enum.StrEnum):
+    CNVKIT = "cnvkit"
+    PURECN = "purecn"
+    SEQUENZA = "sequenza"
+
+
+class Tools(SnappyModel):
+    wgs: Annotated[typing.List[WgsCaller], EnumField(WgsCaller, [])]
+    """WGS calling tools"""
+
+    wes: Annotated[typing.List[WesCaller], EnumField(WesCaller, [])]
+    """WES calling tools"""
+
+
+class Sex(enum.StrEnum):
+    SAMPLESHEET = "samplesheet"
+    """Obtain the sex from the samplesheet"""
+    DIPLOID_ONLY = "diploid_only"
+    """Compute CNV for diploid chromosomes only"""
+    AUTO = "auto"
+    """Automatic sex detection using X/Y coverage"""
+    FEMALE = "female"
+    """Assume all samples are female"""
+    MALE = "male"
+    """Assume all samples are male"""
+    UNKNOWN = "unknown"
+    """Sex is unknown"""
+
+
+class SequencingMethod(enum.StrEnum):
+    WES = "hybrid"
+    PANEL = "amplicon"
+    WGS = "wgs"
+
+
+class LibraryKitDefinition(SnappyModel):
+    """
+    Mapping from enrichment kit to target region BED file, for either computing per-target
+    region coverage or selecting targeted exons.
+
+    The following example will match both the stock IDT library kit and the ones
+    with spike-ins seen from Yale genomics. The path in the example below is
+    mapped to the name "IDT_xGen_V1_0".
+    - name: IDT_xGen_V1_0
+      pattern: "xGen Exome Research Panel V1\\.0*"
+      path: "path/to/targets.bed"
+    """
+
+    name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])]
+
+    pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])]
+
+    path: Annotated[str, Field(examples=["path/to/targets.bed"])]
+
+
+class PanelOfNormalsOrigin(enum.StrEnum):
+    PREVIOUS_STEP = "previous_step"
+    """Use (& possibly create) a panel of normals from the current cohort of normals in the panel_of_normals step"""
+    STATIC = "static"
+    """Use a panel of normals from another cohort or from public data"""
+
+
+class PanelOfNormals(SnappyModel):
+    enabled: bool = False
+    origin: PanelOfNormalsOrigin = PanelOfNormalsOrigin.PREVIOUS_STEP
+    path_panel_of_normals: str = "../panel_of_normals"
+    """
+    Path to panel of normals created in current project
+
+    The panel of normals can be either a file (typically from another project),
+    or from the current project's panel_of_normals step.
+
+    In the latter case, the missing one(s) (in case there is more than one panel, or if there are WES & WGS)
+    will be created when not present.
+    The matching of genome release & exome baits is done on genome name & exome baits md5 checksum.
+    These are computed in the panel of normals step, and saved with the panel itself.
+
+    There is no such matching if a panel of normals file is provided. The validity of the panel of normals is left to the user.
+    """
+
+
+class Mutect2(Parallel):
+    panel_of_normals: PanelOfNormals | None = None
+    """
+    Panel of normals created by the PanelOfNormals program.
+    """
+
+    germline_resource: str
+
+    common_variants: str | None = ""
+    """Common germline variants for contamination estimation"""
+
+    arguments_for_purecn: bool = True
+    """
+    PureCN requires that Mutect2 be called with arguments:
+    --genotype-germline-sites true --genotype-pon-sites true
+    """
+
+    extra_arguments: Annotated[
+        typing.List[str],
+        # AfterValidator(argument),
+        Field(
+            examples=[
+                "--read-filter CigarContainsNoNOperator",
+                "--annotation AssemblyComplexity BaseQuality",
+            ]
+        ),
+    ] = []
+    """
+    List of additional Mutect2 arguments.
+    Each additional argument must be of the form:
+    "--<argument> <value>"
+    For example, to filter reads prior to calling & to add annotations to the output vcf:
+    - "--read-filter CigarContainsNoNOperator"
+    - "--annotation AssemblyComplexity BaseQuality"
+    """
+
+    window_length: int = 300000000
+
+
+class VariantTool(enum.StrEnum):
+    MUTECT2 = "mutect2"
+
+
+class Variant(SnappyModel):
+    enabled: bool = False
+    tool: VariantTool | None = None
+
+    mutect2: Mutect2 | None = None
+
+
+class Ascat(SnappyModel):
+    """TODO: configure purity tools (except for PureCN)"""
+
+    pass
+
+
+class Sequenza(SnappyModel):
+    pass
+
+
+class ControlFreec(SnappyModel):
+    pass
+
+
+class PureCn(SnappyModel):
+    panel_of_normals: PanelOfNormals
+    """
+    Panel of normals created by the NormalDB.R script.
+    This is required even if the normal/tumor paired mode won't use it.
+ """ + + variants: VariantTool + + mappability: str = "" + """ + GRCh38: + /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw + """ + + reptiming: str = "" + """Nothing for GRCh38""" + + seed: int = 1234567 + extra_commands: typing.Dict[str, typing.Any] = { + "model": "betabin", + "fun-segmentation": "PSCBS", + "post-optimize": "", + } + """Recommended extra arguments for PureCN, extra_commands: {} to clear them all""" + + path_container: Annotated[ + str, Field(examples=["../panel_of_normals/work/containers/out/purecn.simg"]) + ] + """Conda installation not working well, container is required""" + + path_intervals: Annotated[ + str, + Field( + examples=[ + "../panel_of_normals/output/purecn/out/_.list" + ] + ), + ] + + +class PurityTool(enum.StrEnum): + ASCAT = "ascat" + PURECN = "purecn" + + +class Purity(SnappyModel): + enabled: bool = False + + ignore_samplesheet: bool = False + """Discard purity values in samplesheet when they exist""" + default_value: float | None = None + """Purity value for all samples""" + + tool: PurityTool | None = None + """Tool used for purity estimation, if not set, try samplesheet, otherwise default_value""" + + ascat: Ascat | None = None + + +class CnvkitSegmentationMethod(enum.StrEnum): + CBS = "cbs" + FLASSO = "flasso" + HAAR = "haar" + HMM = "hmm" + HMM_TUMOR = "hmm-tumor" + HMM_GERMLINE = "hmm-germline" + NONE = "none" + + +class CnvkitCallingMethod(enum.StrEnum): + THRESHOLD = "threshold" + CLONAL = "clonal" + NONE = "none" + + +class CnvkitCenterMethod(enum.StrEnum): + MEAN = "mean" + MEDIAN = "median" + MODE = "mode" + BIWEIGHT = "biweight" + + +class CnvkitFilterMethod(enum.StrEnum): + AMPDEL = "ampdel" + CN = "cn" + CI = "ci" + SEM = "sem" + + +class CnvkitAccess(SnappyModel): + exclude: Annotated[ + str | None, + Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/cnvkit/access-5k-mappable.grch37.bed" + ] + ), + ] = None + """Regions accessible to mapping""" + + min_gap_size: int = 5000 + """Minimum gap size between accessible sequence regions. Regions separated by less than this distance will be joined together.""" + + +class CnvkitTarget(SnappyModel): + split: bool = False + """Split large tiled intervals into smaller, consecutive targets.""" + avg_size: float = 800 / 3 + """Average size of split target bins (results are approximate)""" + + +class CnvkitAntitarget(SnappyModel): + avg_size: float = 150000 + """Average size of split antitarget bins (results are approximate)""" + min_size: float | None = None + """Minimum size of antitarget bins (smaller regions are dropped). When missing, 1/16 avg size""" + + +class CnvkitCoverage(SnappyModel): + count: bool = False + """Get read depths by counting read midpoints within each bin.""" + min_mapq: int = 0 + """Minimum mapping quality score (phred scale 0-60) to count a read for coverage depth.""" + + +class CnvkitReference(SnappyModel): + cluster: bool = False + """Calculate and store summary stats for clustered subsets of the normal samples with similar coverage profiles.""" + min_cluster_size: int = 4 + """Minimum cluster size to keep in reference profiles.""" + no_gc: bool = False + """Skip GC correction.""" + no_edge: bool = None + """Skip edge correction. 
Automatic selection when None (True for WGS & Panel, False for WES)""" + no_rmask: bool = False + """Skip RepeatMasker correction.""" + + +class CnvkitFix(SnappyModel): + cluster: bool = False + """Compare and use cluster-specific values present in the reference profile.""" + no_gc: bool = False + """Skip GC correction.""" + no_edge: bool = False + """Skip edge correction.""" + no_rmask: bool = False + """Skip RepeatMasker correction.""" + + +class CnvkitSegment(SnappyModel): + method: CnvkitSegmentationMethod = CnvkitSegmentationMethod.CBS + """Segmentation method, or 'NONE' for chromosome arm-level averages as segments""" + threshold: float = 0.0001 + """Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation. For HMM methods, this is the smoothing window size.""" + drop_low_coverage: bool = False + """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" + drop_outliers: float = 10 + """Drop outlier bins more than this many multiples of the 95th quantile away from the average within a rolling window. Set to 0 for no outlier filtering.""" + smooth_cbs: bool = False + + min_variant_depth: float = 20 + """Minimum read depth for a SNV to be displayed in the b-allele frequency plot.""" + zygocity_freq: float = 0.25 + """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" + + @model_validator(mode="after") + def ensure_smooth_for_cbs_only(self) -> typing.Self: + if self.smooth_cbs and self.method != CnvkitSegmentationMethod.CBS: + raise ValueError("'smooth_cbs' option can be used only with 'CBS' segmentation method") + return self + + +class CnvkitCall(SnappyModel): + method: CnvkitCallingMethod = CnvkitCallingMethod.THRESHOLD + """Calling method.""" + thresholds: str | None = None + """Hard thresholds for calling each integer copy number, separated by commas""" + center: CnvkitCenterMethod | None = CnvkitCenterMethod.MEDIAN + """Re-center the log2 ratio values using this estimator of the center or average value. ('median' if no argument given.)""" + center_at: float | None = None + """Subtract a constant number from all log2 ratios. 
For "manual" re-centering.""" + filter: CnvkitFilterMethod | None = None + """Merge segments flagged by the specified filter(s) with the adjacent segment(s).""" + ploidy: float | None = 2 + """Ploidy of the sample cells.""" + drop_low_coverage: bool = False + """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" + + min_variant_depth: float = 20 + """Minimum read depth for a SNV to be displayed in the b-allele frequency calculation.""" + zygocity_freq: float = 0.25 + """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" + + +class CnvkitBintest(SnappyModel): + alpha: float = 0.005 + """Significance threhold.""" + target: bool = False + """Test target bins only; ignore off-target bins.""" + + +class CnvkitPlotDiagram(SnappyModel): + threshold: float = 0.5 + """Copy number change threshold to label genes.""" + min_probes: int = 3 + """Minimum number of covered probes to label a gene.""" + no_shift_xy: bool = False + + +class CnvkitPlotScatter(SnappyModel): + antitarget_marker: str | None = None + """Plot antitargets using this symbol when plotting in a selected chromosomal region.""" + by_bin: bool = False + """Plot data x-coordinates by bin indices instead of genomic coordinates.""" + segment_color: str | None = None + """Plot segment lines in this color. Value can be any string accepted by matplotlib.""" + trend: bool = False + """Draw a smoothed local trendline on the scatter plot.""" + y_max: float | None = None + """y-axis upper limit.""" + y_min: float | None = None + """y-axis lower limit.""" + fig_size: typing.Tuple[float, float] | None = None + """Width and height of the plot in inches.""" + + min_variant_depth: float = 20 + """Minimum read depth for a SNV to be displayed in the b-allele frequency calculation.""" + zygocity_freq: float = 0.25 + """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" + + +class CnvkitPlot(SnappyModel): + diagram: CnvkitPlotDiagram = CnvkitPlotDiagram() + scatter: CnvkitPlotScatter = CnvkitPlotScatter() + + +class CnvkitReportMetrics(SnappyModel): + drop_low_coverage: bool = False + """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" + + +class CnvkitReportSegmetrics(SnappyModel): + drop_low_coverage: bool = False + """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" + alpha: float = 0.05 + """Level to estimate confidence and prediction intervals; use with --ci and --pi.""" + bootstrap: int = 100 + """Number of bootstrap iterations to estimate confidence interval; use with --ci.""" + + +class CnvkitReport(enum.StrEnum): + METRICS = "metrics" + SEGMETRICS = "segmetrics" + + +class Cnvkit(SnappyModel): + panel_of_normals: PanelOfNormals | None = None + + variants: VariantTool | None = None + + purity: Purity + """ + When present, purity estimates can be used for calling segments. The requested tool must be configured. + Or the purity can be provided in the samplesheet, as an extra information attached to the library. + + Note that PureCN cannot be used to estimate purity for WGS samples (because PureCN is WES & Panel-only). + TODO: This should be tested by a validation method, I don't know how to do (Till help!!) + TODO: The exact name is not yet set. 
+ """ + + access: CnvkitAccess = CnvkitAccess() + target: CnvkitTarget = CnvkitTarget() + antitarget: CnvkitAntitarget = CnvkitAntitarget() + coverage: CnvkitCoverage = CnvkitCoverage() + + reference: CnvkitReference | None = None + + @model_validator(mode="after") + def set_default_reference(self) -> typing.Self: + if self.reference is None and not self.panel_of_normals.enabled: + self.reference = CnvkitReference() + return self + + fix: CnvkitFix = CnvkitFix() + segment: CnvkitSegment = CnvkitSegment() + call: CnvkitCall = CnvkitCall() + bintest: CnvkitBintest = CnvkitBintest() + + use_male_reference: bool = False + """Create/use a male reference. Must be identical to panel of normals creation, when using one""" + + plots: typing.List[CnvkitPlot] = [] + + reports: typing.List[CnvkitReport] = [] + metrics: CnvkitReportMetrics | None = None + + # @validator("metrics") + # def get_default_reference(cls, v, values) -> CnvkitReportMetrics | None: + # if v is None and "metrics" in values["reports"]: + # return CnvkitReportMetrics() + # return None + + segmetrics: CnvkitReportSegmetrics | None = None + + # @validator("segmetrics") + # def get_default_reference(cls, v, values) -> CnvkitReportSegmetrics | None: + # if v is None and "segmetrics" in values["reports"]: + # return CnvkitReportSegmetrics() + # return None + + +class SomaticCnvCalling(SnappyStepModel): + path_ngs_mapping: str + """Path to bam files""" + + tools: Tools + """Tools for WGS & WES data""" + + path_target_interval_list_mapping: typing.List[LibraryKitDefinition] | None = None + + sex: Sex = Sex.DIPLOID_ONLY + + cnvkit: Cnvkit + purecn: PureCn | None = None + sequenza: Sequenza | None = None + control_freec: ControlFreec | None = None + + mutect2: Mutect2 | None = None + + default_ploidy: float | None = None + + # @model_validator(mode="after") + # def ensure_single_pon_step(self) -> typing.Self: + # """ + # I am not sure this is absolutely required. 
+ # I am trying to avoid registering the panel_of_normals step when initializing SomaticCnvCalling + # """ + # pon_steps = set() + # for tool in itertools.chain(self.tools.wgs, self.tools.wes): + # tool_config = getattr(self, tool) + # if ( + # tool_config + # and getattr(tool_config, "use_panel_of_normals") + # and tool_config.use_panel_of_normals == PanelOfNormalsUse.PREVIOUS_STEP + # ): + # pon_steps.add(str(tool_config.panel_of_normals.panel_of_normals)) + # if len(pon_steps) > 1: + # raise ValueError("Too many panel_of_normals steps") + # return self From dae4c28a231cf2b78b1b3bbe988447b8c3a28aef Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 6 Nov 2024 14:40:10 +0100 Subject: [PATCH 08/46] refactor: renamed params to args, automation of md5 checksum handling, fix path in cnvkit wrappers, better handling of arguments & fix method to obtain input, parameters & output from snakemake --- .../workflows/panel_of_normals/Snakefile | 46 +- .../workflows/panel_of_normals/__init__.py | 158 +++---- .../workflows/panel_of_normals/model.py | 13 +- .../wrappers/cnvkit/access/wrapper.py | 9 +- .../wrappers/cnvkit/antitarget/wrapper.py | 35 +- .../wrappers/cnvkit/autobin/environment.yaml | 1 + .../wrappers/cnvkit/autobin/wrapper.py | 36 ++ .../wrappers/cnvkit/cnvkit_wrapper.py | 30 +- .../wrappers/cnvkit/coverage/wrapper.py | 20 +- .../wrappers/cnvkit/reference/wrapper.py | 38 +- .../wrappers/cnvkit/target/wrapper.py | 16 +- .../test_workflows_panel_of_normals.py | 51 +-- .../test_workflows_panel_of_normals_wgs.py | 395 ------------------ 13 files changed, 255 insertions(+), 593 deletions(-) create mode 120000 snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py delete mode 100644 tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index 5acd85d39..923431644 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -114,7 +114,7 @@ rule panel_of_normals_access_run: log: **wf.get_log_file("access", "run"), params: - **wf.get_params("access", "run"), + **wf.get_args("access", "run"), wrapper: wf.wrapper_path("cnvkit/access") @@ -135,7 +135,7 @@ rule panel_of_normals_cnvkit_access: log: **wf.get_log_file("cnvkit", "access"), params: - **{"args": wf.get_params("cnvkit", "access")}, + **{"args": wf.get_args("cnvkit", "access")}, wrapper: wf.wrapper_path("cnvkit/access") @@ -153,7 +153,7 @@ rule panel_of_normals_cnvkit_autobin: log: **wf.get_log_file("cnvkit", "autobin"), params: - **{"args": wf.get_params("cnvkit", "autobin")}, + **{"args": wf.get_args("cnvkit", "autobin")}, wrapper: wf.wrapper_path("cnvkit/autobin") @@ -171,7 +171,7 @@ rule panel_of_normals_cnvkit_target: log: **wf.get_log_file("cnvkit", "target"), params: - **{"args": wf.get_params("cnvkit", "target")}, + **{"args": wf.get_args("cnvkit", "target")}, wrapper: wf.wrapper_path("cnvkit/target") @@ -189,7 +189,7 @@ rule panel_of_normals_cnvkit_antitarget: log: **wf.get_log_file("cnvkit", "antitarget"), params: - **{"args": wf.get_params("cnvkit", "target")}, + **{"args": wf.get_args("cnvkit", "antitarget")}, wrapper: wf.wrapper_path("cnvkit/antitarget") @@ -207,7 +207,7 @@ rule panel_of_normals_cnvkit_coverage: log: **wf.get_log_file("cnvkit", "coverage"), params: - **{"args": wf.get_params("cnvkit", "target")}, + **{"args": wf.get_args("cnvkit", 
"coverage")}, wrapper: wf.wrapper_path("cnvkit/coverage") @@ -228,27 +228,27 @@ rule panel_of_normals_cnvkit_create_panel: log: **wf.get_log_file("cnvkit", "create_panel"), params: - **{"args": wf.get_params("cnvkit", "target")}, + **{"args": wf.get_args("cnvkit", "create_panel")}, wrapper: wf.wrapper_path("cnvkit/reference") -rule panel_of_normals_cnvkit_report: - input: - unpack(wf.get_input_files("cnvkit", "report")), - output: - **wf.get_output_files("cnvkit", "report"), - threads: wf.get_resource("cnvkit", "report", "threads") - resources: - time=wf.get_resource("cnvkit", "report", "time"), - memory=wf.get_resource("cnvkit", "report", "memory"), - partition=wf.get_resource("cnvkit", "report", "partition"), - log: - **wf.get_log_file("cnvkit", "report"), - params: - **{"args": wf.get_params("cnvkit", "target")}, - wrapper: - wf.wrapper_path("cnvkit/report") +# rule panel_of_normals_cnvkit_report: +# input: +# unpack(wf.get_input_files("cnvkit", "report")), +# output: +# **wf.get_output_files("cnvkit", "report"), +# threads: wf.get_resource("cnvkit", "report", "threads") +# resources: +# time=wf.get_resource("cnvkit", "report", "time"), +# memory=wf.get_resource("cnvkit", "report", "memory"), +# partition=wf.get_resource("cnvkit", "report", "partition"), +# log: +# **wf.get_log_file("cnvkit", "report"), +# params: +# **{"args": wf.get_args("cnvkit", "report")}, +# wrapper: +# wf.wrapper_path("cnvkit/report") # Panel of normals (PureCN) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index 77d76abc9..07656d4a2 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -273,19 +273,12 @@ def _get_extra_info(library): @dictify def _get_log_file(tpl, has_sh=False): """Return all log files files""" - ext_dict = { - "conda_list": "conda_list.txt", - "conda_list_md5": "conda_list.txt.md5", - "conda_info": "conda_info.txt", - "conda_info_md5": "conda_info.txt.md5", - "log": "log", - "log_md5": "log.md5", - } + ext_dict = {"conda_list": "conda_list.txt", "conda_info": "conda_info.txt", "log": "log"} if has_sh: ext_dict["sh"] = "sh" - ext_dict["sh_md5"] = ext_dict["sh"] + ".md5" for key, ext in ext_dict.items(): yield key, tpl + "." + ext + yield key + "_md5", tpl + "." 
+ ext + ".md5" class PureCnStepPart(PanelOfNormalsStepPart): @@ -564,47 +557,49 @@ def get_input_files(self, action): "antitarget": self._get_input_files_antitarget, "coverage": self._get_input_files_coverage, "create_panel": self._get_input_files_create_panel, - "report": self._get_input_files_report, } return mapping[action] - def get_params(self, action): + def get_args(self, action): """Return panel of normal files""" if action == "access": - return self._get_params_access + return self._get_args_access elif action == "autobin": - return self._get_params_autobin + return self._get_args_autobin elif action == "target": - return self._get_params_target + return self._get_args_target elif action == "antitarget": - return self._get_params_antitarget + return self._get_args_antitarget elif action == "coverage": - return self._get_params_coverage + return self._get_args_coverage elif action == "create_panel": - return self._get_params_create_panel - elif action == "report": - return self._get_params_report + return self._get_args_create_panel else: self._validate_action(action) def get_output_files(self, action): """Return panel of normal files""" + output_files = None if action == "access": - return self._get_output_files_access() + output_files = self._get_output_files_access() elif action == "autobin": - return self._get_output_files_autobin() + output_files = self._get_output_files_autobin() elif action == "target": - return self._get_output_files_target() + output_files = self._get_output_files_target() elif action == "antitarget": - return self._get_output_files_antitarget() + output_files = self._get_output_files_antitarget() elif action == "coverage": - return self._get_output_files_coverage() + output_files = self._get_output_files_coverage() elif action == "create_panel": - return self._get_output_files_create_panel() - elif action == "report": - return self._get_output_files_report() + output_files = self._get_output_files_create_panel() else: self._validate_action(action) + return dict( + zip( + list(output_files.keys()) + [k + "_md5" for k in output_files.keys()], + list(output_files.values()) + [v + ".md5" for v in output_files.values()], + ) + ) @classmethod def get_log_file(cls, action): @@ -616,7 +611,6 @@ def get_log_file(cls, action): "antitarget": "work/{mapper}.cnvkit/log/cnvkit.antitarget", "coverage": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage", "create_panel": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals", - "report": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report", } assert action in cls.actions return cls._get_log_file(tpls[action], has_sh=True) @@ -624,8 +618,11 @@ def get_log_file(cls, action): def _get_input_files_access(self, wildcards): return {} - def _get_params_access(self, wildcards): - return {"reference": self.w_config.static_data_config.reference.path} + def _get_args_access(self, wildcards): + return { + "reference": self.w_config.static_data_config.reference.path, + "min_gap_size": self.config.cnvkit.min_gap_size, + } def _get_output_files_access(self): return {"access": "work/{mapper}.cnvkit/out/cnvkit.access.bed"} @@ -645,11 +642,11 @@ def _get_input_files_autobin(self, wildcards): input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) return input_files - def _get_params_autobin(self, wildcards): + def _get_args_autobin(self, wildcards): assert ( self.libraryType == LibraryType.WGS ), "Trying to estimate average target size for non-WGS samples" - params = {} + 
params = {"bp_per_bin": 50000} if self.name in self.config.tools and self.config.cnvkit: if self.config.cnvkit.get("access", "") == "": params["method"] = "wgs" @@ -659,11 +656,7 @@ def _get_params_autobin(self, wildcards): return params def _get_output_files_autobin(self): - return { - "result": "work/{mapper}.cnvkit/out/cnvkit.autobin.txt", - "target": "$TMPDIR/{mapper}.targets.bed", - "antitarget": "$TMPDIR/{mapper}.antitarget.bed", - } + return {"result": "work/{mapper}.cnvkit/out/cnvkit.autobin.txt"} def _get_input_files_target(self, wildcards): """Helper wrapper function to estimate target average size in wgs mode""" @@ -676,26 +669,23 @@ def _get_input_files_target(self, wildcards): ) return input_files - def _get_params_target(self, wildcards): + def _get_args_target(self, wildcards): params = {} if self.name in self.config.tools: if self.libraryType == LibraryType.WES: params["target"] = self.config.cnvkit.path_target_regions if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": params["target"] = self.config.cnvkit.get("access") - if "features" in self.w_config.static_data_config: + if self.w_config.static_data_config.get("features", None): params["annotate"] = self.w_config.static_data_config.features.path - if self.config.cnvkit.get("split", False): + if self.config.cnvkit.get("split", True): params["split"] = True if self.config.cnvkit.get("target_avg_size", None): params["avg_size"] = self.config.cnvkit.get("target_avg_size") return params def _get_output_files_target(self): - return { - "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed", - "target_md5": "work/{mapper}.cnvkit/out/cnvkit.target.bed.md5", - } + return {"target": "work/{mapper}.cnvkit/out/cnvkit.target.bed"} def _get_input_files_antitarget(self, wildcards): """Helper wrapper function for computing antitarget locations""" @@ -705,22 +695,19 @@ def _get_input_files_antitarget(self, wildcards): "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed".format(**wildcards), } - def _get_params_antitarget(self, widlcards): + def _get_args_antitarget(self, wildcards): params = {} if self.name in self.config.tools: params = { "avg_size": self.config.cnvkit.antitarget_avg_size, "min_size": self.config.cnvkit.min_size, } - if self.config.cnvkit.get("access", "") == "": + if self.config.cnvkit.get("access", "") != "": params["access"] = self.config.cnvkit.get("access") return params def _get_output_files_antitarget(self): - return { - "antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed", - "antitarget_md5": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed.md5", - } + return {"antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed"} def _get_input_files_coverage(self, wildcards): """Helper wrapper function for computing coverage""" @@ -733,7 +720,7 @@ def _get_input_files_coverage(self, wildcards): "bai": bam + ".bai", } - def _get_params_coverage(self, wildcards): + def _get_args_coverage(self, wildcards): params = {} if self.name in self.config.tools: params = { @@ -747,7 +734,6 @@ def _get_params_coverage(self, wildcards): def _get_output_files_coverage(self): return { "coverage": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn", - "coverage_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn.md5", } def _get_input_files_create_panel(self, wildcards): @@ -763,9 +749,9 @@ def _get_input_files_create_panel(self, wildcards): ] else: antitargets = [] - return {"references": targets + antitargets} + return 
{"normals": targets + antitargets} - def _get_params_create_panel(self, wildcards): + def _get_args_create_panel(self, wildcards): params = {} if self.name in self.config.tools: params = { @@ -774,8 +760,8 @@ def _get_params_create_panel(self, wildcards): if self.config.cnvkit.get("cluster", False): params["cluster"] = True params["min_cluster_size"] = self.config.cnvkit.min_cluster_size - if "sample_sex" in self.config.cnvkit: - params["sample_sex"] = self.config.cnvkit.gender + if self.config.cnvkit.get("sample_sex"): + params["sample_sex"] = self.config.cnvkit.sample_sex if self.config.cnvkit.get("male_reference", False): params["male_reference"] = True if self.config.cnvkit.get("diploid_parx_genome", None): @@ -792,33 +778,7 @@ def _get_params_create_panel(self, wildcards): return params def _get_output_files_create_panel(self): - return { - "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", - "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", - } - - def _get_input_files_report(self, wildcards): - """Helper wrapper function for the panel of normals report""" - tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn" - targets = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries - ] - tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" - antitargets = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries - ] - return { - "target": targets, - "antitarget": antitargets, - } - - def _get_output_files_report(self): - return { - "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", - "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5", - "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv", - "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", - } + return {"panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn"} class AccessStepPart(PanelOfNormalsStepPart): @@ -847,7 +807,7 @@ def get_output_files(self, action): tpl = "work/access/out/access.bed" return {"access": tpl, "access_md5": tpl + ".md5"} - def get_params(self, action): + def get_args(self, action): # Validate action self._validate_action(action) if self.name in self.config.tools: @@ -914,32 +874,25 @@ def get_result_files(self): """ result_files = [] - log_ext_list = [ - "log", - "log.md5", - "conda_list.txt", - "conda_list.txt.md5", - "conda_info.txt", - "conda_info.txt.md5", - ] + log_ext_list = ["log", "conda_list.txt", "conda_info.txt"] if "mutect2" in set(self.config.tools) & set(TOOLS): tpl = "output/{mapper}.mutect2/out/{mapper}.mutect2.panel_of_normals.{ext}" - ext_list = ("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") + ext_list = ("vcf.gz", "vcf.gz.tbi") result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/{mapper}.mutect2/out/{mapper}.mutect2.genomicsDB.{ext}" - ext_list = ("tar.gz", "tar.gz.md5") + ext_list = ("tar.gz",) result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/{mapper}.mutect2/log/{mapper}.mutect2.panel_of_normals.{ext}" result_files.extend(self._expand_result_files(tpl, log_ext_list)) if "cnvkit" in set(self.config.tools) & set(TOOLS): tpls = [ - ("output/{mapper}.cnvkit/out/cnvkit.target.{ext}", ("bed", "bed.md5")), - ("output/{mapper}.cnvkit/out/cnvkit.antitarget.{ext}", ("bed", "bed.md5")), + ("output/{mapper}.cnvkit/out/cnvkit.target.{ext}", ("bed",)), + 
("output/{mapper}.cnvkit/out/cnvkit.antitarget.{ext}", ("bed",)), ( "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}", - ("cnn", "cnn.md5"), + ("cnn",), ), # ( # "output/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.{ext}", @@ -958,7 +911,7 @@ def get_result_files(self): "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.{ext}", ] for tpl in tpls: - result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh", "sh.md5"])) + result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh"])) # tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{ext}" # result_files.extend(self._expand_result_files(tpl, ("", ".md5"))) @@ -966,14 +919,14 @@ def get_result_files(self): tpl = "output/access/out/access.bed" result_files.extend([tpl + md5 for md5 in ("", ".md5")]) tpl = "output/access/log/access.{ext}" - result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh", "sh.md5"])) + result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh"])) if "purecn" in set(self.config.tools) & set(TOOLS): tpl = "output/{mapper}.purecn/out/{mapper}.purecn.panel_of_normals.{ext}" - ext_list = ("rds", "rds.md5") + ext_list = ("rds",) result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/{mapper}.purecn/out/{mapper}.purecn.mapping_bias.{ext}" - ext_list = ("rds", "rds.md5") + ext_list = ("rds",) result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/{mapper}.purecn/log/{mapper}.purecn.panel_of_normals.{ext}" result_files.extend(self._expand_result_files(tpl, log_ext_list)) @@ -981,7 +934,7 @@ def get_result_files(self): self.config.purecn.enrichment_kit_name, self.config.purecn.genome_name, ) - ext_list = ("list", "list.md5", "bed.gz", "bed.gz.md5", "bed.gz.tbi", "bed.gz.tbi.md5") + ext_list = ("list", "bed.gz", "bed.gz.tbi") result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/purecn/log/{}_{}.{{ext}}".format( self.config.purecn.enrichment_kit_name, @@ -995,3 +948,4 @@ def _expand_result_files(self, tpl, ext_list): for mapper in self.w_config.step_config["ngs_mapping"].tools.dna: for ext in ext_list: yield tpl.format(mapper=mapper, ext=ext) + yield tpl.format(mapper=mapper, ext=ext) + ".md5" diff --git a/snappy_pipeline/workflows/panel_of_normals/model.py b/snappy_pipeline/workflows/panel_of_normals/model.py index b7995e32c..802b5e417 100644 --- a/snappy_pipeline/workflows/panel_of_normals/model.py +++ b/snappy_pipeline/workflows/panel_of_normals/model.py @@ -75,19 +75,22 @@ class CnvKit(SnappyModel): access: str = "" """Access bed file (output/cnvkit.access/out/cnvkit.access.bed when create_cnvkit_acces was run)""" - target_avg_size: float | None = None + min_gap_size: int = 5000 + """[access] Minimum gap size between accessible regions""" + + target_avg_size: int | None = None """[target] Average size of split target bins (None: use default value, or use autobin for wgs)""" - split: bool = False + split: bool = True """[target] Split large intervals into smaller ones""" - bp_per_bin: float = 50000 + bp_per_bin: int = 50000 """[autobin] Expected base per bin""" - antitarget_avg_size: float = 0 + antitarget_avg_size: int = 0 """[antitarget] Average size of antitarget bins (0: use default value)""" - min_size: float = 0 + min_size: int = 0 """[antitarget] Min size of antitarget bins (0: use default value)""" min_mapq: int = 0 diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index 5954500e9..c93c981c5 100644 
--- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
@@ -15,15 +15,18 @@
 __author__ = "Eric Blanc"
 __email__ = "eric.blanc@bih-charite.de"
 
+args = snakemake.params.get("args", {})
+
 cmd = r"""
 cnvkit.py access \
     -o {snakemake.output.access} \
-    --min-gap-size {snakemake.params.min_gap_size} \
+    --min-gap-size {args[min_gap_size]} \
     {exclude} \
-    {snakemake.params.reference}
+    {args[reference]}
 """.format(
     snakemake=snakemake,
-    exclude=" ".join([f"--exclude {x}" for x in snakemake.params.exclude]) if snakemake.params.exclude else "",
+    args=args,
+    exclude=" ".join([f"--exclude {x}" for x in args["exclude"]]) if "exclude" in args else "",
 )
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py
index da3b440f0..596626831 100644
--- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py
@@ -1,20 +1,35 @@
 # -*- coding: utf-8 -*-
 """Wrapper for cnvkit.py antitarget"""
 
+import os
+import sys
+
+# The following is required for being able to import snappy_wrappers modules
+# inside wrappers. These run in an "inner" snakemake process which uses its
+# own conda environment which cannot see the snappy_pipeline installation.
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+sys.path.insert(0, base_dir)
+
 from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
 
 __author__ = "Eric Blanc"
 __email__ = "eric.blanc@bih-charite.de"
 
-cmd = r"""
-cnvkit.py antitarget \
-    -o {snakemake.output.region} \
-    --avg-size {snakemake.params.avg_size} --min-size {snakemake.params.min_size} \
-    {access} \
-    {snakemake.input.target}
-""".format(
-    snakemake=snakemake,
-    access=f"--access {snakemake.params.access}" if snakemake.params.access else "",
-)
+args = snakemake.params.get("args", {})
+
+if snakemake.input.get("target", "") != "":
+    cmd = r"""
+    cnvkit.py antitarget \
+        -o {snakemake.output.antitarget} \
+        --avg-size {args[avg_size]} --min-size {args[min_size]} \
+        {access} \
+        {snakemake.input.target}
+    """.format(
+        snakemake=snakemake,
+        args=args,
+        access=f"--access {args['access']}" if "access" in args else "",
+    )
+else:
+    cmd = f"touch {snakemake.output.antitarget}"
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml b/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml
new file mode 120000
index 000000000..2e107ac86
--- /dev/null
+++ b/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml
@@ -0,0 +1 @@
+../environment.yaml
\ No newline at end of file
diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py
new file mode 100644
index 000000000..ce913b505
--- /dev/null
+++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+"""Wrapper for cnvkit.py autobin (replicating cnvkit batch)"""
+
+import os
+import sys
+
+# The following is required for being able to import snappy_wrappers modules
+# inside wrappers. These run in an "inner" snakemake process which uses its
+# own conda environment which cannot see the snappy_pipeline installation.
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" + +args = snakemake.params.get("args", {}) + +cmd = r""" +cnvkit.py autobin --method {args[method]} \ + {out_target} {out_antitarget} \ + {access} {target} \ + --bp-per-bin {args[bp_per_bin]} \ + {snakemake.input.bams} \ + > {snakemake.output.result} +""".format( + snakemake=snakemake, + args=args, + out_target=f"--target-output-bed {snakemake.output.target}" if snakemake.output.get("target", "") != "" else "", + out_antitarget=f"--antitarget-output-bed {snakemake.output.antitarget}" if snakemake.output.get("antitarget", "") != "" else "", + access=f"--access {snakemake.input.access}" if snakemake.input.get("access", None) else "", + target=f"--targets {args['target']}" if "target" in args else "", +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py b/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py index a6a8accdf..08c734504 100644 --- a/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py @@ -1,5 +1,7 @@ """Abstract wrapper for cnvkit.py""" +import os +import stat import textwrap from snakemake.shell import shell @@ -10,7 +12,7 @@ class CnvkitWrapper: header = r""" - # Also pipe everything to log file + # Pipe everything to log file if [[ -n "{snakemake.log.log}" ]]; then if [[ "$(set +e; tty; set -e)" != "" ]]; then rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) @@ -21,13 +23,18 @@ class CnvkitWrapper: fi fi + # Compute md5 except when filename ends with .md5 compute_md5() {{ fn=$1 f=$(basename $fn) d=$(dirname $fn) - pushd $d 1> /dev/null 2>&1 - md5sum $f > $f.md5 - popd 1> /dev/null 2>&1 + ext="${{f##*.}}" + if [[ $ext != "md5" ]] + then + pushd $d 1> /dev/null 2>&1 + md5sum $f > $f.md5 + popd 1> /dev/null 2>&1 + fi }} # Write out information about conda installation. @@ -72,14 +79,23 @@ def preamble(self): def run(self) -> None: self.preamble() - with open(self.snakemake.log.sh, "wt") as f: + cmd_path = self.snakemake.log.sh + with open(cmd_path, "wt") as f: print( textwrap.dedent( - "\n".join((CnvkitWrapper.header, self.command, CnvkitWrapper.footer)) + "\n".join( + ( + CnvkitWrapper.header.format(snakemake=self.snakemake), + self.command, + CnvkitWrapper.footer.format(snakemake=self.snakemake), + ) + ) ), file=f, ) + current_permissions = stat.S_IMODE(os.lstat(cmd_path).st_mode) + os.chmod(cmd_path, current_permissions | stat.S_IXUSR) - shell(self.snakemake.log.sh) + shell(cmd_path) shell(CnvkitWrapper.md5_log.format(log=str(self.snakemake.log.log))) diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py index 678a7e348..a71ef8e8e 100644 --- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py @@ -1,20 +1,32 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py coverage""" +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. 
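# A minimal sketch of the contract established by the abstract CnvkitWrapper
# above, which all of these wrappers follow (the example subclass name is
# illustrative, not part of the patch): a concrete wrapper assembles a cmd
# string, optionally overrides preamble() to derive values before the command
# is rendered, and then hands over to run(), which writes the rendered script
# to snakemake.log.sh, marks it executable, executes it, and checksums the
# log files:
#
#     from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
#
#     class CnvkitWrapperExample(CnvkitWrapper):
#         def preamble(self):
#             self.extra = "--count"  # derive values needed by the command
#
#     cmd = r"cnvkit.py coverage -o {snakemake.output.coverage} ..."
#     CnvkitWrapperExample(snakemake, cmd).run()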
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper __author__ = "Eric Blanc" __email__ = "eric.blanc@bih-charite.de" +args = snakemake.params.get("args", {}) + cmd = r""" -cnvkit.py coverage --processes {snakemake.params.processes} \ +cnvkit.py coverage --processes {snakemake.resources._cores} \ -o {snakemake.output.coverage} \ - --fasta {snakemake.params.reference} - --min-mapq {snakemake.params.min_mapq} {count} \ + --fasta {args[reference]} \ + --min-mapq {args[min_mapq]} {count} \ {snakemake.input.bam} {snakemake.input.intervals} """.format( snakemake=snakemake, - count="--count" if snakemake.params.count else "", + args=args, + count="--count" if "count" in args else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index 4d53e9508..82f71ce41 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -1,32 +1,44 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py reference""" +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper __author__ = "Eric Blanc" __email__ = "eric.blanc@bih-charite.de" +args = snakemake.params.get("args", {}) + cmd = r""" cnvkit.py reference \ - -o {snakemake.output.reference} \ - --fasta {snakemake.params.reference} \ + -o {snakemake.output.panel} \ + --fasta {args[reference]} \ {cluster} {min_cluster_size} \ {sample_sex} {male_reference} {diploid_parx_genome} \ {no_gc} {no_edge} {no_rmask} \ {target} {antitarget} {normals} """.format( snakemake=snakemake, - cluster="--cluster" if snakemake.params.cluster else "", - min_cluster_size=f"--min-cluster-size {snakemake.params.min_cluster_size}" if snakemake.params.cluster and "min_cluster_size" in snakemake.params else "", - no_gc="--no-gc" if snakemake.params.no_gc else "", - no_edge="--no-edge" if snakemake.params.no_edge else "", - no_rmask="--no-rmask" if snakemake.params.no_rmask else "", - sample_sex=f"--sample-sex {snakemake.params.sample_sex}" if "sample_sex" in snakemake.params else "", - male_reference="--male-reference" if snakemake.params.male_reference else "", - diploid_parx_genome=f"--diploid_parx_genome {snakemake.params.diploid_parx_genome}" if "diploid_parx_genome" in snakemake.params else "", - target=f"--target {snakemake.input.target}" if "target" in snakemake.input else "", - antitarget=f"--antitarget {snakemake.input.antitarget}" if "antitarget" in snakemake.input else "", - normals=" ".join(snakemake.input.normals) if "normals" in snakemake.input else "", + args=args, + cluster="--cluster" if "cluster" in args else "", + min_cluster_size=f"--min-cluster-size {args['min_cluster_size']}" if "cluster" in args and "min_cluster_size" in args else "", + no_gc="--no-gc" if "no_gc" in args else "", + no_edge="--no-edge" if "no_edge" in args else "", + no_rmask="--no-rmask" if "no_rmask" in args else "", + sample_sex=f"--sample-sex {args['sample_sex']}" if "sample_sex" in args else 
"", + male_reference="--male-reference" if "male_reference" in args else "", + diploid_parx_genome=f"--diploid_parx_genome {args['diploid_parx_genome']}" if "diploid_parx_genome" in args else "", + target=f"--target {snakemake.input.target}" if snakemake.input.get("target", None) else "", + antitarget=f"--antitarget {snakemake.input.antitarget}" if snakemake.input.get("antitarget", None) else "", + normals=" ".join(snakemake.input.normals) if snakemake.input.get("normals", None) else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index 457718684..fb32e1710 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -16,31 +16,35 @@ __author__ = "Eric Blanc" __email__ = "eric.blanc@bih-charite.de" +args = snakemake.params.get("args", {}) + # WGS: targets are all accessible regions, WES: targets are baits -interval = snakemake.input.access if "access" in snakemake.input else snakemake.params.target +interval = snakemake.input.access if snakemake.input.get("access", None) else args["target"] -if "avg_size" in snakemake.input: +if snakemake.input.get("avg_size", "") != "": pattern = re.compile("^Target:[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)$") with open(snakemake.input.avg_size) as f: for line in f: m = pattern.match(line) if m: - avg_size = float(m.groups()[4]) + avg_size = int(float(m.groups()[4])) break else: - avg_size = snakemake.params.avg_size + avg_size = args["avg_size"] cmd = r""" cnvkit.py target \ -o {snakemake.output.target} \ - {avg_size} {split} \ + {avg_size} {split} {annotate} \ {interval} """.format( snakemake=snakemake, + args=args, interval=interval, avg_size=f"--avg-size {avg_size}", - split=f"--split" if snakemake.params.split else "", + split=f"--split" if "split" in args and args["split"] else "", + annotate=f"--annotate {args['annotate']}" if "annotate" in args else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index c596a734b..46fd5a0fe 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -225,6 +225,7 @@ def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow): "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", ], + "access": "work/bwa.cnvkit/out/cnvkit.access.bed", } actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards) assert actual == expected @@ -282,7 +283,7 @@ def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow } ) expected = { - "references": [ + "normals": [ "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", ], @@ -291,75 +292,75 @@ def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow assert actual == expected -def test_cnvkit_step_part_get_params_access(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_access()""" +def test_cnvkit_step_part_get_args_access(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_access()""" wildcards = Wildcards( fromdict={ "mapper": 
"bwa", } ) - expected = {"reference": "/path/to/ref.fa"} - actual = panel_of_normals_workflow.get_params("cnvkit", "access")(wildcards) + expected = {"reference": "/path/to/ref.fa", "min_gap_size": 5000} + actual = panel_of_normals_workflow.get_args("cnvkit", "access")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_params_autobin(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_autobin()""" +def test_cnvkit_step_part_get_args_autobin(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_autobin()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) - expected = {"method": "wgs"} - actual = panel_of_normals_workflow.get_params("cnvkit", "autobin")(wildcards) + expected = {"method": "wgs", "bp_per_bin": 50000} + actual = panel_of_normals_workflow.get_args("cnvkit", "autobin")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_params_target(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_target()""" +def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_target()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) - expected = {"annotate": "/path/to/annotations.gtf"} - actual = panel_of_normals_workflow.get_params("cnvkit", "target")(wildcards) + expected = {"annotate": "/path/to/annotations.gtf", "split": True, "target": ""} + actual = panel_of_normals_workflow.get_args("cnvkit", "target")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_params_antitarget(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_antitarget()""" +def test_cnvkit_step_part_get_args_antitarget(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_antitarget()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) expected = {"avg_size": 0, "min_size": 0} - actual = panel_of_normals_workflow.get_params("cnvkit", "antitarget")(wildcards) + actual = panel_of_normals_workflow.get_args("cnvkit", "antitarget")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_params_coverage(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_coverage()""" +def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_coverage()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) expected = {"reference": "/path/to/ref.fa", "min_mapq": 0} - actual = panel_of_normals_workflow.get_params("cnvkit", "coverage")(wildcards) + actual = panel_of_normals_workflow.get_args("cnvkit", "coverage")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_params_create_panel(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_create_panel()""" +def test_cnvkit_step_part_get_args_create_panel(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_create_panel()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) expected = {"reference": "/path/to/ref.fa", "no_edge": True} - actual = panel_of_normals_workflow.get_params("cnvkit", "create_panel")(wildcards) + actual = panel_of_normals_workflow.get_args("cnvkit", "create_panel")(wildcards) assert actual == expected @@ -504,14 +505,14 @@ def test_access_step_part_get_input_files_run(panel_of_normals_workflow): assert panel_of_normals_workflow.get_input_files("access", "run") is None -def test_access_step_part_get_params_run(panel_of_normals_workflow): - """Tests AccessStepPart._get_params_run()""" +def test_access_step_part_get_args_run(panel_of_normals_workflow): + """Tests 
AccessStepPart._get_args_run()""" expected = { "reference": "/path/to/ref.fa", "exclude": ["/path/to/exclude.bed"], "min_gap_size": 0 } - actual = panel_of_normals_workflow.get_params("access", "run") + actual = panel_of_normals_workflow.get_args("access", "run") assert actual == expected diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py deleted file mode 100644 index e1f4c2b26..000000000 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py +++ /dev/null @@ -1,395 +0,0 @@ -# -*- coding: utf-8 -*- -"""Tests for the panel_of_normals workflow module code""" - -import textwrap - -import pytest -import ruamel.yaml as ruamel_yaml -from snakemake.io import Wildcards - -from snappy_pipeline.workflows.panel_of_normals import PanelOfNormalsWorkflow - -from .common import get_expected_log_files_dict -from .conftest import patch_module_fs - - -@pytest.fixture(scope="module") # otherwise: performance issues -def minimal_config(): - """Return YAML parsing result for (cancer) configuration""" - yaml = ruamel_yaml.YAML() - return yaml.load( - textwrap.dedent( - r""" - static_data_config: - reference: - path: /path/to/ref.fa - cosmic: - path: /path/to/cosmic.vcf.gz - dbsnp: - path: /path/to/dbsnp.vcf.gz - - step_config: - ngs_mapping: - tools: - dna: ['bwa'] - bwa: - path_index: /path/to/bwa/index.fa - - panel_of_normals: - path_ngs_mapping: NGS_MAPPING/ - tools: ['cnvkit'] - cnvkit: - path_target_regions: "" # WGS mode - path_normals_list: "" - - data_sets: - first_batch: - file: sheet.tsv - search_patterns: - - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} - search_paths: ['/path'] - type: matched_cancer - naming_scheme: only_secondary_id - """ - ).lstrip() - ) - - -@pytest.fixture -def panel_of_normals_workflow( - dummy_workflow, - minimal_config, - config_lookup_paths, - work_dir, - config_paths, - cancer_sheet_fake_fs, - aligner_indices_fake_fs, - mocker, -): - """Return PanelOfNormalsWorkflow object pre-configured with germline sheet""" - # Patch out file-system related things in abstract (the crawling link in step is defined there) - patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) - patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) - # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we - # can obtain paths from the function as if we really had a NGSMappingPipelineStep there - dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} - # Construct the workflow object - return PanelOfNormalsWorkflow( - dummy_workflow, - minimal_config, - config_lookup_paths, - config_paths, - work_dir, - ) - - -# Tests for CnvkitStepPart ------------------------------------------------------------------------ - - -def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_target()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "bams": [ - "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", - ], - "bais": [ - "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam.bai", - ], - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) - 
assert actual == expected - - -def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_antitarget()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - actual = panel_of_normals_workflow.get_input_files("cnvkit", "antitarget")(wildcards) - assert actual == {} - - -def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_coverage()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - "normal_library": "P001-N1-DNA1-WGS1", - } - ) - expected = { - "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed", - "antitarget": "work/bwa.cnvkit/out/bwa.cnvkit.antitarget.bed", - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_input_files_create_panel()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "target": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", - ], - "antitarget": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", - ], - "logs": [ - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.log", - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.conda_list.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.conda_info.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.log", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.conda_list.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.conda_info.txt", - ], - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_input_files_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_input_files_report()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "target": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", - ], - "antitarget": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", - ], - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "report")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_target()""" - expected = { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed", - "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "target") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_antitarget()""" - expected = { - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed", - "antitarget_md5": 
"work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "antitarget") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_coverage()""" - expected = { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", - "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", - "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "coverage") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_create_panel()""" - expected = { - "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", - "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", - "log": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz", - "log_md5": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "create_panel") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_report()""" - expected = { - "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", - "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5", - "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv", - "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "report") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_target()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.target" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "target") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_antitarget()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "antitarget") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_coverage()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_create_panel()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "create_panel") - assert actual == expected - - -def 
test_cnvkit_step_part_get_log_file_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_report()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "report") - assert actual == expected - - -def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): - """Tests CvnkitStepPart.get_resource_usage()""" - # Define expected: default defined workflow.abstract - target_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "8G", - "partition": "medium", - } - antitarget_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "8G", - "partition": "medium", - } - coverage_expected_dict = { - "threads": 8, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } - reference_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } - report_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } - - # Evaluate action `target` - for resource, expected in target_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'target'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource)() - assert actual == expected, msg_error - - # Evaluate action `antitarget` - for resource, expected in antitarget_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'antitarget'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource)() - assert actual == expected, msg_error - - # Evaluate action `coverage` - for resource, expected in coverage_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'coverage'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource)() - assert actual == expected, msg_error - - # Evaluate action `create_panel` - for resource, expected in reference_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'create_panel'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource)() - assert actual == expected, msg_error - - # Evaluate action `report` - for resource, expected in report_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'report'." 
- actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource)() - assert actual == expected, msg_error - - -# PanelOfNormalsWorkflow -------------------------------------------------------------------------- - - -def test_panel_of_normals_workflow(panel_of_normals_workflow): - """Test simple functionality of the workflow""" - # Check created sub steps - expected = ["access", "cnvkit", "link_out", "mutect2", "purecn"] - actual = list(sorted(panel_of_normals_workflow.sub_steps.keys())) - assert actual == expected - expected = [] - - # Now for basic cnvkit files (panel of normal only) - tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}" - expected += [ - tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) - ] - tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{substep}.{ext}" - for substep in ("target", "antitarget"): - expected += [ - tpl.format(substep=substep, mapper=mapper, ext=ext) - for ext in ("bed", "bed.md5") - for mapper in ("bwa",) - ] - tpl = "output/{mapper}.cnvkit/report/{mapper}.cnvkit.{substep}.{ext}" - for substep in ("sex", "metrics"): - expected += [ - tpl.format(substep=substep, mapper=mapper, ext=ext) - for ext in ("tsv", "tsv.md5") - for mapper in ("bwa",) - ] - # add log files - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" - for substep in ("target", "antitarget", "panel_of_normals", "report"): - for mapper in ("bwa",): - expected += get_expected_log_files_dict( - base_out=tpl.format(mapper=mapper, substep=substep) - ).values() - # add merged log - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{chksum}" - for mapper in ("bwa",): - for chksum in ("", ".md5"): - expected += [tpl.format(mapper=mapper, chksum=chksum)] - - expected = list(sorted(expected)) - actual = list(sorted(panel_of_normals_workflow.get_result_files())) - assert actual == expected From 6689ceb2808bd4ea5028a5aa217b9f4f24ea76a1 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 6 Nov 2024 14:41:36 +0100 Subject: [PATCH 09/46] fix: allow python 3.12 syntax in models to co-exist with pre-3.12 syntax in wrappers --- snappy_wrappers/__init__.py | 4 +++- snappy_wrappers/wrappers/cnvkit/environment.yaml | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/snappy_wrappers/__init__.py b/snappy_wrappers/__init__.py index 1b25644c8..ded1b29ed 100644 --- a/snappy_wrappers/__init__.py +++ b/snappy_wrappers/__init__.py @@ -1,8 +1,10 @@ # -*- coding: utf-8 -*- -from snappy_pipeline.version import __version__ +# from snappy_pipeline.version import __version__ __author__ = """Manuel Holtgrewe""" __email__ = "manuel.holtgrewe@bih-charite.de" +__version__ = "0.2.1" + __all__ = ["__version__"] diff --git a/snappy_wrappers/wrappers/cnvkit/environment.yaml b/snappy_wrappers/wrappers/cnvkit/environment.yaml index 2def14828..b4cb42e86 100644 --- a/snappy_wrappers/wrappers/cnvkit/environment.yaml +++ b/snappy_wrappers/wrappers/cnvkit/environment.yaml @@ -2,6 +2,6 @@ channels: - conda-forge - bioconda dependencies: - - python=3.12 - - cnvkit==0.9.8 - - htslib=1.21 + - python=3.10 + - cnvkit=0.9.11 + - htslib==1.21 From 9ab8870afbe808e3f8b332030f1dc8790b9c39a1 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 6 Nov 2024 16:26:17 +0100 Subject: [PATCH 10/46] fix: fix autobin logic because normals & reference always present --- .../workflows/panel_of_normals/__init__.py | 34 +++++++------------ .../wrappers/cnvkit/target/wrapper.py | 7 ++-- .../test_workflows_panel_of_normals.py | 2 +- 3 files 
changed, 18 insertions(+), 25 deletions(-) diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index 07656d4a2..240643323 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -633,27 +633,19 @@ def _get_input_files_autobin(self, wildcards): ), "Trying to estimate average target size for non-WGS samples" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" - bams = [ - ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) - for x in self.normal_libraries - ] - input_files = {"bams": bams} - if self.config.cnvkit.get("access", "") == "": - input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) - return input_files + return { + "bams": [ + ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) + for x in self.normal_libraries + ], + "access": "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards), + } def _get_args_autobin(self, wildcards): assert ( self.libraryType == LibraryType.WGS ), "Trying to estimate average target size for non-WGS samples" - params = {"bp_per_bin": 50000} - if self.name in self.config.tools and self.config.cnvkit: - if self.config.cnvkit.get("access", "") == "": - params["method"] = "wgs" - else: - params["method"] = "amplicon" - params["target"] = self.config.cnvkit.get("access") - return params + return {"method": "wgs", "bp_per_bin": 50000} def _get_output_files_autobin(self): return {"result": "work/{mapper}.cnvkit/out/cnvkit.autobin.txt"} @@ -663,10 +655,10 @@ def _get_input_files_target(self, wildcards): input_files = {} if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) - if self.config.cnvkit.get("target_avg_size", None) is None: - input_files["avg_size"] = "work/{mapper}.cnvkit/out/cnvkit.autobin.txt".format( - **wildcards - ) + if self.config.cnvkit.get("target_avg_size", None) is None: + input_files["avg_size"] = "work/{mapper}.cnvkit/out/cnvkit.autobin.txt".format( + **wildcards + ) return input_files def _get_args_target(self, wildcards): @@ -674,7 +666,7 @@ def _get_args_target(self, wildcards): if self.name in self.config.tools: if self.libraryType == LibraryType.WES: params["target"] = self.config.cnvkit.path_target_regions - if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": + if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") != "": params["target"] = self.config.cnvkit.get("access") if self.w_config.static_data_config.get("features", None): params["annotate"] = self.w_config.static_data_config.features.path diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index fb32e1710..fe08248ff 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -29,9 +29,10 @@ if m: avg_size = int(float(m.groups()[4])) break - -else: +elif "avg_size" in args: avg_size = args["avg_size"] +else: + avg_size = None cmd = r""" cnvkit.py target \ @@ -42,7 +43,7 @@ snakemake=snakemake, args=args, interval=interval, - avg_size=f"--avg-size {avg_size}", + avg_size=f"--avg-size {avg_size}" if avg_size is not None else "", split=f"--split" if "split" in args and 
args["split"] else "", annotate=f"--annotate {args['annotate']}" if "annotate" in args else "", ) diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index 46fd5a0fe..86870f4c8 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -323,7 +323,7 @@ def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow): "mapper": "bwa", } ) - expected = {"annotate": "/path/to/annotations.gtf", "split": True, "target": ""} + expected = {"annotate": "/path/to/annotations.gtf", "split": True} actual = panel_of_normals_workflow.get_args("cnvkit", "target")(wildcards) assert actual == expected From a2733e34803b4a626dd643dbab28ebe43f20bce4 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 13 Nov 2024 17:10:53 +0100 Subject: [PATCH 11/46] refactor: configuration for tools appearing in multiple steps addded to the generic models --- snappy_pipeline/models/cnvkit.py | 310 ++++++++----- snappy_pipeline/models/library_kit.py | 29 ++ snappy_pipeline/models/mutect2.py | 612 ++++++++++++++++++++++++++ 3 files changed, 848 insertions(+), 103 deletions(-) create mode 100644 snappy_pipeline/models/library_kit.py create mode 100644 snappy_pipeline/models/mutect2.py diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py index e80ae634f..f1782e588 100644 --- a/snappy_pipeline/models/cnvkit.py +++ b/snappy_pipeline/models/cnvkit.py @@ -1,11 +1,37 @@ import enum -from typing import Annotated +from typing import Self -from pydantic import Field +from pydantic import model_validator from snappy_pipeline.models import SnappyModel +class SexOrigin(enum.StrEnum): + AUTOMATIC = "auto" + """Sex determined from the data""" + SAMPLESHEET = "samplesheet" + """Donor sex obtained from sample sheet""" + CONFIG = "config" + """Donor sex obtained from the configuration (all donors have the same sex)""" + + +class SexValue(enum.StrEnum): + MALE = "male" + FEMALE = "female" + + +class Sex(SnappyModel): + source: SexOrigin = SexOrigin.AUTOMATIC + + sample_sex: SexValue | None = None + + @model_validator(mode="after") + def ensure_valid_sex_value(self): + if self.source == SexOrigin.CONFIG and self.sample_sex is None: + raise ValueError("No definition of donors' sex from the configuration") + return self + + class SegmentationMethod(enum.StrEnum): cbs = "cbs" flasso = "flasso" @@ -16,14 +42,14 @@ class SegmentationMethod(enum.StrEnum): none = "none" -class CenterMode(enum.StrEnum): +class CenterMethod(enum.StrEnum): mean = "mean" median = "median" mode = "mode" biweight = "biweight" -class FilterMode(enum.StrEnum): +class FilterMethod(enum.StrEnum): ampdel = "ampdel" cn = "cn" ci = "ci" @@ -36,129 +62,207 @@ class CallingMethod(enum.StrEnum): none = "" -class Gender(enum.StrEnum): - male = "male" - female = "female" - guess = "" +class Access(SnappyModel): + exclude: list[str] = [] + """Regions accessible to mapping""" + min_gap_size: int = 5000 + """Minimum gap size between accessible sequence regions. 
Regions separated by less than this distance will be joined together."""
+
+
+class Target(SnappyModel):
+    path_baits: str | None = None
+    """Path to baits file (Agilent Covered), unset for WGS data"""
+    split: bool = True
+    """Split large tiled intervals into smaller, consecutive targets."""
+    avg_size: float = (800/3)
+    """Average size of split target bins (results are approximate)"""
+    short_names: bool = False
+    """Reduce multi-accession bait labels to be short and consistent"""
+
+
+class Antitarget(SnappyModel):
+    avg_size: float = 150000
+    """Average size of split antitarget bins (results are approximate)"""
+    min_size: float | None = None
+    """Minimum size of antitarget bins (smaller regions are dropped). When missing, 1/16 avg size"""
+
+
+class Coverage(SnappyModel):
+    count: bool = False
+    """Get read depths by counting read midpoints within each bin."""
+    min_mapq: int = 0
+    """Minimum mapping quality score (phred scale 0-60) to count a read for coverage depth."""
+
+
+class Fix(SnappyModel):
+    smoothing_window_fraction: float | None = None
+    """Smoothing window fraction for rolling median bias smoothing. Defaults to 1/sqrt(len(data))"""
+
+
+class Segment(SnappyModel):
+    method: SegmentationMethod = SegmentationMethod.cbs
+    """Segmentation method, or 'none' for chromosome arm-level averages as segments"""
+    threshold: float = 0.0001
+    """Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation. For HMM methods, this is the smoothing window size."""
+    drop_outliers: int = 10
+    """Drop outlier bins more than this many multiples of the 95th quantile away from the average within a rolling window. Set to 0 for no outlier filtering."""
+    smooth_cbs: bool = False
+
+    @model_validator(mode="after")
+    def ensure_smooth_for_cbs_only(self) -> Self:
+        if self.smooth_cbs and self.method != SegmentationMethod.cbs:
+            raise ValueError("'smooth_cbs' option can be used only with 'cbs' segmentation method")
+        return self
+
+
+class Call(SnappyModel):
+    method: CallingMethod | None = None
+    """Calling method."""
+    thresholds: list[float] = [-1.1, -0.25, 0.2, 0.7]
+    """Hard thresholds for calling each integer copy number, separated by commas"""
+    center: CenterMethod | None = None
+    """Re-center the log2 ratio values using this estimator of the center or average value. ('median' if no argument given.)"""
+    center_at: float | None = None
+    """Subtract a constant number from all log2 ratios. For "manual" re-centering."""
+    filter: FilterMethod | None = None
+    """Merge segments flagged by the specified filter(s) with the adjacent segment(s)."""
+
+    @model_validator(mode="after")
+    def avoid_center_center_at_conflict(self) -> Self:
+        if self.center is not None and self.center_at is not None:
+            raise ValueError("'call' options 'center' and 'center_at' cannot be used together")
+        return self
+
+
+class Bintest(SnappyModel):
+    alpha: float = 0.005
+    """Significance threshold."""
+    target: bool = False
+    """Test target bins only; ignore off-target bins."""
+
+
+class Plot(SnappyModel):
+    enabled: bool = False
+
+
+class PlotDiagram(Plot):
+    chromosome: str | None = None
+    """Chromosome to display (full genome when missing)"""
+    threshold: float = 0.5
+    """Copy number change threshold to label genes."""
+    min_probes: int = 3
+    """Minimum number of covered probes to label a gene."""
+    no_shift_xy: bool = False
+
+
+class PlotScatter(Plot):
+    path_range_list: str | None = None
+    """File listing the chromosomal ranges to display, as BED, interval list or 'chr:start-end' text"""
+    gene: str | None = None
+    """Name of gene or genes (comma-separated) to display."""
+    width: int = 1000000
+    """Width of margin to show around the selected gene(s)"""
+    antitarget_marker: str = "o"
+    """Plot antitargets using this symbol when plotting in a selected chromosomal region."""
+    by_bin: bool = False
+    """Plot data x-coordinates by bin indices instead of genomic coordinates."""
+    segment_color: str = "darkorange"
+    """Plot segment lines in this color. Value can be any string accepted by matplotlib."""
+    trend: bool = False
+    """Draw a smoothed local trendline on the scatter plot."""
+    y_max: float | None = None
+    """y-axis upper limit."""
+    y_min: float | None = None
+    """y-axis lower limit."""
+    fig_size: tuple[float, float] = (6.4, 4.8)
+    """Width and height of the plot in inches."""
+
+    @model_validator(mode="after")
+    def ensure_range_list_with_gene(self) -> Self:
+        if self.gene is not None and not self.path_range_list:
+            raise ValueError("'gene' option requires a valid range list")
+        return self
+
+
+class Report(SnappyModel):
+    enabled: bool = True
+
+
+class ReportSegmetrics(Report):
+    alpha: float = 0.05
+    """Level to estimate confidence and prediction intervals; use with --ci and --pi."""
+    bootstrap: int = 100
+    """Number of bootstrap iterations to estimate confidence interval; use with --ci."""
+    smooth_bootstrap: bool = False
+    """Apply Gaussian noise to bootstrap samples, a.k.a. smoothed bootstrap, to estimate confidence interval"""
+
+
+class ReportGenemetrics(Report):
+    alpha: float = 0.05
+    """Level to estimate confidence and prediction intervals; use with --ci and --pi."""
+    bootstrap: int = 100
+    """Number of bootstrap iterations to estimate confidence interval; use with --ci."""
+    threshold: float = 0.2
+    """Copy number change threshold to report a gene gain/loss"""
+    min_probes: int = 3
+    """Minimum number of covered probes to report a gain/loss"""
+
+
+class ReportType(enum.StrEnum):
+    GENEMETRICS = "genemetrics"
+    SEGMETRICS = "segmetrics"
+
+
+class CnvkitToReference(SnappyModel):
+    # Substep-specific parameters
+    access: Access
+    target: Target
+    antitarget: Antitarget
+
+    coverage: Coverage
+
+    metrics: Report
+    segmetrics: ReportSegmetrics
+    genemetrics: ReportGenemetrics
+
+    # Generic parameters (used in different substeps & must agree)
+    sex: Sex = Sex()
+    """Sex of the normal donors (from the data, the sample sheet or the configuration)"""
+    male_reference: bool = False
+    """Create/use male reference (for shifting chrX & chrY)"""
+    diploid_parx_genome: str | None = None
+    """Considers the given human genome's PAR of chromosome X as autosomal. Example: 'grch38'"""
+
+    cluster: bool = False
+    """Calculate and store summary stats for clustered subsets of the normal samples with similar coverage profiles."""
+    min_cluster_size: int = 4
+    """Minimum cluster size to keep in reference profiles."""
+
+    gc: bool = False
+    """Skip GC correction."""
+    edge: bool | None = None
+    """Skip edge correction. Automatic selection when None (True for WGS & Panel, False for WES)"""
+    rmask: bool = False
+    """Skip RepeatMasker correction."""
+
+    drop_low_coverage: bool = False
+    """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples."""
+
+    @model_validator(mode="after")
+    def ensure_males_for_reference(self):
+        if self.male_reference and self.sex.source == SexOrigin.CONFIG and self.sex.sample_sex == SexValue.FEMALE:
+            raise ValueError("Male reference requested for female cohort")
+        return self
+
+
+class Cnvkit(CnvkitToReference):
+    fix: Fix
+    segment: Segment
+    call: Call
+    bintest: Bintest
+
+    diagram: PlotDiagram
+    scatter: PlotScatter
+
+    min_variant_depth: int = 20
+    """Minimum read depth for a SNV to be displayed in the b-allele frequency plot."""
+    zygocity_freq: float = 0.25
+    """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies."""
diff --git a/snappy_pipeline/models/library_kit.py b/snappy_pipeline/models/library_kit.py
new file mode 100644
index 000000000..bd861aa72
--- /dev/null
+++ b/snappy_pipeline/models/library_kit.py
@@ -0,0 +1,29 @@
+from typing import Annotated
+from pydantic import Field
+
+from snappy_pipeline.models import SnappyModel
+
+
+class LibraryKitEntry(SnappyModel):
+    """
+    Mapping from enrichment kit to target region BED file, for either computing per-target
+    region coverage or selecting targeted exons.
+
+    The following will match both the stock IDT library kit and the ones
+    with spike-ins seen from Yale genomics. The path above would be
+    mapped to the name "default".
+ - name: IDT_xGen_V1_0 + pattern: "xGen Exome Research Panel V1\\.0*" + path: "path/to/targets.bed" + """ + + name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])] + + pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])] + + path: Annotated[str, Field(examples=["path/to/targets.bed"])] + + +class LibraryKit(SnappyModel): + path_target_interval_list_mapping: list[LibraryKitEntry] = [] + """Connects sample-based library kit in sample sheets with corresponding bed files""" diff --git a/snappy_pipeline/models/mutect2.py b/snappy_pipeline/models/mutect2.py new file mode 100644 index 000000000..d0ce7ec4b --- /dev/null +++ b/snappy_pipeline/models/mutect2.py @@ -0,0 +1,612 @@ +from enum import StrEnum + +from snappy_pipeline.models import SnappyModel + + +class Annotation(StrEnum): + AS_BASEQUALITYRANKSUMTEST = 'AS_BaseQualityRankSumTest' + AS_FISHERSTRAND = 'AS_FisherStrand' + AS_INBREEDINGCOEFF = 'AS_InbreedingCoeff' + AS_MAPPINGQUALITYRANKSUMTEST = 'AS_MappingQualityRankSumTest' + AS_QUALBYDEPTH = 'AS_QualByDepth' + AS_RMSMAPPINGQUALITY = 'AS_RMSMappingQuality' + AS_READPOSRANKSUMTEST = 'AS_ReadPosRankSumTest' + AS_STRANDBIASMUTECTANNOTATION = 'AS_StrandBiasMutectAnnotation' + AS_STRANDODDSRATIO = 'AS_StrandOddsRatio' + ALLELEFRACTION = 'AlleleFraction' + ALLELEPSEUDODEPTH = 'AllelePseudoDepth' + ASSEMBLYCOMPLEXITY = 'AssemblyComplexity' + BASEQUALITY = 'BaseQuality' + BASEQUALITYHISTOGRAM = 'BaseQualityHistogram' + BASEQUALITYRANKSUMTEST = 'BaseQualityRankSumTest' + CHROMOSOMECOUNTS = 'ChromosomeCounts' + CLIPPINGRANKSUMTEST = 'ClippingRankSumTest' + COUNTNS = 'CountNs' + COVERAGE = 'Coverage' + CYCLESKIPSTATUS = 'CycleSkipStatus' + DEPTHPERALLELEBYSAMPLE = 'DepthPerAlleleBySample' + DEPTHPERSAMPLEHC = 'DepthPerSampleHC' + EXCESSHET = 'ExcessHet' + FEATURIZEDREADSETS = 'FeaturizedReadSets' + FISHERSTRAND = 'FisherStrand' + FRAGMENTDEPTHPERALLELEBYSAMPLE = 'FragmentDepthPerAlleleBySample' + FRAGMENTLENGTH = 'FragmentLength' + GCCONTENT = 'GcContent' + GENOTYPESUMMARIES = 'GenotypeSummaries' + HAPLOTYPEFILTERINGANNOTATION = 'HaplotypeFilteringAnnotation' + HMERINDELLENGTH = 'HmerIndelLength' + HMERINDELNUC = 'HmerIndelNuc' + HMERMOTIFS = 'HmerMotifs' + INBREEDINGCOEFF = 'InbreedingCoeff' + INDELCLASSIFY = 'IndelClassify' + INDELLENGTH = 'IndelLength' + LIKELIHOODRANKSUMTEST = 'LikelihoodRankSumTest' + MAPPINGQUALITY = 'MappingQuality' + MAPPINGQUALITYRANKSUMTEST = 'MappingQualityRankSumTest' + MAPPINGQUALITYZERO = 'MappingQualityZero' + ORIENTATIONBIASREADCOUNTS = 'OrientationBiasReadCounts' + ORIGINALALIGNMENT = 'OriginalAlignment' + POSSIBLEDENOVO = 'PossibleDeNovo' + QUALBYDEPTH = 'QualByDepth' + RMSMAPPINGQUALITY = 'RMSMappingQuality' + RAWGTCOUNT = 'RawGtCount' + READPOSRANKSUMTEST = 'ReadPosRankSumTest' + READPOSITION = 'ReadPosition' + REFERENCEBASES = 'ReferenceBases' + SAMPLELIST = 'SampleList' + STRANDBIASBYSAMPLE = 'StrandBiasBySample' + STRANDODDSRATIO = 'StrandOddsRatio' + TANDEMREPEAT = 'TandemRepeat' + UNIQUEALTREADCOUNT = 'UniqueAltReadCount' + VARIANTTYPE = 'VariantType' + + + +class AnnotationGroup(StrEnum): + AS_STANDARDANNOTATION = 'AS_StandardAnnotation' + ALLELESPECIFICANNOTATION = 'AlleleSpecificAnnotation' + GENOTYPEANNOTATION = 'GenotypeAnnotation' + INFOFIELDANNOTATION = 'InfoFieldAnnotation' + JUMBOGENOTYPEANNOTATION = 'JumboGenotypeAnnotation' + JUMBOINFOANNOTATION = 'JumboInfoAnnotation' + REDUCIBLEANNOTATION = 'ReducibleAnnotation' + STANDARDANNOTATION = 'StandardAnnotation' + STANDARDFLOWBASEDANNOTATION = 'StandardFlowBasedAnnotation' 
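+    # Being StrEnum members, the entries of these classes stringify to the
+    # exact GATK identifiers, so configured values can be passed straight to
+    # the command line, e.g. (sketch only, not part of the generated command):
+    #     " ".join(f"--annotation-group {g}" for g in groups)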
+ STANDARDHCANNOTATION = 'StandardHCAnnotation' + STANDARDMUTECTANNOTATION = 'StandardMutectAnnotation' + VARIANTANNOTATION = 'VariantAnnotation' + + + +class AnnotationExclude(StrEnum): + AS_STRANDBIASMUTECTANNOTATION = 'AS_StrandBiasMutectAnnotation' + BASEQUALITY = 'BaseQuality' + COVERAGE = 'Coverage' + DEPTHPERALLELEBYSAMPLE = 'DepthPerAlleleBySample' + DEPTHPERSAMPLEHC = 'DepthPerSampleHC' + FRAGMENTDEPTHPERALLELEBYSAMPLE = 'FragmentDepthPerAlleleBySample' + FRAGMENTLENGTH = 'FragmentLength' + MAPPINGQUALITY = 'MappingQuality' + ORIENTATIONBIASREADCOUNTS = 'OrientationBiasReadCounts' + READPOSITION = 'ReadPosition' + STRANDBIASBYSAMPLE = 'StrandBiasBySample' + TANDEMREPEAT = 'TandemRepeat' + + + +class DisableReadFilter(StrEnum): + GOODCIGARREADFILTER = 'GoodCigarReadFilter' + MAPPEDREADFILTER = 'MappedReadFilter' + MAPPINGQUALITYAVAILABLEREADFILTER = 'MappingQualityAvailableReadFilter' + MAPPINGQUALITYNOTZEROREADFILTER = 'MappingQualityNotZeroReadFilter' + MAPPINGQUALITYREADFILTER = 'MappingQualityReadFilter' + NONCHIMERICORIGINALALIGNMENTREADFILTER = 'NonChimericOriginalAlignmentReadFilter' + NONZEROREFERENCELENGTHALIGNMENTREADFILTER = 'NonZeroReferenceLengthAlignmentReadFilter' + NOTDUPLICATEREADFILTER = 'NotDuplicateReadFilter' + NOTSECONDARYALIGNMENTREADFILTER = 'NotSecondaryAlignmentReadFilter' + PASSESVENDORQUALITYCHECKREADFILTER = 'PassesVendorQualityCheckReadFilter' + READLENGTHREADFILTER = 'ReadLengthReadFilter' + WELLFORMEDREADFILTER = 'WellformedReadFilter' + + + +class IntervalMergingRule(StrEnum): + ALL = 'ALL' + OVERLAPPING_ONLY = 'OVERLAPPING_ONLY' + + + +class IntervalSetRule(StrEnum): + INTERSECTION = 'INTERSECTION' + UNION = 'UNION' + + + +class ReadFilter(StrEnum): + ALIGNMENTAGREESWITHHEADERREADFILTER = 'AlignmentAgreesWithHeaderReadFilter' + ALLOWALLREADSREADFILTER = 'AllowAllReadsReadFilter' + AMBIGUOUSBASEREADFILTER = 'AmbiguousBaseReadFilter' + CIGARCONTAINSNONOPERATOR = 'CigarContainsNoNOperator' + EXCESSIVEENDCLIPPEDREADFILTER = 'ExcessiveEndClippedReadFilter' + FIRSTOFPAIRREADFILTER = 'FirstOfPairReadFilter' + FLOWBASEDTPATTRIBUTESYMETRICREADFILTER = 'FlowBasedTPAttributeSymetricReadFilter' + FLOWBASEDTPATTRIBUTEVALIDREADFILTER = 'FlowBasedTPAttributeValidReadFilter' + FRAGMENTLENGTHREADFILTER = 'FragmentLengthReadFilter' + GOODCIGARREADFILTER = 'GoodCigarReadFilter' + HASREADGROUPREADFILTER = 'HasReadGroupReadFilter' + HMERQUALITYSYMETRICREADFILTER = 'HmerQualitySymetricReadFilter' + INTERVALOVERLAPREADFILTER = 'IntervalOverlapReadFilter' + JEXLEXPRESSIONREADTAGVALUEFILTER = 'JexlExpressionReadTagValueFilter' + LIBRARYREADFILTER = 'LibraryReadFilter' + MAPPEDREADFILTER = 'MappedReadFilter' + MAPPINGQUALITYAVAILABLEREADFILTER = 'MappingQualityAvailableReadFilter' + MAPPINGQUALITYNOTZEROREADFILTER = 'MappingQualityNotZeroReadFilter' + MAPPINGQUALITYREADFILTER = 'MappingQualityReadFilter' + MATCHINGBASESANDQUALSREADFILTER = 'MatchingBasesAndQualsReadFilter' + MATEDIFFERENTSTRANDREADFILTER = 'MateDifferentStrandReadFilter' + MATEDISTANTREADFILTER = 'MateDistantReadFilter' + MATEONSAMECONTIGORNOMAPPEDMATEREADFILTER = 'MateOnSameContigOrNoMappedMateReadFilter' + MATEUNMAPPEDANDUNMAPPEDREADFILTER = 'MateUnmappedAndUnmappedReadFilter' + METRICSREADFILTER = 'MetricsReadFilter' + NONCHIMERICORIGINALALIGNMENTREADFILTER = 'NonChimericOriginalAlignmentReadFilter' + NONZEROFRAGMENTLENGTHREADFILTER = 'NonZeroFragmentLengthReadFilter' + NONZEROREFERENCELENGTHALIGNMENTREADFILTER = 'NonZeroReferenceLengthAlignmentReadFilter' + NOTDUPLICATEREADFILTER = 
'NotDuplicateReadFilter' + NOTOPTICALDUPLICATEREADFILTER = 'NotOpticalDuplicateReadFilter' + NOTPROPERLYPAIREDREADFILTER = 'NotProperlyPairedReadFilter' + NOTSECONDARYALIGNMENTREADFILTER = 'NotSecondaryAlignmentReadFilter' + NOTSUPPLEMENTARYALIGNMENTREADFILTER = 'NotSupplementaryAlignmentReadFilter' + OVERCLIPPEDREADFILTER = 'OverclippedReadFilter' + PAIREDREADFILTER = 'PairedReadFilter' + PASSESVENDORQUALITYCHECKREADFILTER = 'PassesVendorQualityCheckReadFilter' + PLATFORMREADFILTER = 'PlatformReadFilter' + PLATFORMUNITREADFILTER = 'PlatformUnitReadFilter' + PRIMARYLINEREADFILTER = 'PrimaryLineReadFilter' + PROPERLYPAIREDREADFILTER = 'ProperlyPairedReadFilter' + READGROUPBLACKLISTREADFILTER = 'ReadGroupBlackListReadFilter' + READGROUPHASFLOWORDERREADFILTER = 'ReadGroupHasFlowOrderReadFilter' + READGROUPREADFILTER = 'ReadGroupReadFilter' + READLENGTHEQUALSCIGARLENGTHREADFILTER = 'ReadLengthEqualsCigarLengthReadFilter' + READLENGTHREADFILTER = 'ReadLengthReadFilter' + READNAMEREADFILTER = 'ReadNameReadFilter' + READSTRANDFILTER = 'ReadStrandFilter' + READTAGVALUEFILTER = 'ReadTagValueFilter' + SAMPLEREADFILTER = 'SampleReadFilter' + SECONDOFPAIRREADFILTER = 'SecondOfPairReadFilter' + SEQISSTOREDREADFILTER = 'SeqIsStoredReadFilter' + SOFTCLIPPEDREADFILTER = 'SoftClippedReadFilter' + VALIDALIGNMENTENDREADFILTER = 'ValidAlignmentEndReadFilter' + VALIDALIGNMENTSTARTREADFILTER = 'ValidAlignmentStartReadFilter' + WELLFORMEDFLOWBASEDREADFILTER = 'WellformedFlowBasedReadFilter' + WELLFORMEDREADFILTER = 'WellformedReadFilter' + + + +class ValidationStringency(StrEnum): + LENIENT = 'LENIENT' + SILENT = 'SILENT' + STRICT = 'STRICT' + + + +class LogLevel(StrEnum): + DEBUG = 'DEBUG' + ERROR = 'ERROR' + INFO = 'INFO' + WARNING = 'WARNING' + + + +class WriterType(StrEnum): + ALL_POSSIBLE_HAPLOTYPES = 'ALL_POSSIBLE_HAPLOTYPES' + CALLED_HAPLOTYPES = 'CALLED_HAPLOTYPES' + CALLED_HAPLOTYPES_NO_READS = 'CALLED_HAPLOTYPES_NO_READS' + NO_HAPLOTYPES = 'NO_HAPLOTYPES' + + + +class ReferenceConfidenceMode(StrEnum): + BP_RESOLUTION = 'BP_RESOLUTION' + GVCF = 'GVCF' + NONE = 'NONE' + + + +class FlowMode(StrEnum): + ADVANCED = 'ADVANCED' + NONE = 'NONE' + STANDARD = 'STANDARD' + + + +class Implementation(StrEnum): + FLOWBASED = 'FlowBased' + FLOWBASEDHMM = 'FlowBasedHMM' + PAIRHMM = 'PairHMM' + + + +class PairHMMImplementation(StrEnum): + AVX_LOGLESS_CACHING = 'AVX_LOGLESS_CACHING' + AVX_LOGLESS_CACHING_OMP = 'AVX_LOGLESS_CACHING_OMP' + EXACT = 'EXACT' + FASTEST_AVAILABLE = 'FASTEST_AVAILABLE' + LOGLESS_CACHING = 'LOGLESS_CACHING' + ORIGINAL = 'ORIGINAL' + + + +class PCRErrorModel(StrEnum): + AGGRESSIVE = 'AGGRESSIVE' + CONSERVATIVE = 'CONSERVATIVE' + HOSTILE = 'HOSTILE' + NONE = 'NONE' + + + +class SmithWatermanImplementation(StrEnum): + AVX_ENABLED = 'AVX_ENABLED' + FASTEST_AVAILABLE = 'FASTEST_AVAILABLE' + JAVA = 'JAVA' + + + +class Mutect2(SnappyModel): + # Required arguments + # input: bams tumor + normal + # output: raw vcf + # reference: fasta + + # Arguments actually used + + genotype_germline_sites: bool = False + """Call all apparent germline site even though they will ultimately be filtered""" + # germline_resource: FeatureInput | None = None # No options for class FeatureInput + germline_resource: str | None = None # No options for class FeatureInput + """Population vcf of germline sequencing containing allele fractions""" + + # Arguments that must be set by derived classes (pon & calling) + + # Panel of normals arguments + + + # Calling-specific arguments + + # panel_of_normals: str | None = None # Was 
class FeatureInput
+    # """VCF file of sites observed in normal"""
+    # genotype_pon_sites: bool = False
+    # """Call sites in the PoN even though they will ultimately be filtered"""
+    # mitochondria_mode: bool = False
+    # """Mitochondria mode sets emission and initial LODs to 0"""
+    # add_output_sam_program_record: bool = True
+    # """If true, adds a PG tag to created SAM/BAM/CRAM files"""
+    # assembly_region_out: str | None = None
+    # """Output the assembly region to this IGV formatted file"""
+    # bam_writer_type: WriterType = WriterType.CALLED_HAPLOTYPES
+    # """Which haplotypes should be written to the BAM"""
+    # enable_all_annotations: bool = False
+    # """Use all possible annotations (not for the faint of heart)"""
+    # alleles: str | None = None  # Was class FeatureInput
+    # """The set of alleles to force-call regardless of evidence"""
+    # bam_output: bool = False  # Was class str
+    # """Write assembled haplotypes"""
+    # pair_hmm_results_file: bool = False  # Was class GATKPath
+    # """Write exact pairHMM inputs/outputs to a file for debugging purposes"""
+
+
+    # Optional arguments
+
+    add_output_vcf_command_line: bool = True
+    """If true, adds a command line header line to created VCF files"""
+    af_of_alleles_not_in_resource: float = -1.0
+    """Population allele fraction assigned to alleles not found in germline resource. Please see docs/mutect/mutect2.pdf for a derivation of the default value"""
+    annotation: list[Annotation] = []
+    """One or more specific annotations to add to variant calls"""
+    annotation_group: list[AnnotationGroup] = []
+    """One or more groups of annotations to apply to variant calls"""
+    annotations_to_exclude: list[AnnotationExclude] = []
+    """One or more specific annotations to exclude from variant calls"""
+    arguments_file: str | None = None  # was class File
+    """read one or more arguments files and add them to the command line"""
+    assembly_region_padding: int = 100
+    """Number of additional bases of context to include around each assembly region"""
+    base_quality_score_threshold: int = 18
+    """Base qualities below this threshold will be reduced to the minimum (6)"""
+    callable_depth: int = 10
+    """Minimum depth to be considered callable for Mutect stats. Does not affect genotyping"""
+    disable_bam_index_caching: bool = False
+    """If true, don't cache bam indexes, this will reduce memory requirements but may harm performance if many intervals are specified. Caching is automatically disabled if there are no intervals specified"""
+    disable_read_filter: list[DisableReadFilter] = []
+    """Read filters to be disabled before analysis"""
+    dont_use_dragstr_pair_hmm_scores: bool = False
+    """disable DRAGstr pair-hmm score even when dragstr-params-path was provided"""
+    downsampling_stride: int = 1
+    """Downsample a pool of reads starting within a range of one or more bases"""
+    dragstr_het_hom_ratio: int = 2
+    """het to hom prior ratio to use with DRAGstr on"""
+    enable_dynamic_read_disqualification_for_genotyping: bool = False
+    """Will enable less strict read disqualification for low base quality reads"""
+    exclude_intervals: list[str] = []
+    """One or more genomic intervals to exclude from processing"""
+    f1r2_max_depth: int = 200
+    """sites with depth higher than this value will be grouped"""
+    f1r2_median_mq: int = 50
+    """skip sites with median mapping quality below this value"""
+    f1r2_min_bq: int = 20
+    """exclude bases below this quality from pileup"""
+    flow_order_for_annotations: list[str] = []
+    """flow order used for these annotations. 
[readGroup:]flowOrder""" + founder_id: list[str] = [] + """Samples representing the population 'founders'""" + gatk_config_file: str | None = None + """A configuration file to use with the GATK""" + ignore_itr_artifacts: bool = False + """Turn off read transformer that clips artifacts associated with end repair insertions near inverted tandem repeats""" + initial_tumor_lod: float = 2.0 + """Log 10 odds threshold to consider pileup active""" + interval_exclusion_padding: int = 0 + """Amount of padding (in bp) to add to each interval you are excluding""" + interval_merging_rule: IntervalMergingRule = IntervalMergingRule.ALL + """Interval merging rule for abutting intervals""" + interval_padding: int = 0 + """Amount of padding (in bp) to add to each interval you are including""" + interval_set_rule: IntervalSetRule = IntervalSetRule.UNION + """Set merging approach to use for combining interval inputs""" + intervals: list[str] = [] + """One or more genomic intervals over which to operate""" + lenient: bool = False + """Lenient processing of VCF files""" + max_assembly_region_size: int = 300 + """Maximum size of an assembly region""" + max_population_af: float = 0.01 + """Maximum population allele frequency in tumor-only mode""" + max_reads_per_alignment_start: int = 50 + """Maximum number of reads to retain per alignment start position. Reads above this threshold will be downsampled. Set to 0 to disable""" + max_variants_per_shard: int = 0 + """If non-zero, partitions VCF output into shards, each containing up to the given number of records""" + min_assembly_region_size: int = 50 + """Minimum size of an assembly region""" + min_base_quality_score: int = 10 + """Minimum base quality required to consider a base for calling""" + native_pair_hmm_use_double_precision: bool = False + """use double precision in the native pairHmm. This is slower but matches the java implementation better""" + normal_lod: float = 2.2 + """Log 10 odds threshold for calling normal variant non-germline""" + pcr_indel_qual: int = 40 + """Phred-scaled PCR indel qual for overlapping fragments""" + pcr_snv_qual: int = 40 + """Phred-scaled PCR SNV qual for overlapping fragments""" + read_filter: list[ReadFilter] = [] + """Read filters to be applied before analysis""" + read_validation_stringency: ValidationStringency = ValidationStringency.SILENT + """Validation stringency for all SAM/BAM/CRAM/SRA files read by this program. 
The default stringency value SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded""" + sites_only_vcf_output: bool = False + """If true, don't emit genotype fields when writing vcf file output""" + tumor_lod_to_emit: float = 3.0 + """Log 10 odds threshold to emit variant to VCF""" + use_jdk_deflater: bool = False + """Whether to use the JdkDeflater (as opposed to IntelDeflater)""" + use_jdk_inflater: bool = False + """Whether to use the JdkInflater (as opposed to IntelInflater)""" + verbosity: LogLevel = LogLevel.INFO + """Control verbosity of logging""" + QUIET: bool = False + """Whether to suppress job-summary info on System.err""" + + # Advanced arguments + + active_probability_threshold: float = 0.002 + """Minimum probability for a locus to be considered active""" + adaptive_pruning_initial_error_rate: float = 0.001 + """Initial base error rate estimate for adaptive pruning""" + allele_informative_reads_overlap_margin: int = 2 + """Likelihood and read-based annotations will only take into consideration reads that overlap the variant or any base no further than this distance expressed in base pairs""" + allow_non_unique_kmers_in_ref: bool = False + """Allow graphs that have non-unique kmers in the reference""" + debug_assembly: bool = False + """Print out verbose debug information about each assembly region""" + disable_adaptive_pruning: bool = False + """Disable the adaptive algorithm for pruning paths in the graph""" + disable_cap_base_qualities_to_map_quality: bool = False + """If false this disables capping of base qualities in the HMM to the mapping quality of the read""" + disable_symmetric_hmm_normalizing: bool = False + """Toggle to revive legacy behavior of asymmetrically normalizing the arguments to the reference haplotype""" + disable_tool_default_annotations: bool = False + """Disable all tool default annotations""" + disable_tool_default_read_filters: bool = False + """Disable all tool default read filters (WARNING: many tools will not function correctly without their default read filters on)""" + dont_increase_kmer_sizes_for_cycles: bool = False + """Disable iterating over kmer sizes when graph cycles are detected""" + dragstr_params_path: str | None = None # Was class GATKPath + """location of the DRAGstr model parameters for STR error correction used in the Pair HMM. 
When provided, it overrides other PCR error correcting mechanisms"""
+    emit_ref_confidence: ReferenceConfidenceMode = ReferenceConfidenceMode.NONE
+    """Mode for emitting reference confidence scores (For Mutect2, this is a BETA feature)"""
+    expected_mismatch_rate_for_read_disqualification: float = 0.02
+    """Error rate used to set expectation for post HMM read disqualification based on mismatches"""
+    flow_assembly_collapse_partial_mode: bool = False
+    """Collapse long flow-based hmers only up to difference in reference"""
+    flow_disallow_probs_larger_than_call: bool = False
+    """Cap probabilities of error to 1 relative to base call"""
+    flow_fill_empty_bins_value: float = 0.001
+    """Value to fill the zeros of the matrix with"""
+    flow_filter_alleles: bool = False
+    """pre-filter alleles before genotyping"""
+    flow_filter_alleles_qual_threshold: float = 30.0
+    """Threshold for prefiltering alleles on quality"""
+    flow_filter_alleles_sor_threshold: float = 3.0
+    """Threshold for prefiltering alleles on SOR"""
+    flow_filter_lone_alleles: bool = False
+    """Remove also lone alleles during allele filtering"""
+    flow_lump_probs: bool = False
+    """Should all probabilities of insertion or deletion in the flow be combined together"""
+    flow_matrix_mods: str | None = None
+    """Modification instructions to the read flow matrix. Format is src,dst{,src,dst}+. Example: 10,12,11,12 - these instructions will copy element 10 into 11 and 12"""
+    flow_mode: FlowMode = FlowMode.NONE
+    """Single argument for enabling the bulk of Flow Based features. NOTE: THIS WILL OVERWRITE PROVIDED ARGUMENTS (CHECK TOOL INFO TO SEE WHICH ARGUMENTS ARE SET)"""
+    flow_probability_scaling_factor: int = 10
+    """probability scaling factor for (phred=10) for probability quantization"""
+    flow_probability_threshold: float = 0.003
+    """Lowest probability ratio to be used as an option"""
+    flow_quantization_bins: int = 121
+    """Number of bins for probability quantization"""
+    flow_remove_non_single_base_pair_indels: bool = False
+    """Should the probabilities of more than 1 indel be used"""
+    flow_remove_one_zero_probs: bool = False
+    """Remove probabilities of basecall of zero from non-zero genome"""
+    flow_report_insertion_or_deletion: bool = False
+    """Report either insertion or deletion probability, not both"""
+    flow_retain_max_n_probs_base_format: bool = False
+    """Keep only hmer/2 probabilities (like in base format)"""
+    flow_symmetric_indel_probs: bool = False
+    """Should indel probabilities be symmetric in flow"""
+    flow_use_t0_tag: bool = False
+    """Use t0 tag if it exists in the read to create flow matrix"""
+    force_active: bool = False
+    """If provided, all regions will be marked as active"""
+    force_call_filtered_alleles: bool = False
+    """Force-call filtered alleles included in the resource specified by --alleles"""
+    graph_output: bool = False  # Was class str
+    """Write debug assembly graph information to this file"""
+    gvcf_lod_band: list[float] = [-2.5, -2.0, -1.5, -1.0, -0.5, 0.0, 0.5, 1.0]
+    """Exclusive upper bounds for reference confidence LOD bands (must be specified in increasing order)"""
+    independent_mates: bool = False
+    """Allow paired reads to independently support different haplotypes. 
Useful for validations with ill-designed synthetic data""" + keep_boundary_flows: bool = False + """prevent spreading of boundary flows""" + kmer_size: list[int] = [10, 25] + """Kmer size to use in the read threading assembler""" + likelihood_calculation_engine: Implementation = Implementation.PAIRHMM + """What likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes""" + linked_de_bruijn_graph: bool = False + """If enabled, the Assembly Engine will construct a Linked De Bruijn graph to recover better haplotypes""" + max_mnp_distance: int = 1 + """Two or more phased substitutions separated by this distance or less are merged into MNPs""" + max_num_haplotypes_in_population: int = 128 + """Maximum number of haplotypes to consider for your population""" + max_prob_propagation_distance: int = 50 + """Upper limit on how many bases away probability mass can be moved around when calculating the boundaries between active and inactive assembly regions""" + max_suspicious_reads_per_alignment_start: int = 0 + """Maximum number of suspicious reads (mediocre mapping quality or too many substitutions) allowed in a downsampling stride. Set to 0 to disable""" + max_unpruned_variants: int = 100 + """Maximum number of variants in graph the adaptive pruner will allow""" + min_dangling_branch_length: int = 4 + """Minimum length of a dangling branch to attempt recovery""" + min_pruning: int = 2 + """Minimum support to not prune paths in the graph""" + minimum_allele_fraction: float = 0.0 + """Lower bound of variant allele fractions to consider when calculating variant LOD""" + num_pruning_samples: int = 1 + """Number of samples that must pass the minPruning threshold""" + pair_hmm_gap_continuation_penalty: int = 10 + """Flat gap continuation penalty for use in the Pair HMM""" + pair_hmm_implementation: PairHMMImplementation = PairHMMImplementation.FASTEST_AVAILABLE + """The PairHMM implementation to use for genotype likelihood calculations""" + pcr_indel_model: PCRErrorModel = PCRErrorModel.CONSERVATIVE + """The PCR indel model to use""" + pedigree: str | None = None # Was class GATKPath + """Pedigree file for determining the population 'founders'""" + phred_scaled_global_read_mismapping_rate: int = 45 + """The global assumed mismapping rate for reads""" + pileup_detection: bool = False + """If enabled, the variant caller will create pileup-based haplotypes in addition to the assembly-based haplotype generation""" + pruning_lod_threshold: float = 2.302585092994046 + """Ln likelihood ratio threshold for adaptive pruning algorithm""" + pruning_seeding_lod_threshold: float = 9.210340371976184 + """Ln likelihood ratio threshold for seeding subgraph of good variation in adaptive pruning algorithm""" + recover_all_dangling_branches: bool = False + """Recover all dangling branches""" + reference_model_deletion_quality: int = 30 + """The quality of deletion in the reference model""" + smith_waterman: SmithWatermanImplementation = SmithWatermanImplementation.JAVA + """Which Smith-Waterman implementation to use, generally FASTEST_AVAILABLE is the right choice""" + smith_waterman_dangling_end_gap_extend_penalty: int = -6 + """Smith-Waterman gap-extend penalty for dangling-end recovery""" + smith_waterman_dangling_end_gap_open_penalty: int = -110 + """Smith-Waterman gap-open penalty for dangling-end recovery""" + smith_waterman_dangling_end_match_value: int = 25 + """Smith-Waterman match value for dangling-end recovery""" + smith_waterman_dangling_end_mismatch_penalty: int = -50 + 
"""Smith-Waterman mismatch penalty for dangling-end recovery""" + smith_waterman_haplotype_to_reference_gap_extend_penalty: int = -11 + """Smith-Waterman gap-extend penalty for haplotype-to-reference alignment""" + smith_waterman_haplotype_to_reference_gap_open_penalty: int = -260 + """Smith-Waterman gap-open penalty for haplotype-to-reference alignment""" + smith_waterman_haplotype_to_reference_match_value: int = 200 + """Smith-Waterman match value for haplotype-to-reference alignment""" + smith_waterman_haplotype_to_reference_mismatch_penalty: int = -150 + """Smith-Waterman mismatch penalty for haplotype-to-reference alignment""" + smith_waterman_read_to_haplotype_gap_extend_penalty: int = -5 + """Smith-Waterman gap-extend penalty for read-to-haplotype alignment""" + smith_waterman_read_to_haplotype_gap_open_penalty: int = -30 + """Smith-Waterman gap-open penalty for read-to-haplotype alignment""" + smith_waterman_read_to_haplotype_match_value: int = 10 + """Smith-Waterman match value for read-to-haplotype alignment""" + smith_waterman_read_to_haplotype_mismatch_penalty: int = -15 + """Smith-Waterman mismatch penalty for read-to-haplotype alignment""" + soft_clip_low_quality_ends: bool = False + """If enabled will preserve low-quality read ends as softclips (used for DRAGEN-GATK BQD genotyper model)""" + + maximum_mapping_quality: int | None = None + """Maximum mapping quality to keep (inclusive)""" + minimum_mapping_quality: int = 20 + """Minimum mapping quality to keep (inclusive)""" + max_read_length: int = 2147483647 + """Keep only reads with length at most equal to the specified value""" + min_read_length: int = 30 + """Keep only reads with length at least equal to the specified value""" + + # Arguments omitted + + # f1r2_tar_gz: File | None = None # No options for class File + # """If specified, collect F1R2 counts and output files into this tar.gz file""" + # gcs_max_retries: int = 20 + # """If the GCS bucket channel errors out, how many times it will attempt to re-initiate the connection""" + # gcs_project_for_requester_pays: str = "." + # """Project to bill when accessing "requester pays" buckets. If unset, these buckets cannot be accessed. User must have storage.buckets.get permission on the bucket being accessed""" + # mutect3_alt_downsample: int = 20 + # """Downsample alt reads to this count for Mutect3 training datasets""" + # mutect3_dataset: File | None = None # No options for class File + # """Destination for Mutect3 data collection""" + # mutect3_non_artifact_ratio: int = 20 + # """Number of non-artifact data per artifact datum in Mutect3 training""" + # mutect3_ref_downsample: int = 10 + # """Downsample ref reads to this count when generating a Mutect3 dataset""" + # mutect3_training_mode: bool = False + # """Collect Mutect3 data for learning""" + # mutect3_training_truth: FeatureInput | None = None # No options for class FeatureInput + # """VCF file of known variants for labeling Mutect3 training data""" + # native_pair_hmm_threads: int = 4 + # """How many threads should a native pairHMM implementation use""" + # read_index: list[GATKPath] = [] # No options for class GATKPath + # """Indices to use for the read inputs. If specified, an index must be provided for every read input and in the same order as the read inputs. 
If this argument is not specified, the path to the index for each input will be inferred automatically"""
+    # sequence_dictionary: GATKPath | None = None  # No options for class GATKPath
+    # """Use the given sequence dictionary as the master/canonical sequence dictionary. Must be a .dict file"""
+    # tmp_dir: GATKPath | None = None  # No options for class GATKPath
+    # """Temp directory to use"""
+    # cloud_index_prefetch_buffer: int = -1
+    # """Size of the cloud-only prefetch buffer (in MB; 0 to disable). Defaults to cloudPrefetchBuffer if unset"""
+    # cloud_prefetch_buffer: int = 40
+    # """Size of the cloud-only prefetch buffer (in MB; 0 to disable)"""
+    # create_output_bam_index: bool = True
+    # """If true, create a BAM/CRAM index when writing a coordinate-sorted BAM/CRAM file"""
+    # create_output_bam_md5: bool = False
+    # """If true, create an MD5 digest for any BAM/SAM/CRAM file created"""
+    # create_output_variant_index: bool = True
+    # """If true, create a VCF index when writing a coordinate-sorted VCF file"""
+    # create_output_variant_md5: bool = False
+    # """If true, create an MD5 digest for any VCF file created"""
+    # disable_sequence_dictionary_validation: bool = False
+    # """If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!"""
+    # help: bool = False
+    # """display the help message"""
+    # seconds_between_progress_updates: float = 10.0
+    # """Output traversal statistics every time this many seconds elapse"""
+    # version: bool = False
+    # """display the version number for this tool"""
+    # showHidden: bool = False
+    # """display hidden arguments"""
+    # normal_sample: list[str] = []
+    # """BAM sample name of normal(s), if any. May be URL-encoded as output by GetSampleName with -encode argument"""
+    # tumor_sample: str | None = None
+    # """BAM sample name of tumor. 
May be URL-encoded as output by GetSampleName with -encode argument""" + From 65a9f5e61c96c74257d12e7bfcc1e32fdb5a3add Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 13 Nov 2024 17:36:00 +0100 Subject: [PATCH 12/46] style: making ruff happy --- snappy_pipeline/models/cnvkit.py | 8 +- snappy_pipeline/models/mutect2.py | 395 ++++++++++++++---------------- 2 files changed, 192 insertions(+), 211 deletions(-) diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py index f1782e588..402e567a6 100644 --- a/snappy_pipeline/models/cnvkit.py +++ b/snappy_pipeline/models/cnvkit.py @@ -74,7 +74,7 @@ class Target(SnappyModel): """Path to baits file (Agilent Covered), unset for WGS data""" split: bool = True """Split large tiled intervals into smaller, consecutive targets.""" - avg_size: float = (800/3) + avg_size: float = 800 / 3 """Average size of split target bins (results are approximate)""" short_names: bool = False """Reduce multi-accession bait labels to be short and consistent""" @@ -248,7 +248,11 @@ class CnvkitToReference(SnappyModel): @model_validator(mode="after") def ensure_males_for_reference(self): - if self.male_reference and self.sex.source == SexOrigin.CONFIG and self.sex.sample_sex == SexValue.FEMALE: + if ( + self.male_reference + and self.sex.source == SexOrigin.CONFIG + and self.sex.sample_sex == SexValue.FEMALE + ): raise ValueError("Male reference requested for female cohort") return self diff --git a/snappy_pipeline/models/mutect2.py b/snappy_pipeline/models/mutect2.py index d0ce7ec4b..5df87dcad 100644 --- a/snappy_pipeline/models/mutect2.py +++ b/snappy_pipeline/models/mutect2.py @@ -4,251 +4,235 @@ class Annotation(StrEnum): - AS_BASEQUALITYRANKSUMTEST = 'AS_BaseQualityRankSumTest' - AS_FISHERSTRAND = 'AS_FisherStrand' - AS_INBREEDINGCOEFF = 'AS_InbreedingCoeff' - AS_MAPPINGQUALITYRANKSUMTEST = 'AS_MappingQualityRankSumTest' - AS_QUALBYDEPTH = 'AS_QualByDepth' - AS_RMSMAPPINGQUALITY = 'AS_RMSMappingQuality' - AS_READPOSRANKSUMTEST = 'AS_ReadPosRankSumTest' - AS_STRANDBIASMUTECTANNOTATION = 'AS_StrandBiasMutectAnnotation' - AS_STRANDODDSRATIO = 'AS_StrandOddsRatio' - ALLELEFRACTION = 'AlleleFraction' - ALLELEPSEUDODEPTH = 'AllelePseudoDepth' - ASSEMBLYCOMPLEXITY = 'AssemblyComplexity' - BASEQUALITY = 'BaseQuality' - BASEQUALITYHISTOGRAM = 'BaseQualityHistogram' - BASEQUALITYRANKSUMTEST = 'BaseQualityRankSumTest' - CHROMOSOMECOUNTS = 'ChromosomeCounts' - CLIPPINGRANKSUMTEST = 'ClippingRankSumTest' - COUNTNS = 'CountNs' - COVERAGE = 'Coverage' - CYCLESKIPSTATUS = 'CycleSkipStatus' - DEPTHPERALLELEBYSAMPLE = 'DepthPerAlleleBySample' - DEPTHPERSAMPLEHC = 'DepthPerSampleHC' - EXCESSHET = 'ExcessHet' - FEATURIZEDREADSETS = 'FeaturizedReadSets' - FISHERSTRAND = 'FisherStrand' - FRAGMENTDEPTHPERALLELEBYSAMPLE = 'FragmentDepthPerAlleleBySample' - FRAGMENTLENGTH = 'FragmentLength' - GCCONTENT = 'GcContent' - GENOTYPESUMMARIES = 'GenotypeSummaries' - HAPLOTYPEFILTERINGANNOTATION = 'HaplotypeFilteringAnnotation' - HMERINDELLENGTH = 'HmerIndelLength' - HMERINDELNUC = 'HmerIndelNuc' - HMERMOTIFS = 'HmerMotifs' - INBREEDINGCOEFF = 'InbreedingCoeff' - INDELCLASSIFY = 'IndelClassify' - INDELLENGTH = 'IndelLength' - LIKELIHOODRANKSUMTEST = 'LikelihoodRankSumTest' - MAPPINGQUALITY = 'MappingQuality' - MAPPINGQUALITYRANKSUMTEST = 'MappingQualityRankSumTest' - MAPPINGQUALITYZERO = 'MappingQualityZero' - ORIENTATIONBIASREADCOUNTS = 'OrientationBiasReadCounts' - ORIGINALALIGNMENT = 'OriginalAlignment' - POSSIBLEDENOVO = 'PossibleDeNovo' - QUALBYDEPTH = 'QualByDepth' - 
RMSMAPPINGQUALITY = 'RMSMappingQuality' - RAWGTCOUNT = 'RawGtCount' - READPOSRANKSUMTEST = 'ReadPosRankSumTest' - READPOSITION = 'ReadPosition' - REFERENCEBASES = 'ReferenceBases' - SAMPLELIST = 'SampleList' - STRANDBIASBYSAMPLE = 'StrandBiasBySample' - STRANDODDSRATIO = 'StrandOddsRatio' - TANDEMREPEAT = 'TandemRepeat' - UNIQUEALTREADCOUNT = 'UniqueAltReadCount' - VARIANTTYPE = 'VariantType' - + AS_BASEQUALITYRANKSUMTEST = "AS_BaseQualityRankSumTest" + AS_FISHERSTRAND = "AS_FisherStrand" + AS_INBREEDINGCOEFF = "AS_InbreedingCoeff" + AS_MAPPINGQUALITYRANKSUMTEST = "AS_MappingQualityRankSumTest" + AS_QUALBYDEPTH = "AS_QualByDepth" + AS_RMSMAPPINGQUALITY = "AS_RMSMappingQuality" + AS_READPOSRANKSUMTEST = "AS_ReadPosRankSumTest" + AS_STRANDBIASMUTECTANNOTATION = "AS_StrandBiasMutectAnnotation" + AS_STRANDODDSRATIO = "AS_StrandOddsRatio" + ALLELEFRACTION = "AlleleFraction" + ALLELEPSEUDODEPTH = "AllelePseudoDepth" + ASSEMBLYCOMPLEXITY = "AssemblyComplexity" + BASEQUALITY = "BaseQuality" + BASEQUALITYHISTOGRAM = "BaseQualityHistogram" + BASEQUALITYRANKSUMTEST = "BaseQualityRankSumTest" + CHROMOSOMECOUNTS = "ChromosomeCounts" + CLIPPINGRANKSUMTEST = "ClippingRankSumTest" + COUNTNS = "CountNs" + COVERAGE = "Coverage" + CYCLESKIPSTATUS = "CycleSkipStatus" + DEPTHPERALLELEBYSAMPLE = "DepthPerAlleleBySample" + DEPTHPERSAMPLEHC = "DepthPerSampleHC" + EXCESSHET = "ExcessHet" + FEATURIZEDREADSETS = "FeaturizedReadSets" + FISHERSTRAND = "FisherStrand" + FRAGMENTDEPTHPERALLELEBYSAMPLE = "FragmentDepthPerAlleleBySample" + FRAGMENTLENGTH = "FragmentLength" + GCCONTENT = "GcContent" + GENOTYPESUMMARIES = "GenotypeSummaries" + HAPLOTYPEFILTERINGANNOTATION = "HaplotypeFilteringAnnotation" + HMERINDELLENGTH = "HmerIndelLength" + HMERINDELNUC = "HmerIndelNuc" + HMERMOTIFS = "HmerMotifs" + INBREEDINGCOEFF = "InbreedingCoeff" + INDELCLASSIFY = "IndelClassify" + INDELLENGTH = "IndelLength" + LIKELIHOODRANKSUMTEST = "LikelihoodRankSumTest" + MAPPINGQUALITY = "MappingQuality" + MAPPINGQUALITYRANKSUMTEST = "MappingQualityRankSumTest" + MAPPINGQUALITYZERO = "MappingQualityZero" + ORIENTATIONBIASREADCOUNTS = "OrientationBiasReadCounts" + ORIGINALALIGNMENT = "OriginalAlignment" + POSSIBLEDENOVO = "PossibleDeNovo" + QUALBYDEPTH = "QualByDepth" + RMSMAPPINGQUALITY = "RMSMappingQuality" + RAWGTCOUNT = "RawGtCount" + READPOSRANKSUMTEST = "ReadPosRankSumTest" + READPOSITION = "ReadPosition" + REFERENCEBASES = "ReferenceBases" + SAMPLELIST = "SampleList" + STRANDBIASBYSAMPLE = "StrandBiasBySample" + STRANDODDSRATIO = "StrandOddsRatio" + TANDEMREPEAT = "TandemRepeat" + UNIQUEALTREADCOUNT = "UniqueAltReadCount" + VARIANTTYPE = "VariantType" class AnnotationGroup(StrEnum): - AS_STANDARDANNOTATION = 'AS_StandardAnnotation' - ALLELESPECIFICANNOTATION = 'AlleleSpecificAnnotation' - GENOTYPEANNOTATION = 'GenotypeAnnotation' - INFOFIELDANNOTATION = 'InfoFieldAnnotation' - JUMBOGENOTYPEANNOTATION = 'JumboGenotypeAnnotation' - JUMBOINFOANNOTATION = 'JumboInfoAnnotation' - REDUCIBLEANNOTATION = 'ReducibleAnnotation' - STANDARDANNOTATION = 'StandardAnnotation' - STANDARDFLOWBASEDANNOTATION = 'StandardFlowBasedAnnotation' - STANDARDHCANNOTATION = 'StandardHCAnnotation' - STANDARDMUTECTANNOTATION = 'StandardMutectAnnotation' - VARIANTANNOTATION = 'VariantAnnotation' - + AS_STANDARDANNOTATION = "AS_StandardAnnotation" + ALLELESPECIFICANNOTATION = "AlleleSpecificAnnotation" + GENOTYPEANNOTATION = "GenotypeAnnotation" + INFOFIELDANNOTATION = "InfoFieldAnnotation" + JUMBOGENOTYPEANNOTATION = "JumboGenotypeAnnotation" + JUMBOINFOANNOTATION = 
"JumboInfoAnnotation" + REDUCIBLEANNOTATION = "ReducibleAnnotation" + STANDARDANNOTATION = "StandardAnnotation" + STANDARDFLOWBASEDANNOTATION = "StandardFlowBasedAnnotation" + STANDARDHCANNOTATION = "StandardHCAnnotation" + STANDARDMUTECTANNOTATION = "StandardMutectAnnotation" + VARIANTANNOTATION = "VariantAnnotation" class AnnotationExclude(StrEnum): - AS_STRANDBIASMUTECTANNOTATION = 'AS_StrandBiasMutectAnnotation' - BASEQUALITY = 'BaseQuality' - COVERAGE = 'Coverage' - DEPTHPERALLELEBYSAMPLE = 'DepthPerAlleleBySample' - DEPTHPERSAMPLEHC = 'DepthPerSampleHC' - FRAGMENTDEPTHPERALLELEBYSAMPLE = 'FragmentDepthPerAlleleBySample' - FRAGMENTLENGTH = 'FragmentLength' - MAPPINGQUALITY = 'MappingQuality' - ORIENTATIONBIASREADCOUNTS = 'OrientationBiasReadCounts' - READPOSITION = 'ReadPosition' - STRANDBIASBYSAMPLE = 'StrandBiasBySample' - TANDEMREPEAT = 'TandemRepeat' - + AS_STRANDBIASMUTECTANNOTATION = "AS_StrandBiasMutectAnnotation" + BASEQUALITY = "BaseQuality" + COVERAGE = "Coverage" + DEPTHPERALLELEBYSAMPLE = "DepthPerAlleleBySample" + DEPTHPERSAMPLEHC = "DepthPerSampleHC" + FRAGMENTDEPTHPERALLELEBYSAMPLE = "FragmentDepthPerAlleleBySample" + FRAGMENTLENGTH = "FragmentLength" + MAPPINGQUALITY = "MappingQuality" + ORIENTATIONBIASREADCOUNTS = "OrientationBiasReadCounts" + READPOSITION = "ReadPosition" + STRANDBIASBYSAMPLE = "StrandBiasBySample" + TANDEMREPEAT = "TandemRepeat" class DisableReadFilter(StrEnum): - GOODCIGARREADFILTER = 'GoodCigarReadFilter' - MAPPEDREADFILTER = 'MappedReadFilter' - MAPPINGQUALITYAVAILABLEREADFILTER = 'MappingQualityAvailableReadFilter' - MAPPINGQUALITYNOTZEROREADFILTER = 'MappingQualityNotZeroReadFilter' - MAPPINGQUALITYREADFILTER = 'MappingQualityReadFilter' - NONCHIMERICORIGINALALIGNMENTREADFILTER = 'NonChimericOriginalAlignmentReadFilter' - NONZEROREFERENCELENGTHALIGNMENTREADFILTER = 'NonZeroReferenceLengthAlignmentReadFilter' - NOTDUPLICATEREADFILTER = 'NotDuplicateReadFilter' - NOTSECONDARYALIGNMENTREADFILTER = 'NotSecondaryAlignmentReadFilter' - PASSESVENDORQUALITYCHECKREADFILTER = 'PassesVendorQualityCheckReadFilter' - READLENGTHREADFILTER = 'ReadLengthReadFilter' - WELLFORMEDREADFILTER = 'WellformedReadFilter' - + GOODCIGARREADFILTER = "GoodCigarReadFilter" + MAPPEDREADFILTER = "MappedReadFilter" + MAPPINGQUALITYAVAILABLEREADFILTER = "MappingQualityAvailableReadFilter" + MAPPINGQUALITYNOTZEROREADFILTER = "MappingQualityNotZeroReadFilter" + MAPPINGQUALITYREADFILTER = "MappingQualityReadFilter" + NONCHIMERICORIGINALALIGNMENTREADFILTER = "NonChimericOriginalAlignmentReadFilter" + NONZEROREFERENCELENGTHALIGNMENTREADFILTER = "NonZeroReferenceLengthAlignmentReadFilter" + NOTDUPLICATEREADFILTER = "NotDuplicateReadFilter" + NOTSECONDARYALIGNMENTREADFILTER = "NotSecondaryAlignmentReadFilter" + PASSESVENDORQUALITYCHECKREADFILTER = "PassesVendorQualityCheckReadFilter" + READLENGTHREADFILTER = "ReadLengthReadFilter" + WELLFORMEDREADFILTER = "WellformedReadFilter" class IntervalMergingRule(StrEnum): - ALL = 'ALL' - OVERLAPPING_ONLY = 'OVERLAPPING_ONLY' - + ALL = "ALL" + OVERLAPPING_ONLY = "OVERLAPPING_ONLY" class IntervalSetRule(StrEnum): - INTERSECTION = 'INTERSECTION' - UNION = 'UNION' - + INTERSECTION = "INTERSECTION" + UNION = "UNION" class ReadFilter(StrEnum): - ALIGNMENTAGREESWITHHEADERREADFILTER = 'AlignmentAgreesWithHeaderReadFilter' - ALLOWALLREADSREADFILTER = 'AllowAllReadsReadFilter' - AMBIGUOUSBASEREADFILTER = 'AmbiguousBaseReadFilter' - CIGARCONTAINSNONOPERATOR = 'CigarContainsNoNOperator' - EXCESSIVEENDCLIPPEDREADFILTER = 'ExcessiveEndClippedReadFilter' - 
FIRSTOFPAIRREADFILTER = 'FirstOfPairReadFilter' - FLOWBASEDTPATTRIBUTESYMETRICREADFILTER = 'FlowBasedTPAttributeSymetricReadFilter' - FLOWBASEDTPATTRIBUTEVALIDREADFILTER = 'FlowBasedTPAttributeValidReadFilter' - FRAGMENTLENGTHREADFILTER = 'FragmentLengthReadFilter' - GOODCIGARREADFILTER = 'GoodCigarReadFilter' - HASREADGROUPREADFILTER = 'HasReadGroupReadFilter' - HMERQUALITYSYMETRICREADFILTER = 'HmerQualitySymetricReadFilter' - INTERVALOVERLAPREADFILTER = 'IntervalOverlapReadFilter' - JEXLEXPRESSIONREADTAGVALUEFILTER = 'JexlExpressionReadTagValueFilter' - LIBRARYREADFILTER = 'LibraryReadFilter' - MAPPEDREADFILTER = 'MappedReadFilter' - MAPPINGQUALITYAVAILABLEREADFILTER = 'MappingQualityAvailableReadFilter' - MAPPINGQUALITYNOTZEROREADFILTER = 'MappingQualityNotZeroReadFilter' - MAPPINGQUALITYREADFILTER = 'MappingQualityReadFilter' - MATCHINGBASESANDQUALSREADFILTER = 'MatchingBasesAndQualsReadFilter' - MATEDIFFERENTSTRANDREADFILTER = 'MateDifferentStrandReadFilter' - MATEDISTANTREADFILTER = 'MateDistantReadFilter' - MATEONSAMECONTIGORNOMAPPEDMATEREADFILTER = 'MateOnSameContigOrNoMappedMateReadFilter' - MATEUNMAPPEDANDUNMAPPEDREADFILTER = 'MateUnmappedAndUnmappedReadFilter' - METRICSREADFILTER = 'MetricsReadFilter' - NONCHIMERICORIGINALALIGNMENTREADFILTER = 'NonChimericOriginalAlignmentReadFilter' - NONZEROFRAGMENTLENGTHREADFILTER = 'NonZeroFragmentLengthReadFilter' - NONZEROREFERENCELENGTHALIGNMENTREADFILTER = 'NonZeroReferenceLengthAlignmentReadFilter' - NOTDUPLICATEREADFILTER = 'NotDuplicateReadFilter' - NOTOPTICALDUPLICATEREADFILTER = 'NotOpticalDuplicateReadFilter' - NOTPROPERLYPAIREDREADFILTER = 'NotProperlyPairedReadFilter' - NOTSECONDARYALIGNMENTREADFILTER = 'NotSecondaryAlignmentReadFilter' - NOTSUPPLEMENTARYALIGNMENTREADFILTER = 'NotSupplementaryAlignmentReadFilter' - OVERCLIPPEDREADFILTER = 'OverclippedReadFilter' - PAIREDREADFILTER = 'PairedReadFilter' - PASSESVENDORQUALITYCHECKREADFILTER = 'PassesVendorQualityCheckReadFilter' - PLATFORMREADFILTER = 'PlatformReadFilter' - PLATFORMUNITREADFILTER = 'PlatformUnitReadFilter' - PRIMARYLINEREADFILTER = 'PrimaryLineReadFilter' - PROPERLYPAIREDREADFILTER = 'ProperlyPairedReadFilter' - READGROUPBLACKLISTREADFILTER = 'ReadGroupBlackListReadFilter' - READGROUPHASFLOWORDERREADFILTER = 'ReadGroupHasFlowOrderReadFilter' - READGROUPREADFILTER = 'ReadGroupReadFilter' - READLENGTHEQUALSCIGARLENGTHREADFILTER = 'ReadLengthEqualsCigarLengthReadFilter' - READLENGTHREADFILTER = 'ReadLengthReadFilter' - READNAMEREADFILTER = 'ReadNameReadFilter' - READSTRANDFILTER = 'ReadStrandFilter' - READTAGVALUEFILTER = 'ReadTagValueFilter' - SAMPLEREADFILTER = 'SampleReadFilter' - SECONDOFPAIRREADFILTER = 'SecondOfPairReadFilter' - SEQISSTOREDREADFILTER = 'SeqIsStoredReadFilter' - SOFTCLIPPEDREADFILTER = 'SoftClippedReadFilter' - VALIDALIGNMENTENDREADFILTER = 'ValidAlignmentEndReadFilter' - VALIDALIGNMENTSTARTREADFILTER = 'ValidAlignmentStartReadFilter' - WELLFORMEDFLOWBASEDREADFILTER = 'WellformedFlowBasedReadFilter' - WELLFORMEDREADFILTER = 'WellformedReadFilter' - + ALIGNMENTAGREESWITHHEADERREADFILTER = "AlignmentAgreesWithHeaderReadFilter" + ALLOWALLREADSREADFILTER = "AllowAllReadsReadFilter" + AMBIGUOUSBASEREADFILTER = "AmbiguousBaseReadFilter" + CIGARCONTAINSNONOPERATOR = "CigarContainsNoNOperator" + EXCESSIVEENDCLIPPEDREADFILTER = "ExcessiveEndClippedReadFilter" + FIRSTOFPAIRREADFILTER = "FirstOfPairReadFilter" + FLOWBASEDTPATTRIBUTESYMETRICREADFILTER = "FlowBasedTPAttributeSymetricReadFilter" + FLOWBASEDTPATTRIBUTEVALIDREADFILTER = 
"FlowBasedTPAttributeValidReadFilter" + FRAGMENTLENGTHREADFILTER = "FragmentLengthReadFilter" + GOODCIGARREADFILTER = "GoodCigarReadFilter" + HASREADGROUPREADFILTER = "HasReadGroupReadFilter" + HMERQUALITYSYMETRICREADFILTER = "HmerQualitySymetricReadFilter" + INTERVALOVERLAPREADFILTER = "IntervalOverlapReadFilter" + JEXLEXPRESSIONREADTAGVALUEFILTER = "JexlExpressionReadTagValueFilter" + LIBRARYREADFILTER = "LibraryReadFilter" + MAPPEDREADFILTER = "MappedReadFilter" + MAPPINGQUALITYAVAILABLEREADFILTER = "MappingQualityAvailableReadFilter" + MAPPINGQUALITYNOTZEROREADFILTER = "MappingQualityNotZeroReadFilter" + MAPPINGQUALITYREADFILTER = "MappingQualityReadFilter" + MATCHINGBASESANDQUALSREADFILTER = "MatchingBasesAndQualsReadFilter" + MATEDIFFERENTSTRANDREADFILTER = "MateDifferentStrandReadFilter" + MATEDISTANTREADFILTER = "MateDistantReadFilter" + MATEONSAMECONTIGORNOMAPPEDMATEREADFILTER = "MateOnSameContigOrNoMappedMateReadFilter" + MATEUNMAPPEDANDUNMAPPEDREADFILTER = "MateUnmappedAndUnmappedReadFilter" + METRICSREADFILTER = "MetricsReadFilter" + NONCHIMERICORIGINALALIGNMENTREADFILTER = "NonChimericOriginalAlignmentReadFilter" + NONZEROFRAGMENTLENGTHREADFILTER = "NonZeroFragmentLengthReadFilter" + NONZEROREFERENCELENGTHALIGNMENTREADFILTER = "NonZeroReferenceLengthAlignmentReadFilter" + NOTDUPLICATEREADFILTER = "NotDuplicateReadFilter" + NOTOPTICALDUPLICATEREADFILTER = "NotOpticalDuplicateReadFilter" + NOTPROPERLYPAIREDREADFILTER = "NotProperlyPairedReadFilter" + NOTSECONDARYALIGNMENTREADFILTER = "NotSecondaryAlignmentReadFilter" + NOTSUPPLEMENTARYALIGNMENTREADFILTER = "NotSupplementaryAlignmentReadFilter" + OVERCLIPPEDREADFILTER = "OverclippedReadFilter" + PAIREDREADFILTER = "PairedReadFilter" + PASSESVENDORQUALITYCHECKREADFILTER = "PassesVendorQualityCheckReadFilter" + PLATFORMREADFILTER = "PlatformReadFilter" + PLATFORMUNITREADFILTER = "PlatformUnitReadFilter" + PRIMARYLINEREADFILTER = "PrimaryLineReadFilter" + PROPERLYPAIREDREADFILTER = "ProperlyPairedReadFilter" + READGROUPBLACKLISTREADFILTER = "ReadGroupBlackListReadFilter" + READGROUPHASFLOWORDERREADFILTER = "ReadGroupHasFlowOrderReadFilter" + READGROUPREADFILTER = "ReadGroupReadFilter" + READLENGTHEQUALSCIGARLENGTHREADFILTER = "ReadLengthEqualsCigarLengthReadFilter" + READLENGTHREADFILTER = "ReadLengthReadFilter" + READNAMEREADFILTER = "ReadNameReadFilter" + READSTRANDFILTER = "ReadStrandFilter" + READTAGVALUEFILTER = "ReadTagValueFilter" + SAMPLEREADFILTER = "SampleReadFilter" + SECONDOFPAIRREADFILTER = "SecondOfPairReadFilter" + SEQISSTOREDREADFILTER = "SeqIsStoredReadFilter" + SOFTCLIPPEDREADFILTER = "SoftClippedReadFilter" + VALIDALIGNMENTENDREADFILTER = "ValidAlignmentEndReadFilter" + VALIDALIGNMENTSTARTREADFILTER = "ValidAlignmentStartReadFilter" + WELLFORMEDFLOWBASEDREADFILTER = "WellformedFlowBasedReadFilter" + WELLFORMEDREADFILTER = "WellformedReadFilter" class ValidationStringency(StrEnum): - LENIENT = 'LENIENT' - SILENT = 'SILENT' - STRICT = 'STRICT' - + LENIENT = "LENIENT" + SILENT = "SILENT" + STRICT = "STRICT" class LogLevel(StrEnum): - DEBUG = 'DEBUG' - ERROR = 'ERROR' - INFO = 'INFO' - WARNING = 'WARNING' - + DEBUG = "DEBUG" + ERROR = "ERROR" + INFO = "INFO" + WARNING = "WARNING" class WriterType(StrEnum): - ALL_POSSIBLE_HAPLOTYPES = 'ALL_POSSIBLE_HAPLOTYPES' - CALLED_HAPLOTYPES = 'CALLED_HAPLOTYPES' - CALLED_HAPLOTYPES_NO_READS = 'CALLED_HAPLOTYPES_NO_READS' - NO_HAPLOTYPES = 'NO_HAPLOTYPES' - + ALL_POSSIBLE_HAPLOTYPES = "ALL_POSSIBLE_HAPLOTYPES" + CALLED_HAPLOTYPES = "CALLED_HAPLOTYPES" + CALLED_HAPLOTYPES_NO_READS 
= "CALLED_HAPLOTYPES_NO_READS" + NO_HAPLOTYPES = "NO_HAPLOTYPES" class ReferenceConfidenceMode(StrEnum): - BP_RESOLUTION = 'BP_RESOLUTION' - GVCF = 'GVCF' - NONE = 'NONE' - + BP_RESOLUTION = "BP_RESOLUTION" + GVCF = "GVCF" + NONE = "NONE" class FlowMode(StrEnum): - ADVANCED = 'ADVANCED' - NONE = 'NONE' - STANDARD = 'STANDARD' - + ADVANCED = "ADVANCED" + NONE = "NONE" + STANDARD = "STANDARD" class Implementation(StrEnum): - FLOWBASED = 'FlowBased' - FLOWBASEDHMM = 'FlowBasedHMM' - PAIRHMM = 'PairHMM' - + FLOWBASED = "FlowBased" + FLOWBASEDHMM = "FlowBasedHMM" + PAIRHMM = "PairHMM" class PairHMMImplementation(StrEnum): - AVX_LOGLESS_CACHING = 'AVX_LOGLESS_CACHING' - AVX_LOGLESS_CACHING_OMP = 'AVX_LOGLESS_CACHING_OMP' - EXACT = 'EXACT' - FASTEST_AVAILABLE = 'FASTEST_AVAILABLE' - LOGLESS_CACHING = 'LOGLESS_CACHING' - ORIGINAL = 'ORIGINAL' - + AVX_LOGLESS_CACHING = "AVX_LOGLESS_CACHING" + AVX_LOGLESS_CACHING_OMP = "AVX_LOGLESS_CACHING_OMP" + EXACT = "EXACT" + FASTEST_AVAILABLE = "FASTEST_AVAILABLE" + LOGLESS_CACHING = "LOGLESS_CACHING" + ORIGINAL = "ORIGINAL" class PCRErrorModel(StrEnum): - AGGRESSIVE = 'AGGRESSIVE' - CONSERVATIVE = 'CONSERVATIVE' - HOSTILE = 'HOSTILE' - NONE = 'NONE' - + AGGRESSIVE = "AGGRESSIVE" + CONSERVATIVE = "CONSERVATIVE" + HOSTILE = "HOSTILE" + NONE = "NONE" class SmithWatermanImplementation(StrEnum): - AVX_ENABLED = 'AVX_ENABLED' - FASTEST_AVAILABLE = 'FASTEST_AVAILABLE' - JAVA = 'JAVA' - + AVX_ENABLED = "AVX_ENABLED" + FASTEST_AVAILABLE = "FASTEST_AVAILABLE" + JAVA = "JAVA" class Mutect2(SnappyModel): @@ -265,11 +249,6 @@ class Mutect2(SnappyModel): germline_resource: str | None = None # No options for class FeatureInput """Population vcf of germline sequencing containing allele fractions""" - # Arguments that must be set by derived classes (pon & calling) - - # Panel of normals arguments - - # Calling-specific arguments # panel_of_normals: str | None = None # Was class FeatureInput @@ -293,7 +272,6 @@ class Mutect2(SnappyModel): # pair_hmm_results_file: bool = False # Was class GATKPath # """Write exact pairHMM inputs/outputs to for debugging purposes""" - # Optional arguments add_output_vcf_command_line: bool = True @@ -315,7 +293,7 @@ class Mutect2(SnappyModel): callable_depth: int = 10 """Minimum depth to be considered callable for Mutect stats. Does not affect genotyping""" disable_bam_index_caching: bool = False - """If true, don't cache bam indexes, this will reduce memory requirements but may harm performance if many intervals are specified. Caching is automatically disabled if there are no intervals specified""" + """If true, don"t cache bam indexes, this will reduce memory requirements but may harm performance if many intervals are specified. Caching is automatically disabled if there are no intervals specified""" disable_read_filter: list[DisableReadFilter] = [] """Read filters to be disabled before analysis""" dont_use_dragstr_pair_hmm_scores: bool = False @@ -381,7 +359,7 @@ class Mutect2(SnappyModel): read_validation_stringency: ValidationStringency = ValidationStringency.SILENT """Validation stringency for all SAM/BAM/CRAM/SRA files read by this program. 
The default stringency value SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded"""
     sites_only_vcf_output: bool = False
-    """If true, don't emit genotype fields when writing vcf file output"""
+    """If true, don't emit genotype fields when writing VCF file output"""
     tumor_lod_to_emit: float = 3.0
     """Log 10 odds threshold to emit variant to VCF"""
     use_jdk_deflater: bool = False
@@ -609,4 +587,3 @@ class Mutect2(SnappyModel):
     # """BAM sample name of normal(s), if any. May be URL-encoded as output by GetSampleName with -encode argument"""
     # tumor_sample: str | None = None
     # """BAM sample name of tumor. May be URL-encoded as output by GetSampleName with -encode argument"""
-

From 45b09a37dbcc06cefc5c29df2ded44d905aa6620 Mon Sep 17 00:00:00 2001
From: Eric Blanc
Date: Wed, 13 Nov 2024 17:37:51 +0100
Subject: [PATCH 13/46] refactor: Moving & renaming the library kit definition

---
 .../workflows/ngs_mapping/model.py            | 27 ++-----------------
 1 file changed, 2 insertions(+), 25 deletions(-)

diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py
index 56f00a7bc..2d3db5724 100644
--- a/snappy_pipeline/workflows/ngs_mapping/model.py
+++ b/snappy_pipeline/workflows/ngs_mapping/model.py
@@ -7,6 +7,7 @@ from pydantic import Field, field_validator, model_validator
 
 from snappy_pipeline.models import EnumField, SizeString, SnappyModel, SnappyStepModel
+from snappy_pipeline.models.library_kit import LibraryKit
 
 
 class DnaMapper(Enum):
@@ -49,30 +50,6 @@ class Tools(SnappyModel):
     """Required if long-read mapper used; otherwise, leave empty."""
 
 
-class TargetCoverageReportEntry(SnappyModel):
-    """
-    Mapping from enrichment kit to target region BED file, for either computing per--target
-    region coverage or selecting targeted exons.
-
-    The following will match both the stock IDT library kit and the ones
-    with spike-ins seen fromr Yale genomics. The path above would be
-    mapped to the name "default".
- - name: IDT_xGen_V1_0 - pattern: "xGen Exome Research Panel V1\\.0*" - path: "path/to/targets.bed" - """ - - name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])] - - pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])] - - path: Annotated[str, Field(examples=["path/to/targets.bed"])] - - -class TargetCoverageReport(SnappyModel): - path_target_interval_list_mapping: list[TargetCoverageReportEntry] = [] - - class BamCollectDoc(SnappyModel): enabled: bool = False window_length: Annotated[int, Field(gt=0)] = 1000 @@ -283,7 +260,7 @@ class NgsMapping(SnappyStepModel): path_link_in: str = "" """OPTIONAL Override data set configuration search paths for FASTQ files""" - target_coverage_report: TargetCoverageReport | None = None + target_coverage_report: LibraryKit | None = None """Thresholds for targeted sequencing coverage QC.""" bam_collect_doc: BamCollectDoc = BamCollectDoc() From 01e60c71bbec215cfd8ef3d16d0baea662a4b783 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 25 Nov 2024 15:20:03 +0100 Subject: [PATCH 14/46] feat: somatic cnv calling for wes & wgs [cnvkit tool only] --- snappy_pipeline/models/cnvkit.py | 245 ++- .../workflows/abstract/__init__.py | 6 +- .../workflows/somatic_cnv_calling/__init__.py | 1421 ++++++++++------- .../somatic_cnv_calling/cnvkit.rules | 202 +-- .../workflows/somatic_cnv_calling/model.py | 491 ++---- .../wrappers/cnvkit/access/wrapper.py | 6 +- .../wrappers/cnvkit/antitarget/wrapper.py | 25 +- .../wrappers/cnvkit/autobin/wrapper.py | 6 +- .../wrappers/cnvkit/bintest/environment.yaml | 1 + .../wrappers/cnvkit/bintest/wrapper.py | 32 + .../wrappers/cnvkit/call/wrapper.py | 72 +- .../wrappers/cnvkit/coverage/wrapper.py | 6 +- .../wrappers/cnvkit/fix/wrapper.py | 27 +- .../cnvkit/plot/scatter/environment.yaml | 1 + .../wrappers/cnvkit/plot/scatter/wrapper.py | 56 + .../wrappers/cnvkit/reference/wrapper.py | 27 +- .../report/genemetrics/environment.yaml | 1 + .../cnvkit/report/genemetrics/wrapper.py | 35 + .../cnvkit/report/metrics/environment.yaml | 1 + .../wrappers/cnvkit/report/metrics/wrapper.py | 29 + .../cnvkit/report/segmetrics/environment.yaml | 1 + .../cnvkit/report/segmetrics/wrapper.py | 33 + .../wrappers/cnvkit/segment/wrapper.py | 34 +- .../wrappers/cnvkit/target/wrapper.py | 26 +- tests/snappy_pipeline/workflows/conftest.py | 35 + .../test_workflow_somatic_cnv_calling.py | 580 +++++++ 26 files changed, 2172 insertions(+), 1227 deletions(-) create mode 120000 snappy_wrappers/wrappers/cnvkit/bintest/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py create mode 120000 snappy_wrappers/wrappers/cnvkit/plot/scatter/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py create mode 120000 snappy_wrappers/wrappers/cnvkit/report/genemetrics/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py create mode 120000 snappy_wrappers/wrappers/cnvkit/report/metrics/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py create mode 120000 snappy_wrappers/wrappers/cnvkit/report/segmetrics/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py create mode 100644 tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py index 402e567a6..3e9be3f55 100644 --- a/snappy_pipeline/models/cnvkit.py +++ b/snappy_pipeline/models/cnvkit.py @@ -6,6 +6,61 @@ from 
snappy_pipeline.models import SnappyModel
+# Parameters for each action & those shared between actions
+# param_table = {
+#     "shared": {
+#         "short_names": bool,
+#         "drop_low_coverage": bool,
+#         "male_reference": bool,
+#         "sample_sex": Enum,
+#         "zygocity_freq": float,
+#         "min_variant_depth": int,
+#         "diploid_parx_genome": str,
+#         "normal_id": str,
+#         "sample_id": str,
+#         "cluster": bool,
+#     },
+#     "access": {"min_gap_size": int, "exclude": list},
+#     "antitarget": {"avg_size": int, "min_size": float},
+#     "autobin": {
+#         "method": Enum,
+#         "bp_per_bin": float,
+#         "antitarget_max_size": int,
+#         "antitarget_min_size": int,
+#         "target_max_size": int,
+#         "target_min_size": int,
+#     },
+#     "bintest": {"target": bool, "alpha": float},
+#     "call": {
+#         "center": Enum,
+#         "filter": Enum,
+#         "method": Enum,
+#         "center_at": float,
+#         "purity": float,
+#         "ploidy": float,
+#         "thresholds": list,
+#     },
+#     "coverage": {"count": bool, "min_mapq": int},
+#     "fix": {"smoothing_window_fraction": float},
+#     "genemetrics": {"alpha": float, "threshold": float, "bootstrap": int},
+#     "metrics": {},
+#     "reference": {"min_cluster_size": int},
+#     "segment": {
+#         "smooth_cbs": bool,
+#         "method": Enum,
+#         "threshold": float,
+#         "drop_outliers": int,
+#     },
+#     "segmetrics": {
+#         "alpha": float,
+#         "threshold": float,
+#         "bootstrap": int,
+#         "min_probes": int,
+#     },
+#     "target": {"split": bool, "avg_size": float},
+# }
+
+
 class SexOrigin(enum.StrEnum):
     AUTOMATIC = "auto"
     """Sex determined from the data"""
@@ -33,51 +88,72 @@ def ensure_valid_sex_value(self):
 class SegmentationMethod(enum.StrEnum):
-    cbs = "cbs"
-    flasso = "flasso"
-    haar = "haar"
-    hmm = "hmm"
-    hmm_tumor = "hmm-tumor"
-    hmm_germline = "hmm-germline"
-    none = "none"
+    CBS = "cbs"
+    FLASSO = "flasso"
+    HAAR = "haar"
+    HMM = "hmm"
+    HMM_TUMOR = "hmm-tumor"
+    HMM_GERMLINE = "hmm-germline"
+    NONE = "none"
 
 
 class CenterMethod(enum.StrEnum):
-    mean = "mean"
-    median = "median"
-    mode = "mode"
-    biweight = "biweight"
+    MEAN = "mean"
+    MEDIAN = "median"
+    MODE = "mode"
+    BIWEIGHT = "biweight"
 
 
 class FilterMethod(enum.StrEnum):
-    ampdel = "ampdel"
-    cn = "cn"
-    ci = "ci"
-    sem = "sem"
+    AMPDEL = "ampdel"
+    CN = "cn"
+    CI = "ci"
+    SEM = "sem"
 
 
 class CallingMethod(enum.StrEnum):
-    threshold = "threshold"
-    clonal = "clonal"
-    none = ""
+    THRESHOLD = "threshold"
+    CLONAL = "clonal"
+    NONE = "none"
 
 
 class Access(SnappyModel):
     exclude: list[str] = []
     """Regions accessible to mapping"""
-    min_gap_size: int = 5000
-    """Minimum gap size between accessible sequence regions. Regions separated by less than this distance will be joined together."""
+    min_gap_size: int | None = None
+    """
+    Minimum gap size between accessible sequence regions. Regions separated by less than this distance will be joined together.
+
+    In WGS mode, the _target_ regions are set to the accessible regions in the genome.
+    These accessible regions can be provided by the user, or computed by the `access`
+    module. In the latter case, the optimal bin size is computed by the `autobin` module
+    unless this value is provided by the user.
+    `autobin` uses the `wgs` method _only_ if the list of excluded regions is empty and if
+    the `min_gap_size` parameter remains unassigned. If any of these conditions is not met,
+    or if a file of accessible regions is provided by the user, then the `amplicon` method
+    is used.
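+
+    A minimal sketch of the corresponding configuration (hypothetical YAML,
+    assuming the model fields map one-to-one onto configuration keys):
+
+        access:
+          exclude: []
+          # min_gap_size left unset, so `autobin` can use the `wgs` method
+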
+ It is recommended to leave the excluded regions empty and not set the `min_gap_size` + parameter for WGS data, unless the accessible regions are much reduced (for example excluding + all intergenic regions, repeats, low complexity, ...) + """ class Target(SnappyModel): - path_baits: str | None = None - """Path to baits file (Agilent Covered), unset for WGS data""" split: bool = True """Split large tiled intervals into smaller, consecutive targets.""" - avg_size: float = 800 / 3 - """Average size of split target bins (results are approximate)""" - short_names: bool = False - """Reduce multi-accession bait labels to be short and consistent""" + avg_size: float | None = None + """ + Average size of split target bins (results are approximate). + + When the parameter is left unassigned, the cnvkit default is used for WES data, + and an optimal value is computed for WGS data, if there is data for normal control(s). + """ + short_names: bool = True + """ + Reduce multi-accession bait labels to be short and consistent. + + Only valid when a gff/gtf features file is defined in the static part of the configuration. + """ class Antitarget(SnappyModel): @@ -101,9 +177,13 @@ class Fix(SnappyModel): class Segment(SnappyModel): method: SegmentationMethod = SegmentationMethod.CBS - """Segmentation method, or 'NONE' for chromosome arm-level averages as segments""" - threshold: float = 0.0001 - """Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation. For HMM methods, this is the smoothing window size.""" + """Segmentation method, or 'none' for chromosome arm-level averages as segments""" + threshold: float + """ + Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation. + + For HMM methods, this is the smoothing window size. + """ drop_outliers: int = 10 """Drop outlier bins more than this many multiples of the 95th quantile away from the average within a rolling window. Set to 0 for no outlier filtering.""" smooth_cbs: bool = False @@ -116,21 +196,25 @@ def ensure_smooth_for_cbs_only(self) -> Self: class Call(SnappyModel): - method: CallingMethod | None = None + method: CallingMethod = CallingMethod.THRESHOLD """Calling method.""" thresholds: list[float] = [-1.1, -0.25, 0.2, 0.7] - """Hard thresholds for calling each integer copy number, separated by commas""" - center: CenterMethod = CenterMethod.MEDIAN + """Hard thresholds for calling each integer copy number""" + center: CenterMethod | None = None """Re-center the log2 ratio values using this estimator of the center or average value. ('median' if no argument given.)""" center_at: float | None = None - """Subtract a constant number from all log2 ratios. For "manual" re-centering.""" + """ + Subtract a constant number from all log2 ratios. For "manual" re-centering. + + When this parameter is set, the centering method should be left empty. 
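+
+    Example (illustrative): `center_at: 0.15` subtracts 0.15 from every log2 ratio.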
+ """ filter: FilterMethod | None = None """Merge segments flagged by the specified filter(s) with the adjacent segment(s).""" @model_validator(mode="after") - def avoid_center_center_at_conflict(self) -> Self: - if self.center is not None and self.center_at is not None: - raise ValueError("'call' options 'center' and 'center_at' cannot be used together") + def ensure_center_without_center_at(self) -> Self: + if self.center_at is not None and self.center is not None: + raise ValueError("'center' and 'center_at' parameters cannot be used together") return self @@ -157,9 +241,11 @@ class PlotDiagram(Plot): class PlotScatter(Plot): path_range_list: str | None = None - """File listing the chromosomal ranges to display, as BED, interval list or 'chr:start-end' text""" + """File listing the chromosomal ranges to display, as BED, interval list or 'chr:start-end' text (currently not implemented)""" + chromosome: str | None = None + """Name of the chromosome to display (whole genome if empty)""" gene: str | None = None - """Name of gene or genes (comma-separated) to display.""" + """Name of gene or genes (comma-separated) to display (currently not implemented)""" width: int = 1000000 """Width of margin to show around the selected gene(s)""" antitarget_marker: str = "o" @@ -188,6 +274,21 @@ class Report(SnappyModel): enabled: bool = True +class ReportStats(enum.StrEnum): + MEAN = "mean" + MEDIAN = "median" + MODE = "mode" + T_TEST = "t-test" + STDEV = "stdev" + SEM = "sem" + MAD = "mad" + MSE = "mse" + IQR = "iqr" + BIVAR = "bivar" + CI = "ci" + PI = "pi" + + class ReportSegmetrics(Report): alpha: float = 0.05 """Level to estimate confidence and prediction intervals; use with --ci and --pi.""" @@ -195,6 +296,20 @@ class ReportSegmetrics(Report): """Number of bootstrap iterations to estimate confidence interval; use with --ci.""" smooth_bootstrap: bool = False """Apply Gaussian noise to bootstrap samples, a.k.a. 
smoothed bootstrap, to estimate confidence interval""" + stats: list[ReportStats] = [ + ReportStats.MEAN, + ReportStats.MEDIAN, + ReportStats.MODE, + ReportStats.T_TEST, + ReportStats.STDEV, + ReportStats.SEM, + ReportStats.MAD, + ReportStats.MSE, + ReportStats.IQR, + ReportStats.BIVAR, + ReportStats.CI, + ReportStats.PI, + ] class ReportGenemetrics(Report): @@ -206,24 +321,33 @@ class ReportGenemetrics(Report): """Copy number change threshold to report a gene gain/loss""" min_probes: int = 3 """Minimum number of covered probes to report a gain/loss""" - - -class Report(enum.StrEnum): - GENEMETRICS = "genemetrics" - SEGMETRICS = "segmetrics" + stats: list[ReportStats] = [ + ReportStats.MEAN, + ReportStats.MEDIAN, + ReportStats.MODE, + ReportStats.T_TEST, + ReportStats.STDEV, + ReportStats.SEM, + ReportStats.MAD, + ReportStats.MSE, + ReportStats.IQR, + ReportStats.BIVAR, + ReportStats.CI, + ReportStats.PI, + ] class CnvkitToReference(SnappyModel): # Substep-secific parameters - access: Access - target: Target - antitarget: Antitarget + access: Access = Access() + target: Target = Target() + antitarget: Antitarget = Antitarget() - coverage: Coverage + coverage: Coverage = Coverage() - metrics: Report - segmetrics: ReportSegmetrics - genemetrics: ReportGenemetrics + metrics: Report = Report() + segmetrics: ReportSegmetrics = ReportSegmetrics() + genemetrics: ReportGenemetrics = ReportGenemetrics() # Generic parameters (used in different substeps & must agree) male_reference: bool = False @@ -236,11 +360,11 @@ class CnvkitToReference(SnappyModel): min_cluster_size: int = 4 """Minimum cluster size to keep in reference profiles.""" - gc: bool = False + gc: bool = True """Skip GC correction.""" - edge: bool = None + edge: bool | None = None """Skip edge correction. Automatic selection when None (True for WGS & Panel, False for WES)""" - rmask: bool = False + rmask: bool = True """Skip RepeatMasker correction.""" drop_low_coverage: bool = False @@ -258,15 +382,10 @@ def ensure_males_for_reference(self): class Cnvkit(CnvkitToReference): - fix: Fix + fix: Fix = Fix() segment: Segment - call: Call - bintest: Bintest - - diagram: PlotDiagram - scatter: PlotScatter + call: Call = Call() + bintest: Bintest = Bintest() - min_variant_depth: int = 20 - """Minimum read depth for a SNV to be displayed in the b-allele frequency plot.""" - zygocity_freq: float = 0.25 - """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" + diagram: PlotDiagram = PlotDiagram() + scatter: PlotScatter = PlotScatter() diff --git a/snappy_pipeline/workflows/abstract/__init__.py b/snappy_pipeline/workflows/abstract/__init__.py index 6e1e66367..77a3bd1fd 100644 --- a/snappy_pipeline/workflows/abstract/__init__.py +++ b/snappy_pipeline/workflows/abstract/__init__.py @@ -176,7 +176,7 @@ def _get_resource(wildcards: Wildcards = None, input: InputFiles = None) -> Any: return _get_resource - def get_args(self, action: str) -> Inputs | Callable[[Wildcards], Inputs]: + def get_args(self, action: str) -> Inputs | Callable[[Wildcards, InputFiles], Inputs]: """Return args for the given action of the sub step""" raise NotImplementedError("Called abstract method. 
Override me!") # pragma: no cover @@ -873,7 +873,9 @@ def register_sub_workflow( ) self.sub_workflows[sub_workflow_name] = self.workflow.globals[sub_workflow_name] - def get_args(self, sub_step: str, action: str) -> Inputs | Callable[[Wildcards], Inputs]: + def get_args( + self, sub_step: str, action: str + ) -> Inputs | Callable[[Wildcards, InputFiles], Inputs]: """Return arguments for action of substep with given wildcards Delegates to the sub step object's get_args function diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py index 70bd2c447..7397391ef 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py @@ -154,11 +154,14 @@ import os import os.path import re -import typing +from copy import deepcopy +from enum import Enum +from typing import Callable, Iterator, Iterable, NamedTuple, Any from biomedsheets.models import BioEntity, BioSample, TestSample, NGSLibrary from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background -from snakemake.io import OutputFiles, Wildcards +from biomedsheets.io_tsv.base import LIBRARY_TYPES, LIBRARY_TO_EXTRACTION, EXTRACTION_TYPE_DNA +from snakemake.io import OutputFiles, Wildcards, InputFiles from snappy_pipeline.utils import dictify from snappy_pipeline.workflows.abstract import ( @@ -169,8 +172,11 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from snappy_pipeline.models.cnvkit import SegmentationMethod as CnvkitSegmentationMethod + from .model import SomaticCnvCalling as SomaticCnvCallingConfigModel -from .model import Sex, LibraryKitDefinition, PanelOfNormalsOrigin +from .model import Cnvkit as CnvkitConfig +from .model import Sex, SexOrigin, SexValue, PanelOfNormalsOrigin, PurityOrigin, VariantOrigin __author__ = "Eric Blanc " @@ -204,18 +210,9 @@ class SomaticCnvCallingStepPart(BaseStepPart): def __init__(self, parent: "SomaticCnvCallingWorkflow"): super().__init__(parent) - def _get_sample_sex(self, library_name: str) -> Sex: - if self.config.sex == Sex.MALE or self.config.sex == Sex.FEMALE: - sample_sex = self.config.sex - elif self.config.sex == Sex.SAMPLESHEET and library_name in self.parent.sex: - sample_sex = self.parent.sex[library_name] - else: - sample_sex = Sex.UNKNOWN - return sample_sex - @staticmethod @dictify - def _get_log_file_from_prefix(prefix: str) -> typing.Iterator[typing.Dict[str, str]]: + def _get_log_file_from_prefix(prefix: str) -> Iterator[dict[str, str]]: key_ext = ( ("log", ".log"), ("sh", ".sh"), @@ -236,20 +233,19 @@ class CnvKitStepPart(SomaticCnvCallingStepPart): #: Class available actions actions = ( "access", + "autobin", "target", "antitarget", "coverage", "reference", - "flat_reference_panel", - "flat_reference_wgs", "fix", "segment", "call", "bintest", - "plot/diagram", - "plot/scatter", - "report/metrics", - "report/segmetrics", + "scatter", + "metrics", + "genemetrics", + "segmetrics", ) # Overwrite defaults @@ -258,518 +254,767 @@ class CnvKitStepPart(SomaticCnvCallingStepPart): def __init__(self, parent: SomaticCnvCallingStepPart): super().__init__(parent) - def get_input_files(self, action: str) -> typing.Callable: + self.is_wgs = ( + any([libraryKit is None for libraryKit in self.parent.tumors.keys()]) + and self.name in self.config.tools.wgs + ) + self.is_wes = ( + any([libraryKit is not None for libraryKit in self.parent.tumors.keys()]) + and self.name in self.config.tools.wes + ) + 
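+        # A cohort must resolve to exactly one mode: WGS (no library kit attached) or
+        # WES (library kit defined), with cnvkit enabled for that mode in the config;
+        # the assert below rejects mixed cohorts, which this implementation cannot handle.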
assert not (self.is_wgs and self.is_wes), "WES & WGS are mixed" + + if self.is_wgs or self.is_wes: + assert ( + len(self.parent.tumors) == 1 + ), "Current cnvkit tool implementation can't handle multiple library types or kits" + + self.libraryKit = list(self.parent.tumors.keys())[0] + self.tumors = {x.library.name: x for x in self.parent.tumors[self.libraryKit]} + + self.cfg: CnvkitConfig = self.config.get(self.name) + self.pon_source = ( + self.cfg.panel_of_normals.source if self.cfg.panel_of_normals.enabled else None + ) + + self._set_cnvkit_pipeline_logic() + + self.path_baits = self._get_path_baits() + + if ( + self.cfg.somatic_purity_ploidy_estimate.enabled + and self.cfg.somatic_purity_ploidy_estimate.source == PurityOrigin.SAMPLESHEET + ): + assert not any( + [x.purity is None for x in self.tumors.values()] + ), "Missing purity value from samplesheet" + + self.base_out = "work/{mapper}.cnvkit/out/cnvkit." + self.base_out_lib = ( + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}." + ) + + def _set_cnvkit_pipeline_logic(self): + """ + Creates instance variables to choose path in cnvkit pipeline + + Access: regions accessible for CNV calling (unmasked) + path_access or when missing build from genome reference + optional list of excluded region + + Target: regions of good coverage + From baits (WES) or accessible regions (WGS) + estimate of target size from config or autobin step + + Antitarget: regions of low coverage + antitarget = access - target, only WES, otherwise empty + + Reference: + Flat: based on targets & antitargets only + Cohort: from panel_of_normals step + File: from another cohort or public data (reference + target + antitarget [WES only]) + Paired (panel of normal disabled): reference built from the target & antitarget coverage of one normal sample only (paired with the tumor) + + Therefore, a reference must be created for flat & paired choices (one reference per normal sample in the latter case). 
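+
+        Example (hypothetical WGS cohort): panel of normals disabled and `target.avg_size`
+        unset resolves to paired=True, build_ref=True, compute_avg_target_size=True,
+        and create_access=True unless `path_access` is provided.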
+ The logic to create the reference is (panel of normal is pon): + - access created if path_access is missing or average target size estimated + - average target size estimated if value not in config and dataset is WGS + - target created always + - antitarget created when dataset is WES + """ + self.paired = not self.cfg.panel_of_normals.enabled + self.build_ref = self.paired or self.pon_source == PanelOfNormalsOrigin.FLAT + self.compute_avg_target_size = ( + self.is_wgs and self.paired and self.cfg.target.avg_size is None + ) + self.create_access = self.build_ref and (not self.cfg.path_access) + self.plain_access = ( + not self.cfg.path_access + and len(self.cfg.access.exclude) == 0 + and self.cfg.access.min_gap_size is None + ) + + self.variants_from_cohort = ( + self.cfg.somatic_variant_calling.enabled + and self.cfg.somatic_variant_calling.source == VariantOrigin.COHORT + ) + self.variants_from_file = ( + self.cfg.somatic_variant_calling.enabled + and self.cfg.somatic_variant_calling.source == VariantOrigin.FILE + ) + + def _get_sample_sex(self, library_name: str | None) -> SexValue | None: + if self.cfg.sample_sex.source == SexOrigin.SAMPLESHEET and library_name: + sample_sex = self.tumors[library_name].sex + elif self.cfg.sample_sex.source == SexOrigin.CONFIG: + sample_sex = self.cfg.sample_sex.default + else: + sample_sex = None + return sample_sex + + def _get_path_baits(self) -> str | None: + if not self.is_wes: + return None + default = None + for item in self.cfg.path_target_interval_list_mapping: + if item.name == self.libraryKit: + return item.path + elif item.name == "__default__": + default = item.path + if default is None: + raise ValueError(f"Missing library kit definition for {self.libraryKit}") + return default + + def get_input_files(self, action: str) -> Callable: """Return input paths input function, dependent on rule""" # Validate action self._validate_action(action) return getattr(self, "_get_input_files_{}".format(action.replace("/", "_"))) - def get_params(self, action: str) -> typing.Callable: + def get_args(self, action: str) -> Callable: """Return parameters input function, dependent on rule""" # Validate action self._validate_action(action) - return getattr(self, "_get_params_{}".format(action.replace("/", "_"))) + return getattr(self, "_get_args_{}".format(action.replace("/", "_"))) - def get_output_files(self, action: str) -> typing.Callable: - """Return input paths input function, dependent on rule""" - # Validate action + @dictify + def get_output_files(self, action: str): + """ + Return output paths, dependent on rule + + It is important to take good care of wildcards, because + when a paired reference is used on WGS without setting the avg target size, + the output of autobin and target are built for the normal library. + So in this case, library_name stands for the normal library, rather than + for the tumor. + """ self._validate_action(action) - f = getattr(self, "_get_output_files_{}".format(action.replace("/", "_"))) - return f() - def get_log_file(self, action: str) -> typing.Dict[str, str]: - """Return log files, dependent on rule""" + base_report_lib = ( + "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}." 
+ ) + + output_files = {} + match action: + case "access": + output_files = {"access": self.base_out + "access.bed"} + case "autobin": + output_files = {"result": self.base_out_lib + "autobin.txt"} + case "target": + if self.compute_avg_target_size and self.paired: + output_files = {"target": self.base_out_lib + "target.bed"} + else: + output_files = {"target": self.base_out + "target.bed"} + case "antitarget": + output_files = {"antitarget": self.base_out + "antitarget.bed"} + case "coverage": + output_files = {"coverage": self.base_out_lib + "{region,(target|antitarget)}.cnn"} + case "reference": + if self.paired: + output_files = {"reference": self.base_out_lib + "reference.cnn"} + else: + output_files = {"reference": self.base_out + "reference.cnn"} + case "fix": + output_files = {"ratios": self.base_out_lib + "cnr"} + case "segment": + output_files = { + "segments": self.base_out_lib + "segments.cns", + "dataframe": self.base_out_lib + "rds", + } + case "call": + output_files = {"calls": self.base_out_lib + "cns"} + case "bintest": + output_files = {"tests": self.base_out_lib + "bintest.cns"} + case "metrics": + output_files = {"report": base_report_lib + "metrics.tsv"} + case "segmetrics": + output_files = {"report": base_report_lib + "segmetrics.tsv"} + case "genemetrics": + output_files = {"report": base_report_lib + "genemetrics.tsv"} + case "scatter": + output_files = { + "plot": "work/{mapper}.cnvkit.{library_name}/plot/{mapper}.cnvkit.{library_name}.scatter.{contig_name}.jpeg" + } + + for k, v in output_files.items(): + yield k, v + yield k + "_md5", v + ".md5" + + @dictify + def get_log_file(self, action): + """Return panel of normal files""" # Validate action self._validate_action(action) - base_name = os.path.join("work", f"{{mapper}}.{self.name}.{{library_name}}", "log") - # Access, target & antitarget steps are cohort-wide, the others are library-dependent - if action in ("access",): - prefix = f"work/{self.name}/log/{action}" - elif action in ("target", "antitarget"): - prefix = f"work/{self.name}/log/{action}" + ".{panel_name}" - elif action in ("coverage",): - prefix = os.path.join(base_name, action + ".{region}") + + base_log = "work/{mapper}.cnvkit/log/cnvkit." + base_log_lib = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}." 
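+        # For example (hypothetical wildcards: mapper=bwa, library_name=P001-T1),
+        # the "fix" action logs under the prefix:
+        #   work/bwa.cnvkit.P001-T1/log/bwa.cnvkit.P001-T1.fix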
+ + if action in ("access", "antitarget"): + tpl = base_log + action elif action in ( - "reference", + "autobin", "fix", "segment", "call", "bintest", - "report/metrics", - "report/segmetrics", + "metrics", + "segmetrics", + "genemetrics", ): - prefix = os.path.join(base_name, action.replace("/", "_")) - elif action in ("plot/diagram", "plot/scatter"): - prefix = os.path.join(base_name, action.replace("/", "_") + ".{contig_name}") - elif action == "flat_reference_panel": - prefix = f"work/{{mapper}}.{self.name}/log/reference.{{panel_name}}" - elif action == "flat_reference_wgs": - prefix = f"work/{{mapper}}.{self.name}/log/reference" - return SomaticCnvCallingStepPart._get_log_file_from_prefix(prefix) - - def get_result_files(self, library_name: str, mapper: str) -> typing.List[str]: + tpl = base_log_lib + action + elif action == "target": + if self.compute_avg_target_size and self.paired: + tpl = base_log_lib + "target" + else: + tpl = base_log + "target" + elif action == "reference": + if self.paired: + tpl = base_log_lib + "reference" + else: + tpl = base_log + "reference" + elif action == "coverage": + tpl = base_log_lib + "{region,(target|antitarget)}coverage" + elif action in ("scatter",): + tpl = base_log_lib + action + ".{contig_name}" + else: + raise ValueError(f"Logs of action '{action}' not implemented yet") + + for key, ext in ( + ("conda_list", ".conda_list.txt"), + ("conda_info", ".conda_info.txt"), + ("log", ".log"), + ("sh", ".sh"), + ): + yield key, tpl + ext + yield key + "_md5", tpl + ext + ".md5" + + def get_result_files(self, library_name: str, mapper: str) -> list[str]: """Files to symlink to output""" - base_name = f"{mapper}.{self.name}.{library_name}" - result_files = [] - # Tumor samples - if library_name in self.parent.normal_library: - # Main results - prefix = os.path.join("output", base_name, "out", base_name) - for suffix in ("cnr", "segments.cns", "cns", "bintest.cnr"): - result_files.append(prefix + "." + suffix) - # Log files - prefix = os.path.join("output", base_name, "log") - for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): - result_files.append(os.path.join(prefix, f"coverage.target.{ext}")) - result_files.append(os.path.join(prefix, f"coverage.antitarget.{ext}")) - for suffix in ("fix", "segment", "call", "bintest"): - for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): - result_files.append(prefix + "/" + suffix + "." + ext) - # Log of reference is no panel of normals - # if not self.config[self.name]["panel_of_normals"]["enabled"]: - # normal_library = self.parent.normal_library[library_name] - # prefix = os.path.join("output", f"{mapper}.{self.name}.{normal_library}", "log", f"{mapper}.{self.name}.{normal_library}.reference") - # for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): - # result_files.append(prefix + "." + ext) - # Reports - if "reports" in self.config[self.name]: - prefix = os.path.join("output", base_name, "report", base_name) - for report in ("metrics", "segmetrics"): - if report in self.config[self.name]["reports"]: - result_files.append(prefix + "." 
+ report + ".tsv") - # Plots (per chromosome) - if "plots" in self.config[self.name]: - prefix = os.path.join("output", base_name, "plot") - for plot in ("diagram", "scatter"): - if plot in self.config[self.name]["plots"]: - for contig in self.parent.contigs: - result_files.append(os.path.join(prefix, plot, contig + ".png")) - # else: # Normal samples - # prefix = os.path.join("output", base_name, "log", "reference") - # for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): - # result_files.append(prefix + "." + ext) - return result_files + base_out_lib = ( + "output/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}." + ).format(mapper=mapper, library_name=library_name) - # ----- Access -------------------------------------------------------------------------------- + base_report_lib = ( + "output/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}." + ).format(mapper=mapper, library_name=library_name) - def _get_input_files_access(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return None + base_plot_lib = ( + "output/{mapper}.cnvkit.{library_name}/plot/{mapper}.cnvkit.{library_name}." + ).format(mapper=mapper, library_name=library_name) - def _get_params_access(self, wildcards: Wildcards) -> typing.Dict[str, str]: - params = {"reference": self.w_config.static_data_config.reference.path} - params["min_gap_size"] = self.config[self.name]["access"]["min_gap_size"] - access = self.config[self.name]["access"].get("exclude", None) - if access: - params["access"] = access + result_files = [] - def _get_output_files_access(self) -> typing.Dict[str, str]: - return {"access": f"work/{self.name}/out/access.bed"} + for suffix in ("cnr", "segments.cns", "cns", "bintest.cns"): + result_files.append(base_out_lib + suffix) - # ----- Target -------------------------------------------------------------------------------- + actions_to_log = ("fix", "segment", "call", "bintest") + for action in actions_to_log: + result_files += [ + path.replace("work", "output", 1).format(mapper=mapper, library_name=library_name) + for path in filter( + lambda p: not p.endswith(".md5"), self.get_log_file(action).values() + ) + ] - def _get_input_files_target(self, wildcards: Wildcards) -> typing.Dict[str, str]: - for panel in self.config.path_target_interval_list_mapping: - if panel.name == wildcards.panel_name: - return {"region": panel.path} + # Logs of metrics not linked + for report in ("metrics", "segmetrics", "genemetrics"): + if self.cfg.get(report).get("enabled"): + result_files.append(base_report_lib + report + ".tsv") - def _get_params_target(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "split": self.config[self.name]["target"]["split"], - "avg_size": self.config[self.name]["target"]["avg_size"], - } + # Logs of plots not links + # TODO: Mouse date: only chromosomes 1 to 19 + chrs = ["all"] + list(map(str, range(1, 23))) + ["X"] + if ( + self.cfg.sample_sex.source != SexOrigin.CONFIG + or self.cfg.sample_sex.default == SexValue.FEMALE + ): + chrs.append("Y") - def _get_output_files_target(self) -> typing.Dict[str, str]: - return {"region": f"work/{self.name}/out/{{panel_name}}_target.bed"} + for plot in ("scatter",): + if self.cfg.get(plot).get("enabled"): + for chr in chrs: + result_files.append(base_plot_lib + f"{plot}.{chr}.jpeg") - # ----- Antitarget ---------------------------------------------------------------------------- + result_files += [x + ".md5" for x in result_files] + return result_files - def _get_input_files_antitarget(self, 
wildcards: Wildcards) -> typing.Dict[str, str]: - # No antitarget for WGS - return { - "target": f"work/{self.name}/out/{wildcards.panel_name}_target.bed", - "access": f"work/{self.name}/out/access.bed", - } + # ----- Access -------------------------------------------------------------------------------- - def _get_params_antitarget(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "avg_size": self.config[self.name]["antitarget"]["avg_size"], - "min_size": self.config[self.name]["antitarget"]["min_size"], + def _get_input_files_access(self, wildcards: Wildcards) -> dict[str, str]: + assert self.create_access, "Should not build access, already available" + return {} + + def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + """ + Arguments used to compute accessible regions for mapping + + When accessible regions are needed to compute average target size + (WGS without average target size set in the config) + then accessible region must cover the full genome (except masked). + Otherwise, access is built with excluded regions. + This happens when the average target size is set in the config in WGS, + or for WES. + """ + assert self.create_access, "Should not build access, already available" + return dict(input) | { + "reference": self.w_config.static_data_config.reference.path, + "min-gap-size": self.cfg.access.min_gap_size, + "exclude": self.cfg.access.exclude, } - def _get_output_files_antitarget(self) -> typing.Dict[str, str]: - return {"region": f"work/{self.name}/out/{{panel_name}}_antitarget.bed"} + # ----- Autobin (never used directly, only to compute target size in WGS settings) ------------ - # ----- Coverage ------------------------------------------------------------------------------ + def _get_input_files_autobin(self, wildcards: Wildcards) -> dict[str, str]: + """ + Input files used to get a good estimate of the average target size - def _get_input_files_coverage(self, wildcards: Wildcards) -> typing.Dict[str, str]: - # BAM/BAI file + This is only used for WGS data when the average target size isn't set in the config. 
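+        `autobin` picks the average bin size so that each bin collects roughly
+        `bp-per-bin` mapped bases from the normal BAM (hard-coded to 50000 in
+        `_get_args_autobin` below).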
+ The access must be computed over the whole genome (no exclude files) + """ + assert wildcards["library_name"] not in self.tumors, "Autobin always computed on normals" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) - input_files = { - "bam": ngs_mapping(base_path + ".bam"), - "bai": ngs_mapping(base_path + ".bam.bai"), - } + tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}.bam".format(**wildcards) + input_files = {"bams": [ngs_mapping(tpl)]} + if self.create_access: + if self.plain_access: + input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + else: + input_files["target"] = self.base_out.format(**wildcards) + "access.bed" + return input_files - # Region (target or antitarget) file - panel = self.parent.libraryKit.get(wildcards.library_name, None) - if panel is None: - input_files["region"] = f"work/{self.name}/out/access.bed" + def _get_args_autobin(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + assert ( + self.compute_avg_target_size + ), "Trying to estimate average target size for non-WGS samples" + args = dict(input) | {"bp-per-bin": 50000} + if self.plain_access: + args["method"] = "wgs" else: - input_files["region"] = f"work/{self.name}/out/{panel.name}_{wildcards.region}.bed" - return input_files + args["method"] = "amplicon" + if "target" not in args: + args["target"] = self.cfg.path_access + return args - def _get_params_coverage(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "fasta": self.w_config.static_data_config.reference.path, - "count": self.config[self.name]["coverage"]["count"], - "min_mapq": self.config[self.name]["coverage"]["min_mapq"], - "processes": self.default_resource_usage.threads, - } + # ----- Target -------------------------------------------------------------------------------- - def _get_output_files_coverage(self) -> typing.Dict[str, str]: - return {"coverage": f"work/{{mapper}}.{self.name}.{{library_name}}/out/{{region}}.cnn"} + def _get_input_files_target(self, wildcards: Wildcards) -> dict[str, str]: + """Input files to compute the target regions - # ----- Reference ----------------------------------------------------------------------------- + For WES, no input files, it comes from the baits (in arguments) or + the pon, a previously computed file or the baits (no reference needed) - def _get_input_files_reference(self, wildcards: Wildcards) -> typing.Dict[str, str]: - """Builds reference from the paired normal, or flat prior in absence of normal""" + For WGS, target is access, with avg size from the config, or 5000 when + no normal is available (flat prior) or autobin-computed avg size when paired. 
+ In the latter case, the access must be computed from whole genome + (no exclude, no min_avg_size) + """ + assert self.build_ref, "Should not build targets, already available" input_files = {} - normal_library = self.parent.normal_library.get(wildcards.library_name, None) - input_files["normals"] = [ - f"work/{wildcards.mapper}.{self.name}.{normal_library}/out/target.cnn", - f"work/{wildcards.mapper}.{self.name}.{normal_library}/out/antitarget.cnn", - ] + if self.is_wgs: + if self.create_access: + input_files["interval"] = self.base_out.format(**wildcards) + "access.bed" + if self.compute_avg_target_size: + input_files["avg-size"] = self.base_out_lib.format(**wildcards) + "autobin.txt" return input_files - def _get_params_reference(self, wildcards: Wildcards) -> typing.Dict[str, str]: - params = { - "fasta": self.w_config.static_data_config.reference.path, - "cluster": self.config[self.name]["reference"]["cluster"], - "min_cluster_size": self.config[self.name]["reference"]["min_cluster_size"], - "male_reference": self.config[self.name]["use_male_reference"], - "no_gc": self.config[self.name]["reference"]["no_gc"], - "no_edge": self.config[self.name]["reference"]["no_edge"], - "no_rmask": self.config[self.name]["reference"]["no_rmask"], - } - sample_sex = self._get_sample_sex(wildcards.library_name) - if sample_sex == Sex.MALE or sample_sex == Sex.FEMALE: - params["sample_sex"] = str(sample_sex) - return + def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + assert self.build_ref, "Should not build targets, already available" + if self.is_wes: + args = { + "avg-size": self.cfg.target.avg_size, + "split": self.cfg.target.split, + "interval": self.path_baits, + } + else: + assert self.is_wgs, "Panel not implemented yet" + args = dict(input) | {"split": self.cfg.target.split} + if args.get("avg-size", None) is not None: + args["avg-size"] = self._read_autobin_output(args["avg-size"]) + elif self.cfg.target.avg_size is not None: + args["avg-size"] = self.cfg.target.avg_size + else: + args["avg-size"] = 5000 + if self.w_config.static_data_config.get("features", None): + args["annotate"] = self.w_config.static_data_config.features.path + args["short-names"] = self.cfg.target.short_names + return args - def _get_output_files_reference(self) -> typing.Dict[str, str]: - """TODO: flat prior reference should be library-independent""" - return {"reference": f"work/{{mapper}}.{self.name}.{{library_name}}/out/reference.cnn"} + # ----- Antitarget ---------------------------------------------------------------------------- - def _get_input_files_flat_reference_panel(self, wildcards: Wildcards) -> typing.Dict[str, str]: - """Builds reference from the paired normal, or flat prior in absence of normal""" - input_files = {} - panel = self.parent.libraryKit.get(wildcards.library_name, None) - if panel is None: # WGS, target is access, no antitarget - input_files["target"] = f"work/{self.name}/out/access.bed" - else: # WES, both target & antitarget - input_files["target"] = f"work/{self.name}/out/{panel.name}_target.bed" - input_files["antitarget"] = f"work/{self.name}/out/{panel.name}_antitarget.bed" + def _get_input_files_antitarget(self, wildcards: Wildcards) -> dict[str, str]: + input_files = {"target": self.base_out.format(**wildcards) + "target.bed"} + if self.create_access: + input_files["access"] = self.base_out.format(**wildcards) + "access.bed" return input_files - def _get_input_files_flat_reference_panel(self, wildcards: Wildcards) -> typing.Dict[str, str]: - 
        return self._get_input_files_flat_reference_panel(wildcards)

+    def _get_args_antitarget(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        args = dict(input) | {
+            "avg-size": self.cfg.antitarget.avg_size,
+            "min-size": self.cfg.antitarget.min_size,
+        }
+        if "access" not in args:
+            args["access"] = self.cfg.path_access
+        return args

-    def _get_params_flat_reference_panel(self, wildcards: Wildcards) -> typing.Dict[str, str]:
-        return self._get_params_reference(wildcards)
+    # ----- Coverage ------------------------------------------------------------------------------

-    def _get_output_files_flat_reference_panel(self) -> typing.Dict[str, str]:
-        """TODO: flat prior reference should be library-independent"""
-        return {"reference": f"work/{{mapper}}.{self.name}/out/reference.{{panel_name}}.cnn"}
+    def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]:
+        """
+        Compute coverage of region (either target or antitarget)
+
+        Except when the region is provided as a file, the region is computed by the pipeline,
+        and must be included with the inputs (possibly from the panel_of_normals step).
+        For WGS paired, the target regions are sample-dependent, because the optimal
+        average target size is sample-dependent (via the rough normal sample coverage).
+        In that case, the target regions must be taken from the normal sample, to
+        avoid requesting to build targets from the tumor sample.
+        """
+        # BAM/BAI file
+        ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
+        base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards)
+        input_files = {"bam": ngs_mapping(base_path + ".bam")}
+
+        # Region (target or antitarget) file
+        if self.build_ref:
+            if self.compute_avg_target_size:
+                tpl = self.base_out_lib + "{region}.bed"
+                if wildcards["library_name"] in self.tumors:
+                    input_files["intervals"] = tpl.format(
+                        mapper=wildcards["mapper"],
+                        library_name=self.parent.matched_normal[wildcards["library_name"]],
+                        region=wildcards["region"],
+                    )
+                else:
+                    input_files["intervals"] = tpl.format(**wildcards)
+            else:
+                input_files["intervals"] = self.base_out.format(**wildcards) + "{region}.bed"
+        elif self.pon_source == PanelOfNormalsOrigin.COHORT:
+            panel_of_normals = self.parent.sub_workflows["panel_of_normals_cnvkit"]
+            base_path = "output/{mapper}.cnvkit/out/cnvkit.{region}.bed"
+            input_files["intervals"] = panel_of_normals(base_path)

-    def _get_input_files_flat_reference_wgs(self, wildcards: Wildcards) -> typing.Dict[str, str]:
-        return self._get_input_files_flat_reference_panel(wildcards)
+        return input_files

-    def _get_params_flat_reference_wgs(self, wildcards: Wildcards) -> typing.Dict[str, str]:
-        return self._get_params_reference(wildcards)
+    def _get_args_coverage(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        args = dict(input) | {
+            "reference": self.w_config.static_data_config.reference.path,
+            "min-mapq": self.cfg.coverage.min_mapq,
+            "count": self.cfg.coverage.count,
+        }
+        if "intervals" not in args:
+            intervals = self.cfg.panel_of_normals.get("path_{region}".format(**wildcards), "")
+            assert intervals != "", "Missing path to {region}".format(**wildcards)
+            args["intervals"] = intervals
+        return args

-    def _get_output_files_flat_reference_wgs(self) -> typing.Dict[str, str]:
-        """TODO: flat prior reference should be library-independent"""
-        return {"reference": f"work/{{mapper}}.{self.name}/out/reference.cnn"}
+    # ----- Reference (flat or pairwise) ----------------------------------------------------------

-    # ----- 
Fix ----------------------------------------------------------------------------------- + def _get_input_files_reference(self, wildcards: Wildcards) -> dict[str, str]: + """Builds reference from the paired normal, or flat prior in absence of normal""" + assert self.build_ref, "Should not build reference" + input_files = {} + if self.paired: + input_files["normals"] = [self.base_out_lib.format(**wildcards) + "target.cnn"] + if self.is_wes: + input_files["normals"].append( + self.base_out_lib.format(**wildcards) + "antitarget.cnn" + ) + elif self.pon_source == PanelOfNormalsOrigin.FLAT: + input_files["target"] = self.base_out.format(**wildcards) + "target.bed" + if self.is_wes: + input_files["antitarget"] = self.base_out.format(**wildcards) + "antitarget.bed" + return input_files - def _get_input_files_fix(self, wildcards: Wildcards) -> typing.Dict[str, str]: - # Coverage on targets - input_files = { - "target": f"work/{wildcards.mapper}.{self.name}.{wildcards.library_name}/out/target.cnn" + def _get_args_reference(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + assert self.build_ref, "Should not build reference" + args = dict(input) | { + "reference": self.w_config.static_data_config.reference.path, + "cluster": self.cfg.cluster, + "no-gc": not self.cfg.gc, + "no-rmask": not self.cfg.rmask, + "no-edge": not self.cfg.get("edge", self.is_wes), } - # Coverage on antitargets when present (absent for WGS) - panel = self.parent.libraryKit.get(wildcards.library_name, None) - if panel is not None: # WGS - no antitarget - input_files["antitarget"] = ( - f"work/{wildcards.mapper}.{self.name}.{wildcards.library_name}/out/antitarget.cnn" - ) - # Get reference from panel of normals if available, otherwise from normal or flat when no normal - if not self.config[self.name]["panel_of_normals"]["enabled"]: # Paired normal or flat - normal_library = self.parent.normal_library.get(wildcards.library_name, None) - if normal_library: - input_files["reference"] = ( - f"work/{{mapper}}.{self.name}.{normal_library}/out/reference.cnn" - ) + if self.cfg.cluster: + args["min-cluster-size"] = self.cfg.min_cluster_size + if self.cfg.diploid_parx_genome: + args["diploid-parx-genome"] = self.cfg.diploid_parx_genome + sample_sex = self._get_sample_sex(wildcards.get("library_name", None)) + if sample_sex is not None: + args["sample-sex"] = str(sample_sex) + if sample_sex == SexValue.MALE and self.paired: + args["male-reference"] = True else: - if panel: - input_files["reference"] = ( - f"work/{{mapper}}.{self.name}/out/reference.{panel.name}.cnn" - ) - else: - input_files["reference"] = f"work/{{mapper}}.{self.name}/out/reference.cnn" - elif ( - self.config[self.name]["panel_of_normals"]["origin"] - == PanelOfNormalsOrigin.PREVIOUS_STEP - ): # Panel_of_normals step - input_files["reference"] = self.parent._get_panel_of_normals_path(self.name, panel) + args["male-reference"] = self.cfg.male_reference else: - input_files["reference"] = self.config[self.name]["panel_of_normals"][ - "path_panel_of_normals" - ] + args["male-reference"] = self.cfg.male_reference + return args + + # ----- Fix ----------------------------------------------------------------------------------- + + def _get_input_files_fix(self, wildcards: Wildcards) -> dict[str, str]: + # Coverage on targets & optionally on antitargets + input_files = {"target": self.base_out_lib.format(**wildcards) + "target.cnn"} + if self.is_wes: + input_files["antitarget"] = self.base_out_lib.format(**wildcards) + "antitarget.cnn" + if self.paired: + tpl = 
"{mapper}.cnvkit.{normal_library}".format( + mapper=wildcards["mapper"], + normal_library=self.parent.matched_normal[wildcards["library_name"]], + ) + input_files["reference"] = os.path.join("work", tpl, "out", tpl + ".reference.cnn") + elif self.pon_source == PanelOfNormalsOrigin.FLAT: + input_files["reference"] = self.base_out.format(**wildcards) + "reference.cnn" + elif self.pon_source == PanelOfNormalsOrigin.COHORT: + panel_of_normals = self.parent.sub_workflows["panel_of_normals_cnvkit"] + base_path = "output/{mapper}.cnvkit/out/cnvkit.panel_of_normals.cnn" + input_files["reference"] = panel_of_normals(base_path) return input_files - def _get_params_fix(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "sample_id": wildcards.library_name, - "cluster": self.config[self.name]["fix"]["cluster"], - "no_gc": self.config[self.name]["fix"]["no_gc"], - "no_edge": self.config[self.name]["fix"]["no_edge"], - "no_rmask": self.config[self.name]["fix"]["no_rmask"], + def _get_args_fix(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "cluster": self.cfg.cluster, + "no-gc": not self.cfg.gc, + "no-rmask": not self.cfg.rmask, + "no-edge": not self.cfg.get("edge", self.is_wes), } - - def _get_output_files_fix(self) -> typing.Dict[str, str]: - base_name = f"{{mapper}}.{self.name}.{{library_name}}" - return {"coverage": os.path.join("work", base_name, "out", base_name + ".cnr")} + args["sample-id"] = wildcards.library_name + if self.cfg.diploid_parx_genome: + args["diploid-parx-genome"] = self.cfg.diploid_parx_genome + if "reference" not in args: + args["reference"] = self.cfg.panel_of_normals.path_panel_of_normals + return args + + # ----- Variant-related convenience functions ------------------------------------------------- + + def _variants_from_cohort_input(self) -> str: + variants = self.parent.sub_workflows["somatic_variant_calling_cnvkit"] + tpl = f"{{mapper}}.{self.cfg.somatic_variant_calling.tool}.{{library_name}}" + base_path = os.path.join("output", tpl, "out", tpl + ".vcf.gz") + return variants(base_path) + + def _variants_args(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "min-variant-depth": self.cfg.somatic_variant_calling.min_variant_depth, + "sample-id": wildcards.library_name, + "normal-id": self.parent.matched_normal[wildcards.library_name], + } + if self.cfg.somatic_variant_calling.zygocity_freq is not None: + args["zygicity-freq"] = self.cfg.somatic_variant_calling.zygocity_freq + return args # ----- Segment ------------------------------------------------------------------------------- - def _get_input_files_segment(self, wildcards: Wildcards) -> typing.Dict[str, str]: + def _get_input_files_segment(self, wildcards: Wildcards) -> dict[str, str]: # Coverage - base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" - input_files = {"coverage": f"work/{base_name}/out/{base_name}.cnr"} - # Segmentation using SNVs if requested and available (normal must be present) - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + input_files = {"ratios": self.base_out_lib.format(**wildcards) + "cnr"} + # Segmentation using SNVs from cohort + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) return input_files - def _get_params_segment(self, wildcards: Wildcards) -> 
typing.Dict[str, str]: + def _get_args_segment(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: # Segmentation parameters - params = { - "method": self.config[self.name]["segment"]["method"], - "threshold": self.config[self.name]["segment"]["threshold"], - "drop_low_coverage": self.config[self.name]["segment"]["drop_low_coverage"], - "drop_outliers": self.config[self.name]["segment"]["drop_outliers"], - } - if self.config[self.name]["segment"]["method"] == "cbs": - params["smooth_cbs"] = self.config[self.name]["segment"]["smooth_cbs"] - params["processes"] = self.default_resource_usage.threads - # Normal & tumor sample ids if SNVs - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - params["sample_id"] = wildcards.library_name - params["normal_id"] = self.normal_library[wildcards.library_name] - params["min_variant_depth"] = self.config[self.name]["segment"]["min_variant_depth"] - params["zygocity_freq"] = self.config[self.name]["segment"]["zygocity_freq"] - return params - - def _get_output_files_segment(self) -> typing.Dict[str, str]: - base_name = f"{{mapper}}.{self.name}.{{library_name}}" - return { - "segments": os.path.join("work", base_name, "out", base_name + ".segments.cns"), - "dataframe": os.path.join("work", base_name, "out", "dataframe.rds"), + args = dict(input) | { + "method": self.cfg.segment.method, + "threshold": self.cfg.segment.threshold, + "drop-outliers": self.cfg.segment.drop_outliers, + "drop-low-coverage": self.cfg.drop_low_coverage, } + if self.cfg.segment.method == CnvkitSegmentationMethod.CBS: + args["smooth-cbs"] = self.cfg.segment.smooth_cbs + if self.cfg.somatic_variant_calling.enabled: + args |= self._variants_args(wildcards, input) + if "variants" not in args: + args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling + return args # ----- Call ---------------------------------------------------------------------------------- - def _get_input_files_call(self, wildcards: Wildcards) -> typing.Dict[str, str]: + def _get_input_files_call(self, wildcards: Wildcards) -> dict[str, str]: # Segmentation - base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" - input_files = {"segments": f"work/{base_name}/out/{base_name}.segments.cns"} - # SNVs if requested and available (normal must be present) - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" - # Purity from the tool if requested and not from the samplesheet + input_files = {"segments": self.base_out_lib.format(**wildcards) + "segments.cns"} + # Segmentation using SNVs from cohort + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) + # Purity from the tool if ( - self.config[self.name]["purity"]["enabled"] and self.config[self.name]["purity"]["tool"] - ): # Need purity, and can use tool to obain it - if ( - self.config[self.name]["purity"]["ignore_samplesheet"] - or wildcards.library_name not in self.parent.purity - ): - # Don't use samplesheet - input_files["purity"] = ( - f"work/{base_name}/out/{wildcards.mapper}.{self.config.purity.tool}.txt" - ) + self.cfg.somatic_purity_ploidy_estimate.enabled + and self.cfg.somatic_purity_ploidy_estimate.source == PurityOrigin.COHORT + ): + purity = self.parent.sub_workflows["somatic_purity_ploidy_estimate_cnvkit"] + tpl = 
f"{{mapper}}.{self.cfg.somatic_purity_ploidy_estimate.tool}.{{library_name}}" + base_path = os.path.join("output", tpl, "out", tpl + ".txt") + input_files["purity_file"] = purity(base_path).format(**wildcards) return input_files - def _get_params_call(self, wildcards: Wildcards) -> typing.Dict[str, str]: + def _get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: # Call parameters - params = { - "method": self.config[self.name]["call"]["method"], - "thresholds": self.config[self.name]["call"]["thresholds"], - "filter": self.config[self.name]["call"]["filter"], - "drop_low_coverage": self.config[self.name]["call"]["drop_low_coverage"], - "male_reference": self.config[self.name]["use_male_reference"], + args = dict(input) | { + "method": self.cfg.call.method, + "thresholds": self.cfg.call.thresholds, + "drop-low-coverage": self.cfg.drop_low_coverage, + "male-reference": self.cfg.male_reference, } - # If center_at defined, use it, otherwise use the center method - center = self.config[self.name]["call"].get("center_at", None) - if center is not None: - params["center_at"] = center + if self.cfg.call.center_at is not None: + args["center-at"] = self.cfg.call.center_at else: - params["center"] = self.config[self.name]["call"].get("center", "None") - # Normal & tumor sample ids if SNVs - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - params["sample_id"] = wildcards.library_name - params["normal_id"] = self.normal_library[wildcards.library_name] + if self.cfg.call.center is not None: + args["center"] = self.cfg.call.center + if self.cfg.diploid_parx_genome: + args["diploid-parx-genome"] = self.cfg.diploid_parx_genome + if self.cfg.somatic_variant_calling.enabled: + args |= self._variants_args(wildcards, input) + if "variants" not in args: + args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling # Sample sex if known, otherwise guessed by the tool sample_sex = self._get_sample_sex(wildcards.library_name) - if sample_sex == Sex.MALE or sample_sex == Sex.FEMALE: - params["sample_sex"] = sample_sex - # If requested, purity from samplesheet or from default if no tool - if self.config[self.name]["purity"]["enabled"]: - purity = self.parent.purity.get( - wildcards.library_name, self.config.purity.default_purity - ) - if purity is not None and not self.config[self.name]["purity"]["ignore_samplesheet"]: - params["purity"] = purity - if self.config.default_ploidy: - params["ploidy"] = self.config.default_ploidy - return params - - def _get_output_files_call(self) -> typing.Dict[str, str]: - base_name = f"{{mapper}}.{self.name}.{{library_name}}" - return {"calls": os.path.join("work", base_name, "out", base_name + ".cns")} + if sample_sex is not None: + args["sample-sex"] = str(sample_sex) + if sample_sex == SexValue.MALE and self.paired: + args["male-reference"] = True + # If requested, purity from samplesheet or from default + if self.cfg.somatic_purity_ploidy_estimate.enabled: + if args.get("purity_file", None) is not None: + (purity, ploidy) = self._read_purity_ploidy_output(args["purity_file"]) + elif self.cfg.somatic_purity_ploidy_estimate.source == PurityOrigin.SAMPLESHEET: + purity = self.tumors[wildcards.library_name].purity + ploidy = self.tumors[wildcards.library_name].ploidy + elif self.cfg.somatic_purity_ploidy_estimate.source == PurityOrigin.CONFIG: + purity = self.cfg.purity.purity + ploidy = self.cfg.purity.ploidy + args["purity"] = purity + args["ploidy"] = ploidy 
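+        # Purity/ploidy precedence, as implemented above: output of the estimation
+        # tool first, then samplesheet values, then fixed values from the config.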
+ return args # ----- Bintest ------------------------------------------------------------------------------- - def _get_input_files_bintest(self, wildcards: Wildcards) -> typing.Dict[str, str]: - base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + def _get_input_files_bintest(self, wildcards: Wildcards) -> dict[str, str]: return { - "coverage": f"work/{base_name}/out/{base_name}.cnr", - "segments": f"work/{base_name}/out/{base_name}.segments.cns", + "ratios": self.base_out_lib.format(**wildcards) + "cnr", + "segments": self.base_out_lib.format(**wildcards) + "segments.cns", } - def _get_params_bintest(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "alpha": self.config[self.name]["bintest"]["alpha"], - "target": self.config[self.name]["bintest"]["target"], + def _get_args_bintest(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + return dict(input) | { + "alpha": self.cfg.bintest.alpha, + "target": self.cfg.bintest.target, } - def _get_output_files_bintest(self) -> typing.Dict[str, str]: - base_name = f"{{mapper}}.{self.name}.{{library_name}}" - return {"coverage": os.path.join("work", base_name, "out", base_name + ".bintest.cnr")} - # ----- Plots -------------------------------------------------------------------------------- - def _get_input_files_plot_diagram(self, wildcards: Wildcards) -> typing.Dict[str, str]: - base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + def _get_input_files_scatter(self, wildcards: Wildcards) -> dict[str, str]: input_files = { - "coverage": f"work/{base_name}/out/{base_name}.cnr", - "segments": f"work/{base_name}/out/{base_name}.segments.cns", + "ratios": self.base_out_lib.format(**wildcards) + "cnr", + "segments": self.base_out_lib.format(**wildcards) + "segments.cns", } - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) return input_files - def _get_params_plot_diagram(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "threshold": self.config[self.name]["plots"]["diagram"]["threshold"], - "min_probes": self.config[self.name]["plots"]["diagram"]["min_probes"], - "no_shift_xy": self.config[self.name]["plots"]["diagram"]["no_shift_xy"], - } - - def _get_output_files_plot_diagram(self) -> typing.Dict[str, str]: - base_name = f"{{mapper}}.{self.name}.{{library_name}}" - return {"figure": os.path.join("work", base_name, "plot", "diagram", "{contig_name}.pdf")} - - def _get_input_files_plot_scatter(self, wildcards: Wildcards) -> typing.Dict[str, str]: - base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" - input_files = { - "coverage": f"work/{base_name}/out/{base_name}.cnr", - "segments": f"work/{base_name}/out/{base_name}.segments.cns", + def _get_args_scatter(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "antitarget-marker": self.cfg.scatter.antitarget_marker, + "by-bin": self.cfg.scatter.by_bin, + "segment-color": self.cfg.scatter.segment_color, + "trend": self.cfg.scatter.trend, + "fig-size": self.cfg.scatter.fig_size, + "width": self.cfg.scatter.width, } - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" 
-        return input_files
-
-    def _get_params_plot_scatter(self, wildcards: Wildcards) -> typing.Dict[str, str]:
-        params = {
-            "chromosome": wildcards.contig_name,
-            "antitarget_marker": self.config[self.name]["plots"]["scatter"]["antitarget_marker"],
-            "by_bin": self.config[self.name]["plots"]["scatter"]["by_bin"],
-            "segment_color": self.config[self.name]["plots"]["scatter"]["segment_color"],
-            "trend": self.config[self.name]["plots"]["scatter"]["trend"],
-            "y_max": self.config[self.name]["plots"]["scatter"]["y_max"],
-            "y_min": self.config[self.name]["plots"]["scatter"]["y_min"],
-            "fig_size": self.config[self.name]["plots"]["scatter"]["fig_size"],
-            "sample_id": wildcards.library_name,
-        }
-        variants = self.config[self.name].get("variants", None)
-        if variants and wildcards.library_name in self.normal_library:
-            params["normal_id"] = self.normal_library[wildcards.library_name]
-            params["min_variant_depth"] = self.config[self.name]["plots"]["scatter"][
-                "min_variant_depth"
-            ]
-            params["zygocity_freq"] = self.config[self.name]["plots"]["scatter"]["zygocity_freq"]
-        return params
-
-    def _get_output_files_plot_scatter(self) -> typing.Dict[str, str]:
-        base_name = f"{{mapper}}.{self.name}.{{library_name}}"
-        return {"figure": os.path.join("work", base_name, "plot", "scatter", "{contig_name}.pdf")}
+        if self.cfg.scatter.y_min is not None:
+            args["y-min"] = self.cfg.scatter.y_min
+        if self.cfg.scatter.y_max is not None:
+            args["y-max"] = self.cfg.scatter.y_max
+        if wildcards["contig_name"] != "all":
+            args["chromosome"] = wildcards["contig_name"]
+        if self.cfg.somatic_variant_calling.enabled:
+            args |= self._variants_args(wildcards, input)
+            if "variants" not in args:
+                args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling
+        args["title"] = f"{wildcards['library_name']} - {wildcards['contig_name']}"
+        return args

     # ----- Metrics (metrics & segmetrics) --------------------------------------------------------

-    def _get_input_files_report_metrics(self, wildcards: Wildcards) -> typing.Dict[str, str]:
-        base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}"
+    def _get_input_files_metrics(self, wildcards: Wildcards) -> dict[str, str]:
         return {
-            "coverage": f"work/{base_name}/out/{base_name}.cnr",
-            "segments": f"work/{base_name}/out/{base_name}.segments.cns",
+            "ratios": self.base_out_lib.format(**wildcards) + "cnr",
+            "segments": self.base_out_lib.format(**wildcards) + "segments.cns",
         }

-    def _get_params_report_metrics(self, wildcards: Wildcards) -> typing.Dict[str, str]:
-        return {"drop_low_coverage": self.config[self.name]["reports"]["drop_low_coverage"]}
+    def _get_args_metrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        return dict(input) | {"drop-low-coverage": self.cfg.drop_low_coverage}

-    def _get_output_files_report_metrics(self) -> typing.Dict[str, str]:
-        base_name = f"{{mapper}}.{self.name}.{{library_name}}"
-        return {"report": os.path.join("work", base_name, "report", base_name + ".metrics.tsv")}
-
-    def _get_input_files_report_segmetrics(self, wildcards: Wildcards) -> typing.Dict[str, str]:
-        base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}"
+    def _get_input_files_segmetrics(self, wildcards: Wildcards) -> dict[str, str]:
         return {
-            "coverage": f"work/{base_name}/out/{base_name}.cnr",
-            "segments": f"work/{base_name}/out/{base_name}.segments.cns",
+            "ratios": self.base_out_lib.format(**wildcards) + "cnr",
+            "segments": self.base_out_lib.format(**wildcards) + "segments.cns",
+        }
+
+    def 
_get_args_segmetrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + return dict(input) | { + "drop-low-coverage": self.cfg.drop_low_coverage, + "alpha": self.cfg.segmetrics.alpha, + "bootstrap": self.cfg.segmetrics.bootstrap, + "smooth-bootstrap": self.cfg.segmetrics.smooth_bootstrap, + "stats": self.cfg.segmetrics.stats, } - def _get_params_report_segmetrics(self, wildcards: Wildcards) -> typing.Dict[str, str]: + def _get_input_files_genemetrics(self, wildcards: Wildcards) -> dict[str, str]: return { - "drop_low_coverage": self.config[self.name]["reports"]["drop_low_coverage"], - "stats": ( - "mean", - "median", - "mode", - "t-test", - "stdev", - "sem", - "mad", - "mse", - "iqr", - "bivar", - "ci", - "pi", - ), - "alpha": self.config[self.name]["reports"]["alpha"], - "bootstrap": self.config[self.name]["reports"]["bootstrap"], + "ratios": self.base_out_lib.format(**wildcards) + "cnr", + "segments": self.base_out_lib.format(**wildcards) + "segments.cns", } - def _get_output_files_report_segmetrics(self) -> typing.Dict[str, str]: - base_name = f"{{mapper}}.{self.name}.{{library_name}}" - return {"report": os.path.join("work", base_name, "report", base_name + ".segmetrics.tsv")} + def _get_args_genemetrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "drop-low-coverage": self.cfg.drop_low_coverage, + "male-reference": self.cfg.male_reference, + "threshold": self.cfg.genemetrics.threshold, + "min-probes": self.cfg.genemetrics.min_probes, + "alpha": self.cfg.genemetrics.alpha, + "bootstrap": self.cfg.genemetrics.bootstrap, + "stats": [x.replace("t-test", "ttest") for x in self.cfg.genemetrics.stats], + } + if self.cfg.diploid_parx_genome: + args["diploid-parx-genome"] = self.cfg.diploid_parx_genome + sample_sex = self._get_sample_sex(wildcards.library_name) + if sample_sex is not None: + args["sample-sex"] = str(sample_sex) + if sample_sex == SexValue.MALE and self.paired: + args["male-reference"] = True + return args + + # ----- Read small files to put values in parameters + + def _read_autobin_output(self, filename: str) -> int: + nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)" + pattern = re.compile("^Target:[ \t]+" + nb + "[ \t]+" + nb + "$") + with open(filename) as f: + for line in f: + m = pattern.match(line) + if m: + return int(float(m.groups()[4])) + return -1 + + def _read_purity_ploidy_output(self, filename: str) -> tuple[float, float]: + # TODO: Tool-dependent parsing of purity/ploidy file + nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)" + pattern = re.compile("^Purity/ploidy:[ \t]+" + nb + "[ \t]+" + nb + "$") + with open(filename) as f: + for line in f: + m = pattern.match(line) + if m: + return (float(m.groups()[1]), float(m.groups()[4])) + return (-1.0, -1.0) + + +class LibraryInfo(NamedTuple): + library: NGSLibrary + donor: str + is_tumor: bool + libraryType: str + libraryKit: str | None + sex: Sex | None + purity: float | None + ploidy: float = 2 class SomaticCnvCallingWorkflow(BaseStep): @@ -800,6 +1045,21 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_model_class=SomaticCnvCallingConfigModel, previous_steps=(NgsMappingWorkflow,), ) + # Collect extra information per library + self.valid_dna_libraries = {} + for sheet in self.shortcut_sheets: + self.valid_dna_libraries |= SomaticCnvCallingWorkflow._get_dna_libraries(sheet) + + # All tumor samples, by libraryKit, with None for WGS + self.tumors = SomaticCnvCallingWorkflow._split_by( + 
+            SomaticCnvCallingWorkflow._filter_by(
+                self.valid_dna_libraries.values(), "is_tumor", lambda x: x
+            ),
+            "libraryKit",
+        )
+
+        self.matched_normal = self._match_normals()
+
         # Register sub step classes so the sub steps are available
         self.register_sub_step_classes(
             (
@@ -812,194 +1072,153 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
         )
         # Initialize sub-workflows
         self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping)
-        self.registered_pons = self._optionally_register_pon()
-
-        # Collect extra information per library
-        self.normal_library = self._get_normal_library()
-        self.libraryKit = self._get_panel_information()
-        self.sex = self._get_sex()
-        self.purity = self._get_purity()
+        for subworkflow in (
+            "panel_of_normals",
+            "somatic_variant_calling",
+            "somatic_purity_ploidy_estimate",
+        ):
+            self._optionally_register_subworkflow(subworkflow)

     def get_result_files(self) -> OutputFiles:
         fns = []
-        for seq_type, tools in self.config.tools:
-            for library in self._get_libraries():
-                if library.extra_infos.get("libraryType", "").lower() != seq_type:
-                    continue
-                test_sample = library.test_sample
-                if test_sample.extra_infos.get("extractionType", "") != "DNA":
-                    continue
-                bio_sample = test_sample.bio_sample
-                is_tumor = bio_sample.extra_infos.get("isTumor", True)
-                if is_tumor:
-                    for tool in tools:
-                        f = self.substep_getattr(tool, "get_result_files")
-                        for mapper in self.w_config.step_config["ngs_mapping"]["tools"]["dna"]:
-                            for fn in f(library.name, mapper):
-                                fns.append(fn)
+
+        for tool in self.config.tools.wgs:
+            for mapper in self.w_config.step_config["ngs_mapping"].tools.dna:
+                for library in self.tumors.get(None, []):
+                    fns += self.sub_steps.get(tool).get_result_files(library.library.name, mapper)
+
+        for tool in self.config.tools.wes:
+            for mapper in self.w_config.step_config["ngs_mapping"].tools.dna:
+                for libraryKit in self.tumors.keys():
+                    if libraryKit is None:
+                        continue
+                    for library in self.tumors.get(libraryKit):
+                        fns += self.sub_steps.get(tool).get_result_files(
+                            library.library.name, mapper
+                        )
+
         return OutputFiles(fns)

-    def _get_libraries(self) -> typing.Iterator[NGSLibrary]:
-        for sheet in self.shortcut_sheets:
-            for donor in sheet.sheet.bio_entities.values():
-                for bio_sample in donor.bio_samples.values():
-                    for test_sample in bio_sample.test_samples.values():
-                        for library in test_sample.ngs_libraries.values():
-                            yield library
-
-    def _get_normal_library(self) -> typing.Dict[str, str]:
-        normal_for_donor = {}
-        for library in self._get_libraries():
-            test_sample = library.test_sample
-            if test_sample.extra_infos.get("extractionType", "") != "DNA":
-                continue
-            bio_sample = test_sample.bio_sample
-            is_tumor = bio_sample.extra_infos.get("isTumor", None)
-            if is_tumor is None:
-                raise ValueError(f"Missing 'isTumor' value for library '{library.name}'")
-            if is_tumor:
-                continue
-            donor = bio_sample.bio_entity
-            if donor.name in normal_for_donor:
-                raise ValueError(f"Multiple normals for donor '{donor.name}'")
-            normal_for_donor[donor.name] = library.name
-
-        normal_library = {}
-        for library in self._get_libraries():
-            test_sample = library.test_sample
-            if test_sample.extra_infos.get("extractionType", "") != "DNA":
-                continue
-            bio_sample = test_sample.bio_sample
-            donor = bio_sample.bio_entity
-            if bio_sample.extra_infos.get("isTumor", True):
-                normal_library[library.name] = normal_for_donor[donor.name]
-        return normal_library
-
-    def _optionally_register_pon(self) -> typing.Dict[str, str]:
-        """
-        Register all possible combination of panel of normals:
-        - WGS PON for all configured WGS tools which require/can use it
-        - WES PON for all configured WES tools which require/can use it, one for each enrichment kit
+    def _match_normals(self):
+        normals = SomaticCnvCallingWorkflow._split_by(
+            SomaticCnvCallingWorkflow._filter_by(
+                self.valid_dna_libraries.values(), "is_tumor", lambda x: not x
+            ),
+            "libraryKit",
+        )

-        Note that there is no need to specify the genome release,
-        because the panel_of_normals step used here MUST be in the same project,
-        so it has the same configuration, and only one genome release is allowed per configuration.
-        """
-        registered_pons = list()
-        for tool in self.config.tools.wgs:
-            pon_name = f"wgs.{tool}"
-            if pon_name in registered_pons:
-                continue
-            if self.config[tool].get("panel_of_normals", None) and self.config[
-                tool
-            ].panel_of_normals.get("path_panel_of_normals_step", None):
+        # Pairing between tumor & normals (must share the same libraryKit)
+        matched_normal = {
+            sample.library.name: None for samples in self.tumors.values() for sample in samples
+        }
+        for libraryKit, samples in self.tumors.items():
+            if libraryKit in normals:
+                normals_by_donor = SomaticCnvCallingWorkflow._split_by(normals[libraryKit], "donor")
+                for sample in samples:
+                    donor = sample.donor
+                    normal = normals_by_donor.get(donor, [])
+                    assert (
+                        len(normal) < 2
+                    ), f"Multiple valid donor samples for tumor library {sample.library.name}"
+                    if normal:
+                        normal_library = normal[0].library
+                        matched_normal[sample.library.name] = normal_library.name
+        return matched_normal
+
+    def _optionally_register_subworkflow(self, subworkflow):
+        for tool in set(self.config.tools.wgs + self.config.tools.wes):
+            assert self.config.get(tool) is not None, f"Requested tool '{tool}' not configured"
+            cfg = self.config.get(tool)
+            subworkflow_config = cfg.get(subworkflow)
+            if (
+                subworkflow_config
+                and subworkflow_config.enabled
+                and str(subworkflow_config.source) == "cohort"
+            ):
                 self.register_sub_workflow(
-                    "panel_of_normals",
-                    self.config[tool].panel_of_normals.path_panel_of_normals_step,
-                    pon_name,
+                    subworkflow,
+                    subworkflow_config.get(f"path_{subworkflow}"),
+                    f"{subworkflow}_{tool}",
                 )
-            registered_pons.append(pon_name)
-        for tool in self.config.tools.wes:
-            for panel in self.config.path_target_interval_list_mapping:
-                pon_name = f"wes.{tool}.{panel.name}"
-                if pon_name in registered_pons:
-                    continue
-                if self.config[tool].get("panel_of_normals", None) and self.config[
-                    tool
-                ].panel_of_normals.get("path_panel_of_normals_step", None):
-                    self.register_sub_workflow(
-                        "panel_of_normals",
-                        self.config[tool].panel_of_normals.path_panel_of_normals_step,
-                        pon_name,
-                    )
-                    registered_pons.append(pon_name)
-        return registered_pons

-    def _get_panel_information(self) -> typing.Dict[str, str]:
-        # Set default panel
-        default = None
-        for panel in self.config.path_target_interval_list_mapping:
-            if panel.name == "__default__":
-                default = panel
-                break
-
-        # Extract library pattern (the "libraryKit" column in samplesheet)
-        # On output:
-        #   - the panel name and panel path if libraryKit is present & known
-        #   - the default panel path if libraryKit is undefined or not found
-        #   - None for WGS
-        #   - ValueError if libraryType is missing or unknown (not WES nor WGS)
-        libraryKit = {}
-        for library in self._get_libraries():
-            test_sample = library.test_sample
-            if test_sample.extra_infos.get("extractionType", "") != "DNA":
-                continue
-
-            libraryType = library.extra_infos.get("libraryType", None)
-            if libraryType is None:
-                raise ValueError(f"Missing library type for library '{library.name}'")
-            elif libraryType == "WES":
-                if library.extra_infos.get("libraryKit", None):
-                    for panel in self.config.path_target_interval_list_mapping:
-                        if re.match(panel.pattern, library.extra_infos.get("libraryKit")):
-                            libraryKit[library.name] = panel
-                            break
-                    if library.name not in libraryKit:
-                        libraryKit[library.name] = default
+    @staticmethod
+    def _get_dna_libraries(sheet) -> dict[str, LibraryInfo]:
+        allowed_library_types = [
+            k for k, v in LIBRARY_TO_EXTRACTION.items() if v == EXTRACTION_TYPE_DNA
+        ]
+
+        valid_dna_libraries = {}
+        for donor in sheet.sheet.bio_entities.values():
+            sex: SexValue = donor.extra_infos.get("sex", None)
+            for bio_sample in donor.bio_samples.values():
+                is_tumor = bio_sample.extra_infos.get("isTumor", None)
+                assert (
+                    is_tumor is not None
+                ), f"Missing 'isTumor' value for sample '{donor.name}-{bio_sample.name}'"
+                if is_tumor:
+                    purity = bio_sample.extra_infos.get("purity", None)
+                    ploidy = bio_sample.extra_infos.get("ploidy", 2)
                 else:
-                    libraryKit[library.name] = default
-                    if libraryKit[library.name] is None:
-                        raise ValueError(f"Undefined panel for library '{library.name}")
-            elif libraryType == "WGS":
-                libraryKit[library.name] = None
-            else:
-                raise ValueError(
-                    f"Unknown library type '{libraryType}' for library '{library.name}'"
-                )
+                    purity = None
+                    ploidy = 2
+                for test_sample in bio_sample.test_samples.values():
+                    if (
+                        test_sample.extra_infos.get("extractionType", "").upper()
+                        != EXTRACTION_TYPE_DNA
+                    ):
+                        continue
+                    for library in test_sample.ngs_libraries.values():
+                        assert (
+                            library.name not in valid_dna_libraries
+                        ), f"Duplicate entry for library {library.name}"
+                        libraryType = library.extra_infos.get("libraryType", None)
+                        assert (
+                            libraryType is not None
+                        ), f"Missing library type for library '{library.name}'"
+                        if libraryType.upper() not in allowed_library_types:
+                            continue
+                        libraryKit = None
+                        if libraryType.upper() in ("WES", "PANEL"):
+                            libraryKit = library.extra_infos.get("libraryKit", None)
+                            assert (
+                                libraryKit is not None
+                            ), f"Missing library kit for library '{library.name}'"
+                        valid_dna_libraries[library.name] = LibraryInfo(
+                            library,
+                            donor.name,
+                            is_tumor,
+                            libraryType,
+                            libraryKit,
+                            sex,
+                            purity,
+                            ploidy,
+                        )
+
+        return valid_dna_libraries

-        return libraryKit
-
-    def _get_purity(self) -> typing.Dict[str, str]:
-        """Returns the purity value from the 'purity' library extra_infos.
Missing otherwise""" - purity = {} - for library in self._get_libraries(): - p = library.extra_infos.get("purity", None) - if p: - try: - p = float(p) - if 0 <= p and p <= 1: - purity[library.name] = p - except: - pass - return purity - - def _get_sex(self) -> typing.Dict[str, Sex]: - sex = {} - for library in self._get_libraries(): - donor = library.test_sample.bio_sample.bio_entity - donor_sex = donor.extra_infos.get("sex", None) - if donor_sex == "male": - donor_sex = Sex.MALE - elif donor_sex == "female": - donor_sex = Sex.FEMALE - else: - donor_sex = Sex.UNKNOWN - sex[library.name] = donor_sex - return sex - - def _get_panel_of_normals_path(self, tool: str, panel: LibraryKitDefinition | None) -> str: - pon_path = None - assert self.config[tool]["panel_of_normals"][ - "enabled" - ], f"Panel of normals not enabled for '{tool}'" - assert ( - self.config[tool]["panel_of_normals"]["origin"] == PanelOfNormalsOrigin.PREVIOUS_STEP - ), f"'{tool}' panel of normals not from previous step" - if panel is None: - pon_id = f"wgs.{tool}" - else: - pon_id = f"wes.{tool}.{panel.name}" - assert pon_id in self.registered_pons, f"Requested panel '{pon_id}' not registered" - pon = self.parent.sub_workflows[pon_id] - pon_path = pon(f"output/{{mapper}}.{tool}/out/{panel.name}.ext") - return pon_path + @staticmethod + def _split_by( + valid_dna_libraries: list[LibraryInfo], i: str = "library" + ) -> dict[Any, list[LibraryInfo]]: + split = {} + for entry in valid_dna_libraries: + index = getattr(entry, i) + if isinstance(index, (int, float, complex, bool)): + index = str(index) + if index not in split: + split[index] = [] + split[index].append(entry) + return split + + @staticmethod + def _filter_by( + valid_dna_libraries: list[LibraryInfo], + i: str = "library", + f: Callable[[Any], bool] = lambda x: True, + ) -> list[LibraryInfo]: + filtered = [] + for entry in valid_dna_libraries: + index = getattr(entry, i) + if f(index): + filtered.append(entry) + return filtered diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules index 8ec5d13fe..cfaf75206 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules +++ b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules @@ -1,8 +1,8 @@ -rule somatic_targeted_seq_cnv_calling_cnvkit_access: +rule somatic_cnv_calling_cnvkit_access: output: **wf.get_output_files("cnvkit", "access"), params: - wf.get_params("cnvkit", "access"), + **{"args": wf.get_args("cnvkit", "access")}, log: **wf.get_log_file("cnvkit", "access"), threads: wf.get_resource("cnvkit", "access", "threads") @@ -15,11 +15,30 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_access: wf.wrapper_path("cnvkit/access") -rule somatic_targeted_seq_cnv_calling_cnvkit_target: +rule somatic_cnv_calling_cnvkit_autobin: + input: + unpack(wf.get_input_files("cnvkit", "autobin")), + output: + **wf.get_output_files("cnvkit", "autobin"), + params: + **{"args": wf.get_args("cnvkit", "autobin")}, + log: + **wf.get_log_file("cnvkit", "autobin"), + threads: wf.get_resource("cnvkit", "autobin", "threads") + resources: + time=wf.get_resource("cnvkit", "autobin", "time"), + memory=wf.get_resource("cnvkit", "autobin", "memory"), + partition=wf.get_resource("cnvkit", "autobin", "partition"), + tmpdir=wf.get_resource("cnvkit", "autobin", "tmpdir"), + wrapper: + wf.wrapper_path("cnvkit/autobin") + + +rule somatic_cnv_calling_cnvkit_target: input: unpack(wf.get_input_files("cnvkit", "target")), params: - wf.get_params("cnvkit", "target"), + 
**{"args": wf.get_args("cnvkit", "target")}, output: **wf.get_output_files("cnvkit", "target"), log: @@ -34,11 +53,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_target: wf.wrapper_path("cnvkit/target") -rule somatic_targeted_seq_cnv_calling_cnvkit_antitarget: +rule somatic_cnv_calling_cnvkit_antitarget: input: unpack(wf.get_input_files("cnvkit", "antitarget")), params: - wf.get_params("cnvkit", "antitarget"), + **{"args": wf.get_args("cnvkit", "antitarget")}, output: **wf.get_output_files("cnvkit", "antitarget"), log: @@ -53,11 +72,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_antitarget: wf.wrapper_path("cnvkit/antitarget") -rule somatic_targeted_seq_cnv_calling_cnvkit_coverage: +rule somatic_cnv_calling_cnvkit_coverage: input: unpack(wf.get_input_files("cnvkit", "coverage")), params: - wf.get_params("cnvkit", "coverage"), + **{"args": wf.get_args("cnvkit", "coverage")}, output: **wf.get_output_files("cnvkit", "coverage"), log: @@ -72,11 +91,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_coverage: wf.wrapper_path("cnvkit/coverage") -rule somatic_targeted_seq_cnv_calling_cnvkit_reference: +rule somatic_cnv_calling_cnvkit_reference: input: unpack(wf.get_input_files("cnvkit", "reference")), params: - wf.get_params("cnvkit", "reference"), + **{"args": wf.get_args("cnvkit", "reference")}, output: **wf.get_output_files("cnvkit", "reference"), log: @@ -91,49 +110,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_reference: wf.wrapper_path("cnvkit/reference") -# rule somatic_targeted_seq_cnv_calling_cnvkit_flat_reference_panel: -# input: -# unpack(wf.get_input_files("cnvkit", "flat_reference_panel")), -# params: -# wf.get_params("cnvkit", "reference"), -# output: -# **wf.get_output_files("cnvkit", "flat_reference_panel"), -# log: -# **wf.get_log_file("cnvkit", "reference"), -# threads: wf.get_resource("cnvkit", "reference", "threads") -# resources: -# time=wf.get_resource("cnvkit", "reference", "time"), -# memory=wf.get_resource("cnvkit", "reference", "memory"), -# partition=wf.get_resource("cnvkit", "reference", "partition"), -# tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), -# wrapper: -# wf.wrapper_path("cnvkit/reference") - - -# rule somatic_targeted_seq_cnv_calling_cnvkit_flat_reference_wgs: -# input: -# unpack(wf.get_input_files("cnvkit", "flat_reference_wgs")), -# params: -# wf.get_params("cnvkit", "reference"), -# output: -# **wf.get_output_files("cnvkit", "flat_reference_wgs"), -# log: -# **wf.get_log_file("cnvkit", "reference"), -# threads: wf.get_resource("cnvkit", "reference", "threads") -# resources: -# time=wf.get_resource("cnvkit", "reference", "time"), -# memory=wf.get_resource("cnvkit", "reference", "memory"), -# partition=wf.get_resource("cnvkit", "reference", "partition"), -# tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), -# wrapper: -# wf.wrapper_path("cnvkit/reference") - - -rule somatic_targeted_seq_cnv_calling_cnvkit_fix: +rule somatic_cnv_calling_cnvkit_fix: input: unpack(wf.get_input_files("cnvkit", "fix")), params: - wf.get_params("cnvkit", "fix"), + **{"args": wf.get_args("cnvkit", "fix")}, output: **wf.get_output_files("cnvkit", "fix"), log: @@ -148,11 +129,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_fix: wf.wrapper_path("cnvkit/fix") -rule somatic_targeted_seq_cnv_calling_cnvkit_segment: +rule somatic_cnv_calling_cnvkit_segment: input: unpack(wf.get_input_files("cnvkit", "segment")), params: - wf.get_params("cnvkit", "segment"), + **{"args": wf.get_args("cnvkit", "segment")}, output: **wf.get_output_files("cnvkit", "segment"), 
     log:
@@ -167,11 +148,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_segment:
         wf.wrapper_path("cnvkit/segment")


-rule somatic_targeted_seq_cnv_calling_cnvkit_call:
+rule somatic_cnv_calling_cnvkit_call:
     input:
         unpack(wf.get_input_files("cnvkit", "call")),
     params:
-        wf.get_params("cnvkit", "call"),
+        **{"args": wf.get_args("cnvkit", "call")},
     output:
         **wf.get_output_files("cnvkit", "call"),
     log:
@@ -186,11 +167,13 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_call:
         wf.wrapper_path("cnvkit/call")


-rule somatic_targeted_seq_cnv_calling_cnvkit_bintest:
+rule somatic_cnv_calling_cnvkit_bintest:
+    input:
+        unpack(wf.get_input_files("cnvkit", "bintest")),
     output:
         **wf.get_output_files("cnvkit", "bintest"),
     params:
-        wf.get_params("cnvkit", "bintest"),
+        **{"args": wf.get_args("cnvkit", "bintest")},
     log:
         **wf.get_log_file("cnvkit", "bintest"),
     threads: wf.get_resource("cnvkit", "bintest", "threads")
@@ -203,77 +186,96 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_bintest:
         wf.wrapper_path("cnvkit/bintest")


-rule somatic_targeted_seq_cnv_calling_cnvkit_plot_diagram:
+# rule somatic_cnv_calling_cnvkit_plot_diagram:
+#     input:
+#         unpack(wf.get_input_files("cnvkit", "plot/diagram")),
+#     params:
+#         **{"args": wf.get_args("cnvkit", "plot/diagram")},
+#     output:
+#         **wf.get_output_files("cnvkit", "plot/diagram"),
+#     log:
+#         **wf.get_log_file("cnvkit", "plot/diagram"),
+#     threads: wf.get_resource("cnvkit", "plot/diagram", "threads")
+#     resources:
+#         time=wf.get_resource("cnvkit", "plot/diagram", "time"),
+#         memory=wf.get_resource("cnvkit", "plot/diagram", "memory"),
+#         partition=wf.get_resource("cnvkit", "plot/diagram", "partition"),
+#         tmpdir=wf.get_resource("cnvkit", "plot/diagram", "tmpdir"),
+#     wrapper:
+#         wf.wrapper_path("cnvkit/plot/diagram")
+#
+#
+rule somatic_cnv_calling_cnvkit_plot_scatter:
     input:
-        unpack(wf.get_input_files("cnvkit", "plot/diagram")),
+        unpack(wf.get_input_files("cnvkit", "scatter")),
     params:
-        wf.get_params("cnvkit", "plot/diagram"),
+        **{"args": wf.get_args("cnvkit", "scatter")},
     output:
-        **wf.get_output_files("cnvkit", "plot/diagram"),
+        **wf.get_output_files("cnvkit", "scatter"),
     log:
-        **wf.get_log_file("cnvkit", "plot/diagram"),
-    threads: wf.get_resource("cnvkit", "plot/diagram", "threads")
+        **wf.get_log_file("cnvkit", "scatter"),
+    threads: wf.get_resource("cnvkit", "scatter", "threads")
     resources:
-        time=wf.get_resource("cnvkit", "plot/diagram", "time"),
-        memory=wf.get_resource("cnvkit", "plot/diagram", "memory"),
-        partition=wf.get_resource("cnvkit", "plot/diagram", "partition"),
-        tmpdir=wf.get_resource("cnvkit", "plot/diagram", "tmpdir"),
+        time=wf.get_resource("cnvkit", "scatter", "time"),
+        memory=wf.get_resource("cnvkit", "scatter", "memory"),
+        partition=wf.get_resource("cnvkit", "scatter", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "scatter", "tmpdir"),
    wrapper:
-        wf.wrapper_path("cnvkit/plot/diagram")
+        wf.wrapper_path("cnvkit/plot/scatter")


-rule somatic_targeted_seq_cnv_calling_cnvkit_plot_scatter:
+rule somatic_cnv_calling_cnvkit_report_metrics:
     input:
-        unpack(wf.get_input_files("cnvkit", "plot/scatter")),
+        unpack(wf.get_input_files("cnvkit", "metrics")),
     params:
-        wf.get_params("cnvkit", "plot/scatter"),
+        **{"args": wf.get_args("cnvkit", "metrics")},
     output:
-        **wf.get_output_files("cnvkit", "plot/scatter"),
+        **wf.get_output_files("cnvkit", "metrics"),
     log:
-        **wf.get_log_file("cnvkit", "plot/scatter"),
-    threads: wf.get_resource("cnvkit", "plot/scatter", "threads")
+        **wf.get_log_file("cnvkit", "metrics"),
+    threads:
wf.get_resource("cnvkit", "metrics", "threads") resources: - time=wf.get_resource("cnvkit", "plot/scatter", "time"), - memory=wf.get_resource("cnvkit", "plot/scatter", "memory"), - partition=wf.get_resource("cnvkit", "plot/scatter", "partition"), - tmpdir=wf.get_resource("cnvkit", "plot/scatter", "tmpdir"), + time=wf.get_resource("cnvkit", "metrics", "time"), + memory=wf.get_resource("cnvkit", "metrics", "memory"), + partition=wf.get_resource("cnvkit", "metrics", "partition"), + tmpdir=wf.get_resource("cnvkit", "metrics", "tmpdir"), wrapper: - wf.wrapper_path("cnvkit/plot/scatter") + wf.wrapper_path("cnvkit/report/metrics") -rule somatic_targeted_seq_cnv_calling_cnvkit_report_metrics: +rule somatic_cnv_calling_cnvkit_report_segmetrics: input: - unpack(wf.get_input_files("cnvkit", "report/metrics")), + unpack(wf.get_input_files("cnvkit", "segmetrics")), params: - wf.get_params("cnvkit", "report/metrics"), + **{"args": wf.get_args("cnvkit", "segmetrics")}, output: - **wf.get_output_files("cnvkit", "report/metrics"), + **wf.get_output_files("cnvkit", "segmetrics"), log: - **wf.get_log_file("cnvkit", "report/metrics"), - threads: wf.get_resource("cnvkit", "report/metrics", "threads") + **wf.get_log_file("cnvkit", "segmetrics"), + threads: wf.get_resource("cnvkit", "segmetrics", "threads") resources: - time=wf.get_resource("cnvkit", "report/metrics", "time"), - memory=wf.get_resource("cnvkit", "report/metrics", "memory"), - partition=wf.get_resource("cnvkit", "report/metrics", "partition"), - tmpdir=wf.get_resource("cnvkit", "report/metrics", "tmpdir"), + time=wf.get_resource("cnvkit", "segmetrics", "time"), + memory=wf.get_resource("cnvkit", "segmetrics", "memory"), + partition=wf.get_resource("cnvkit", "segmetrics", "partition"), + tmpdir=wf.get_resource("cnvkit", "segmetrics", "tmpdir"), wrapper: - wf.wrapper_path("cnvkit/report/metrics") + wf.wrapper_path("cnvkit/report/segmetrics") -rule somatic_targeted_seq_cnv_calling_cnvkit_report_segmetrics: +rule somatic_cnv_calling_cnvkit_report_genemetrics: input: - unpack(wf.get_input_files("cnvkit", "report/segmetrics")), + unpack(wf.get_input_files("cnvkit", "genemetrics")), params: - wf.get_params("cnvkit", "report/segmetrics"), + **{"args": wf.get_args("cnvkit", "genemetrics")}, output: - **wf.get_output_files("cnvkit", "report/segmetrics"), + **wf.get_output_files("cnvkit", "genemetrics"), log: - **wf.get_log_file("cnvkit", "report/segmetrics"), - threads: wf.get_resource("cnvkit", "report/segmetrics", "threads") + **wf.get_log_file("cnvkit", "genemetrics"), + threads: wf.get_resource("cnvkit", "genemetrics", "threads") resources: - time=wf.get_resource("cnvkit", "report/segmetrics", "time"), - memory=wf.get_resource("cnvkit", "report/segmetrics", "memory"), - partition=wf.get_resource("cnvkit", "report/segmetrics", "partition"), - tmpdir=wf.get_resource("cnvkit", "report/segmetrics", "tmpdir"), + time=wf.get_resource("cnvkit", "genemetrics", "time"), + memory=wf.get_resource("cnvkit", "genemetrics", "memory"), + partition=wf.get_resource("cnvkit", "genemetrics", "partition"), + tmpdir=wf.get_resource("cnvkit", "genemetrics", "tmpdir"), wrapper: - wf.wrapper_path("cnvkit/report/segmetrics") + wf.wrapper_path("cnvkit/report/genemetrics") diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_cnv_calling/model.py index ef10a9983..f86472ba4 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/model.py +++ b/snappy_pipeline/workflows/somatic_cnv_calling/model.py @@ -1,10 +1,11 @@ import enum 
 import typing
 from typing import Annotated
-
 from pydantic import Field, model_validator  # , validator

-from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, Parallel
+from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel
+from snappy_pipeline.models.cnvkit import Cnvkit as CnvkitGeneric
+from snappy_pipeline.models.library_kit import LibraryKitEntry


 class WgsCaller(enum.StrEnum):
@@ -19,117 +20,86 @@ class WesCaller(enum.StrEnum):


 class Tools(SnappyModel):
-    wgs: Annotated[typing.List[WgsCaller], EnumField(WgsCaller, [])]
+    wgs: Annotated[list[WgsCaller], EnumField(WgsCaller, [])]
     """WGS calling tools"""

-    wes: Annotated[typing.List[WesCaller], EnumField(WesCaller, [])]
+    wes: Annotated[list[WesCaller], EnumField(WesCaller, [])]
     """WES calling tools"""


-class Sex(enum.StrEnum):
-    SAMPLESHEET = "samplesheet"
-    """Obtain the sex from the samplesheet"""
-    DIPLOID_ONLY = "diploid_only"
-    """Compute CNV for diploid chromosomes only"""
-    AUTO = "auto"
-    """Automatic sex detection using X/Y coverage"""
-    FEMALE = "female"
-    """Assume all samples are female"""
-    MALE = "male"
-    """Assume all samples are male"""
-    UNKNOWN = "unknown"
-    """Sex is unknown"""
-
-
 class SequencingMethod(enum.StrEnum):
     WES = "hybrid"
     PANEL = "amplicon"
     WGS = "wgs"


-class LibraryKitDefinition(SnappyModel):
-    """
-    Mapping from enrichment kit to target region BED file, for either computing per--target
-    region coverage or selecting targeted exons.
-
-    The following will match both the stock IDT library kit and the ones
-    with spike-ins seen fromr Yale genomics. The path above would be
-    mapped to the name "default".
-
-    name: IDT_xGen_V1_0
-    pattern: "xGen Exome Research Panel V1\\.0*"
-    path: "path/to/targets.bed"
-    """
+class SexValue(enum.StrEnum):
+    MALE = "male"
+    FEMALE = "female"

-    name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])]
-    pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])]

+class SexOrigin(enum.StrEnum):
+    AUTOMATIC = "auto"
+    SAMPLESHEET = "samplesheet"
+    CONFIG = "config"

-    path: Annotated[str, Field(examples=["path/to/targets.bed"])]

+class Sex(SnappyModel):
+    source: SexOrigin = SexOrigin.AUTOMATIC
+    default: SexValue | None = None
+
+    @model_validator(mode="after")
+    def ensure_default_value(self):
+        if self.source == SexOrigin.CONFIG and not self.default:
+            raise ValueError("Undefined default sex value in configuration file")
+        return self


 class PanelOfNormalsOrigin(enum.StrEnum):
-    PREVIOUS_STEP = "previous_step"
+    COHORT = "cohort"
     """Use (& possibly create) a panel of normals from the current cohort of normals in the panel_of_normals step"""

-    STATIC = "static"
+
+    FILE = "file"
     """Use an panel of normals from another cohort or from public data"""

+    FLAT = "flat"
+    """Use a flat panel of normals (i.e. no real panel of normals)"""
+

 class PanelOfNormals(SnappyModel):
     enabled: bool = False
-    origin: PanelOfNormalsOrigin = PanelOfNormalsOrigin.PREVIOUS_STEP
-    path_panel_of_normals: str = "../panel_of_normals"
-    """
-    Path to panel of normals created in current project
-
-    The panel of normals can be either a file (typically from another project),
-    or from the current project's panel_of_normals step.
+    """Use panel of normals during CNV calling"""

-    In the latter case, the missing one(s) (in case there are more than one panel, or if there are WES & WGS)
-    will be created when not present.
-    The matching of genome release & exome baits is done on genome name & exome baits md5 checksum.
-    These are computed in the panel of normals step, and saved with the panel itself.
+    source: PanelOfNormalsOrigin = PanelOfNormalsOrigin.FILE
+    """Which type of panel of normals should be used"""

-    There is no such matching if a panel of normal file is provided. The panel of normals validity is left to the user.
+    path_panel_of_normals: str = ""
     """
+    Path to panel of normals.
+    The panel of normals can be either a file (typically from another project, or from the software's own data),
+    or the path to the pipeline's ```panel_of_normals``` step, depending on the choice of source.

-class Mutect2(Parallel):
-    panel_of_normals: PanelOfNormals | None = None
+    Note that there is no test that the panel of normals is suitable for that cohort.
     """
-    Panel of normals created by the PanelOfNormals program.
-    """
-
-    germline_resource: str

-    common_variants: str | None = ""
-    """Common germline variants for contamination estimation"""
+    @model_validator(mode="after")
+    def ensure_panel_of_normals_path(self):
+        if (
+            self.enabled
+            and self.source != PanelOfNormalsOrigin.FLAT
+            and not self.path_panel_of_normals
+        ):
+            raise ValueError("Undefined panel of normal path")
+        return self

-    arguments_for_purecn: bool = True
-    """
-    PureCN requires that Mutect2 be called with arguments:
-    --genotype-germline-sites true --genotype-pon-sites true
-    """

-    extra_arguments: Annotated[
-        typing.List[str],
-        # AfterValidator(argument),
-        Field(
-            examples=[
-                "--read-filter CigarContainsNoNOperator",
-                "--annotation AssemblyComplexity BaseQuality",
-            ]
-        ),
-    ] = []
-    """
-    List additional Mutect2 arguments.
-    Each additional argument must be of the form:
-    "-- "
-    For example, to filter reads prior to calling & to add annotations to the output vcf:
-    - "--read-filter CigarContainsNoNOperator"
-    - "--annotation AssemblyComplexity BaseQuality"
-    """
+class VariantOrigin(enum.StrEnum):
+    COHORT = "cohort"
+    """Call somatic variants from the current cohort of normals in the somatic_variant_calling step"""

-    window_length: int = 300000000
+    FILE = "file"
+    """Use variants from another cohort or from public data"""


 class VariantTool(enum.StrEnum):
@@ -138,14 +108,33 @@ class VariantTool(enum.StrEnum):

 class Variant(SnappyModel):
     enabled: bool = False
-    tool: VariantTool | None = None
+    """Use variants (somatic &/or germline) to improve CNV calling"""

-    mutect2: Mutect2 | None = None
+    source: VariantOrigin = VariantOrigin.FILE
+    """Where the variants are obtained from"""

+    path_somatic_variant_calling: str = ""
+    """
+    Path to the variants to use for CNV calling.

-class Ascat(SnappyModel):
-    pass
-    """TODO: configure purity tools (except for PureCN)"""
+    The path can be either to the ```somatic_variant_calling``` step in the pipeline, if "cohort" is selected,
+    or to the vcf file with the variants when "file" is selected as source.
+    """
+
+    tool: VariantTool = VariantTool.MUTECT2
+    """Tool used to call somatic variants in the pipeline"""
+
+    @model_validator(mode="after")
+    def ensure_path_to_variants(self):
+        if (
+            self.enabled
+            and self.source == VariantOrigin.FILE
+            and not self.path_somatic_variant_calling
+        ):
+            raise ValueError(
+                "A path to the variant vcf file must be provided when selecting 'file' as source"
+            )
+        return self


 class Sequenza(SnappyModel):
@@ -157,13 +146,24 @@ class ControlFreec(SnappyModel):


 class PureCn(SnappyModel):
-    panel_of_normals: PanelOfNormals
+    panel_of_normals: PanelOfNormals = PanelOfNormals()
     """
     Panel of normals created by the NormalDB.R script.
     This is required even if the normal/tumor paired mode won't use it.
     """

-    variants: VariantTool
+    @model_validator(mode="after")
+    def restrict_pon_mode(self) -> typing.Self:
+        if not self.panel_of_normals.enabled:
+            raise ValueError("PureCN requires a panel of normals")
+        return self
+
+    path_target_interval_list_mapping: list[LibraryKitEntry] = []
+    """
+    Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect)
+    """
+
+    somatic_variant_calling: Variant = Variant()

     mappability: str = ""
     """
@@ -202,307 +202,98 @@ class PurityTool(enum.StrEnum):
     PURECN = "purecn"


-class Purity(SnappyModel):
-    enabled: bool = False
+class PurityOrigin(enum.StrEnum):
+    AUTOMATIC = "auto"
+    """Use the current tool to compute purity & ploidy (PureCN & Sequenza estimate purity & ploidy)"""

-    ignore_samplesheet: bool = False
-    """Discard purity values in samplesheet when they exist"""
-    default_value: float | None = None
-    """Purity value for all samples"""
+    COHORT = "cohort"
+    """Use an external tool from the pipeline to compute purity & ploidy"""

-    tool: PurityTool | None = None
-    """Tool used for purity estimation, if not set, try samplesheet, otherwise default_value"""
-
-    ascat: Ascat | None = None
+    SAMPLESHEET = "samplesheet"
+    """Extract purity/ploidy from sample sheet"""

+    CONFIG = "config"
+    """Extract purity/ploidy from configuration file (all samples have the same value)"""

-class CnvkitSegmentationMethod(enum.StrEnum):
-    CBS = "cbs"
-    FLASSO = "flasso"
-    HAAR = "haar"
-    HMM = "hmm"
-    HMM_TUMOR = "hmm-tumor"
-    HMM_GERMLINE = "hmm-germline"
-    NONE = "none"

+class Purity(SnappyModel):
+    enabled: bool = False
+    """Use sample purity during CNV calling"""

-class CnvkitCallingMethod(enum.StrEnum):
-    THRESHOLD = "threshold"
-    CLONAL = "clonal"
-    NONE = "none"
+    source: PurityOrigin = PurityOrigin.SAMPLESHEET

+    path_somatic_purity_ploidy_estimate: str = "../somatic_purity_ploidy_estimate"

-class CnvkitCenterMethod(enum.StrEnum):
-    MEAN = "mean"
-    MEDIAN = "median"
-    MODE = "mode"
-    BIWEIGHT = "biweight"
+    tool: PurityTool = PurityTool.PURECN
+    """Tool used for purity estimation when purity/ploidy are computed by the pipeline"""

+    purity: float | None = None
+    """Default purity estimate"""

+    ploidy: float = 2.0
+    """Default ploidy value"""

-class CnvkitFilterMethod(enum.StrEnum):
-    AMPDEL = "ampdel"
-    CN = "cn"
-    CI = "ci"
-    SEM = "sem"
+    @model_validator(mode="after")
+    def ensure_valid_params_for_source(self):
+        if self.enabled and self.source == PurityOrigin.CONFIG and self.purity is None:
+            raise ValueError("Missing default purity value")
+        return self

-class CnvkitAccess(SnappyModel):
-    exclude: Annotated[
-        str | None,
-        Field(
-            examples=[
-                "/fast/work/groups/cubi/projects/biotools/static_data/app_support/cnvkit/access-5k-mappable.grch37.bed"
-            ]
-        ),
-    ] = None
-    """Regions accessible to mapping"""
-
-    min_gap_size: int = 5000
-    """Minimum gap size between accessible sequence regions. Regions separated by less than this distance will be joined together."""
-
-
-class CnvkitTarget(SnappyModel):
-    split: bool = False
-    """Split large tiled intervals into smaller, consecutive targets."""
-    avg_size: float = 800 / 3
-    """Average size of split target bins (results are approximate)"""
-
-
-class CnvkitAntitarget(SnappyModel):
-    avg_size: float = 150000
-    """Average size of split antitarget bins (results are approximate)"""
-    min_size: float | None = None
-    """Minimum size of antitarget bins (smaller regions are dropped).
When missing, 1/16 avg size""" - - -class CnvkitCoverage(SnappyModel): - count: bool = False - """Get read depths by counting read midpoints within each bin.""" - min_mapq: int = 0 - """Minimum mapping quality score (phred scale 0-60) to count a read for coverage depth.""" - - -class CnvkitReference(SnappyModel): - cluster: bool = False - """Calculate and store summary stats for clustered subsets of the normal samples with similar coverage profiles.""" - min_cluster_size: int = 4 - """Minimum cluster size to keep in reference profiles.""" - no_gc: bool = False - """Skip GC correction.""" - no_edge: bool = None - """Skip edge correction. Automatic selection when None (True for WGS & Panel, False for WES)""" - no_rmask: bool = False - """Skip RepeatMasker correction.""" - - -class CnvkitFix(SnappyModel): - cluster: bool = False - """Compare and use cluster-specific values present in the reference profile.""" - no_gc: bool = False - """Skip GC correction.""" - no_edge: bool = False - """Skip edge correction.""" - no_rmask: bool = False - """Skip RepeatMasker correction.""" - - -class CnvkitSegment(SnappyModel): - method: CnvkitSegmentationMethod = CnvkitSegmentationMethod.CBS - """Segmentation method, or 'NONE' for chromosome arm-level averages as segments""" - threshold: float = 0.0001 - """Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation. For HMM methods, this is the smoothing window size.""" - drop_low_coverage: bool = False - """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" - drop_outliers: float = 10 - """Drop outlier bins more than this many multiples of the 95th quantile away from the average within a rolling window. Set to 0 for no outlier filtering.""" - smooth_cbs: bool = False - - min_variant_depth: float = 20 - """Minimum read depth for a SNV to be displayed in the b-allele frequency plot.""" - zygocity_freq: float = 0.25 - """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" +class PanelOfNormalsCnvkit(PanelOfNormals): + path_targets: str | None = None + """Path to target file (used only when pon is obtained from file, taken from pipeline step otherwise)""" + path_antitargets: str | None = None + """Path to antitarget file (used only when pon is obtained from file, taken from pipeline step otherwise)""" @model_validator(mode="after") - def ensure_smooth_for_cbs_only(self) -> typing.Self: - if self.smooth_cbs and self.method != CnvkitSegmentationMethod.CBS: - raise ValueError("'smooth_cbs' option can be used only with 'CBS' segmentation method") + def ensure_paths_target_antitarget(self): + if self.enabled and self.source == PanelOfNormalsOrigin.FILE: + if self.path_targets is None or self.path_antitargets is None: + raise ValueError( + "When using a previous pon, target & antitarget files must be defined" + ) return self -class CnvkitCall(SnappyModel): - method: CnvkitCallingMethod = CnvkitCallingMethod.THRESHOLD - """Calling method.""" - thresholds: str | None = None - """Hard thresholds for calling each integer copy number, separated by commas""" - center: CnvkitCenterMethod | None = CnvkitCenterMethod.MEDIAN - """Re-center the log2 ratio values using this estimator of the center or average value. ('median' if no argument given.)""" - center_at: float | None = None - """Subtract a constant number from all log2 ratios. 
For "manual" re-centering.""" - filter: CnvkitFilterMethod | None = None - """Merge segments flagged by the specified filter(s) with the adjacent segment(s).""" - ploidy: float | None = 2 - """Ploidy of the sample cells.""" - drop_low_coverage: bool = False - """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" - - min_variant_depth: float = 20 - """Minimum read depth for a SNV to be displayed in the b-allele frequency calculation.""" - zygocity_freq: float = 0.25 - """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" - - -class CnvkitBintest(SnappyModel): - alpha: float = 0.005 - """Significance threhold.""" - target: bool = False - """Test target bins only; ignore off-target bins.""" - - -class CnvkitPlotDiagram(SnappyModel): - threshold: float = 0.5 - """Copy number change threshold to label genes.""" - min_probes: int = 3 - """Minimum number of covered probes to label a gene.""" - no_shift_xy: bool = False - - -class CnvkitPlotScatter(SnappyModel): - antitarget_marker: str | None = None - """Plot antitargets using this symbol when plotting in a selected chromosomal region.""" - by_bin: bool = False - """Plot data x-coordinates by bin indices instead of genomic coordinates.""" - segment_color: str | None = None - """Plot segment lines in this color. Value can be any string accepted by matplotlib.""" - trend: bool = False - """Draw a smoothed local trendline on the scatter plot.""" - y_max: float | None = None - """y-axis upper limit.""" - y_min: float | None = None - """y-axis lower limit.""" - fig_size: typing.Tuple[float, float] | None = None - """Width and height of the plot in inches.""" - - min_variant_depth: float = 20 - """Minimum read depth for a SNV to be displayed in the b-allele frequency calculation.""" - zygocity_freq: float = 0.25 +class VariantCnvkit(Variant): + min_variant_depth: int = 20 + """Minimum read depth for a SNV to be displayed in the b-allele frequency plot.""" + zygocity_freq: float | None = None """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" -class CnvkitPlot(SnappyModel): - diagram: CnvkitPlotDiagram = CnvkitPlotDiagram() - scatter: CnvkitPlotScatter = CnvkitPlotScatter() - - -class CnvkitReportMetrics(SnappyModel): - drop_low_coverage: bool = False - """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" - - -class CnvkitReportSegmetrics(SnappyModel): - drop_low_coverage: bool = False - """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" - alpha: float = 0.05 - """Level to estimate confidence and prediction intervals; use with --ci and --pi.""" - bootstrap: int = 100 - """Number of bootstrap iterations to estimate confidence interval; use with --ci.""" - - -class CnvkitReport(enum.StrEnum): - METRICS = "metrics" - SEGMETRICS = "segmetrics" - - -class Cnvkit(SnappyModel): - panel_of_normals: PanelOfNormals | None = None +class Cnvkit(CnvkitGeneric): + panel_of_normals: PanelOfNormalsCnvkit = PanelOfNormalsCnvkit() - variants: VariantTool | None = None - - purity: Purity + path_target_interval_list_mapping: list[LibraryKitEntry] = [] """ - When present, purity estimates can be used for calling segments. The requested tool must be configured. - Or the purity can be provided in the samplesheet, as an extra information attached to the library. 
- - Note that PureCN cannot be used to estimate purity for WGS samples (because PureCN is WES & Panel-only). - TODO: This should be tested by a validation method, I don't know how to do (Till help!!) - TODO: The exact name is not yet set. + Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect) """ - access: CnvkitAccess = CnvkitAccess() - target: CnvkitTarget = CnvkitTarget() - antitarget: CnvkitAntitarget = CnvkitAntitarget() - coverage: CnvkitCoverage = CnvkitCoverage() + somatic_variant_calling: VariantCnvkit = VariantCnvkit() - reference: CnvkitReference | None = None + somatic_purity_ploidy_estimate: Purity = Purity() @model_validator(mode="after") - def set_default_reference(self) -> typing.Self: - if self.reference is None and not self.panel_of_normals.enabled: - self.reference = CnvkitReference() + def ensure_purity_not_auto(self): + if self.somatic_purity_ploidy_estimate.source == PurityOrigin.AUTOMATIC: + raise ValueError("Cnvkit cannot compute purity/ploidy by itself") return self - fix: CnvkitFix = CnvkitFix() - segment: CnvkitSegment = CnvkitSegment() - call: CnvkitCall = CnvkitCall() - bintest: CnvkitBintest = CnvkitBintest() - - use_male_reference: bool = False - """Create/use a male reference. Must be identical to panel of normals creation, when using one""" - - plots: typing.List[CnvkitPlot] = [] - - reports: typing.List[CnvkitReport] = [] - metrics: CnvkitReportMetrics | None = None + sample_sex: Sex = Sex() - # @validator("metrics") - # def get_default_reference(cls, v, values) -> CnvkitReportMetrics | None: - # if v is None and "metrics" in values["reports"]: - # return CnvkitReportMetrics() - # return None - - segmetrics: CnvkitReportSegmetrics | None = None - - # @validator("segmetrics") - # def get_default_reference(cls, v, values) -> CnvkitReportSegmetrics | None: - # if v is None and "segmetrics" in values["reports"]: - # return CnvkitReportSegmetrics() - # return None + path_access: str | None = None + """Overrides access when not None""" class SomaticCnvCalling(SnappyStepModel): - path_ngs_mapping: str + path_ngs_mapping: str = "../ngs_mapping" """Path to bam files""" tools: Tools """Tools for WGS & WES data""" - path_target_interval_list_mapping: typing.List[LibraryKitDefinition] | None = None - - sex: Sex = Sex.DIPLOID_ONLY - - cnvkit: Cnvkit + cnvkit: Cnvkit | None = None purecn: PureCn | None = None sequenza: Sequenza | None = None control_freec: ControlFreec | None = None - - mutect2: Mutect2 | None = None - - default_ploidy: float | None = None - - # @model_validator(mode="after") - # def ensure_single_pon_step(self) -> typing.Self: - # """ - # I am not sure this is absolutely required. 
-    #     I am trying to avoid registering the panel_of_normals step when initializing SomaticCnvCalling
-    #     """
-    #     pon_steps = set()
-    #     for tool in itertools.chain(self.tools.wgs, self.tools.wes):
-    #         tool_config = getattr(self, tool)
-    #         if (
-    #             tool_config
-    #             and getattr(tool_config, "use_panel_of_normals")
-    #             and tool_config.use_panel_of_normals == PanelOfNormalsUse.PREVIOUS_STEP
-    #         ):
-    #             pon_steps.add(str(tool_config.panel_of_normals.panel_of_normals))
-    #     if len(pon_steps) > 1:
-    #         raise ValueError("Too many panel_of_normals steps")
-    #     return self
diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
index c93c981c5..2c2d8faf8 100644
--- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
@@ -20,13 +20,13 @@
 cmd = r"""
 cnvkit.py access \
     -o {snakemake.output.access} \
-    --min-gap-size {args[min_gap_size]} \
-    {exclude} \
+    {min_gap_size} {exclude} \
     {args[reference]}
 """.format(
     snakemake=snakemake,
     args=args,
-    exclude=" ".join([f"--exclude {x}" for x in args["exclude"]]) if "exclude" in args else "",
+    min_gap_size=f"--min-gap-size {args['min-gap-size']}" if args.get("min-gap-size", None) is not None else "",
+    exclude=" ".join([f"--exclude {x}" for x in args.get("exclude", [])]),
 )

 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py
index 596626831..c7009727f 100644
--- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py
@@ -17,19 +17,16 @@
 args = snakemake.params.get("args", {})

-if snakemake.input.get("target", "") != "":
-    cmd = r"""
-    cnvkit.py antitarget \
-        -o {snakemake.output.antitarget} \
-        --avg-size {args['avg_size']} --min-size {args['min_size']} \
-        {access} \
-        {snakemake.input.target}
-    """.format(
-        snakemake=snakemake,
-        args=args,
-        access=f"--access {args['access']}" if "access" in args else "",
-    )
-else:
-    cmd = f"touch {snakemake.output.antitarget}"
+cmd = r"""
+cnvkit.py antitarget \
+    -o {snakemake.output.antitarget} \
+    --avg-size {args[avg-size]} {min_size} \
+    --access {args[access]} \
+    {args[target]}
+""".format(
+    snakemake=snakemake,
+    args=args,
+    min_size=f"--min-size {args['min-size']}" if args.get("min-size") is not None else "",
+)

 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py
index ce913b505..9711475ed 100644
--- a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py
@@ -21,15 +21,15 @@
 cnvkit.py autobin --method {args[method]} \
     {out_target} {out_antitarget} \
     {access} {target} \
-    --bp-per-bin {args[bp_per_bin]} \
-    {snakemake.input.bams} \
+    --bp-per-bin {args[bp-per-bin]} \
+    {args[bams]} \
     > {snakemake.output.result}
 """.format(
     snakemake=snakemake,
     args=args,
     out_target=f"--target-output-bed {snakemake.output.target}" if snakemake.output.get("target", "") != "" else "",
     out_antitarget=f"--antitarget-output-bed {snakemake.output.antitarget}" if snakemake.output.get("antitarget", "") != "" else "",
-    access=f"--access {snakemake.input.access}" if snakemake.input.get("access", None) else "",
+    access=f"--access {args['access']}" if "access" in args else "",
     target=f"--targets {args['target']}" if "target" in args else "",
 )

diff --git a/snappy_wrappers/wrappers/cnvkit/bintest/environment.yaml
b/snappy_wrappers/wrappers/cnvkit/bintest/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/bintest/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py b/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py new file mode 100644 index 000000000..0ea46cf30 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py bintest""" + +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" + +args = snakemake.params.get("args", {}) + +cmd = r""" +cnvkit.py bintest \ + -o {snakemake.output.tests} \ + --segment {args[segments]} \ + --alpha {args[alpha]} {target} \ + {args[ratios]} +""".format( + snakemake=snakemake, + args=args, + target=f"--target" if args.get("target", False) else "", +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py index c77d8863b..ca0769c0b 100644 --- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py @@ -1,38 +1,32 @@ # -*- coding: utf-8 -*- """Wrapper vor cnvkit.py call""" +import os import re +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper -class CnvkitWrapperCall(CnvkitWrapper): - PURITY_PATTERN = re.compile("^Purity: +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) *$") - PLOIDY_PATTERN = re.compile("^Ploidy: +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) *$") - - def preamble(self): - if "purity" in self.snakemake.input: - with open(self.snakemake.input.purity, "rt") as f: - for line in f: - m = CnvkitWrapperCall.PURITY_PATTERN.match(line.strip()) - if m: - self.purity = float(m.groups()[1]) - else: - m = CnvkitWrapperCall.PLOIDY_PATTERN.match(line.strip()) - if m: - self.ploidy = float(m.groups()[1]) - else: - self.purity = self.snakemake.params.purity if "purity" in self.snakemake.params else None - self.ploidy = self.snakemake.params.ploidy if "ploidy" in self.snakemake.params else None - - self.cmd = self.cmd.format(purity=self.purity, ploidy=self.ploidy) - -if "variants" in snakemake.input: +args = snakemake.params.get("args", {}) + +PATTERN = re.compile("^(Purity|Ploidy): +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) 
*$") + + +if "variants" in args: variants = r""" - ---vcf {snakemake.input.variants} \ - {snakemake.params.sample_id} {snakemake.params.normal_id} \ - {snakemake.params.min_variant_depth} {snakemake.params.zygocity_freq} + ---vcf {args[variants]} \ + --sample-id {args['sample-id']} --normal-id {args['normal-id']} \ + --min-variant-depth {args['min-variant-depth']} {zygocity_freq} """.format( snakemake=snakemake, + args=args, + zygocity_freq=f"--zygocity-freq {args['zygocity-freq']}" if args.get("zygocity-freq", None) is not None else "", ) else: variants = "" @@ -40,21 +34,25 @@ def preamble(self): cmd = r""" cnvkit.py call \ -o {snakemake.output.calls} \ - --method {snakemake.params.method} --thresholds={snakemake.params.thresholds} \ - --filter {snakemake.params.filter} \ - {center} \ - {drop_low_coverage} \ - {sample_sex} {male_reference} \ + --method {args['method']} {thresholds} \ + {filter} \ + {center} {center_at} {drop_low_coverage} {sample_sex} {male_reference} \ + {purity} {ploidy} \ {variants} \ - {{purity}} {{ploidy}} \ - {snakemake.input.segments} + {args['segments']} """.format( snakemake=snakemake, - center=f"--center-at {snakemake.params.center_at}" if "center_at" in snakemake.params else f"--center {snakemake.params.center}", - drop_low_coverage="--drop-low-coverage" if snakemake.params.drop_low_coverage else "", - sample_sex=f"--sample-sex {snakemake.params.sample_sex}" if "sample_sex" in snakemake.params else "", - male_reference="--male-reference" if snakemake.params.male_reference else "", + args=args, variants=variants, + purity=f"--purity {args['purity']}" if args.get("purity", None) is not None else "", + ploidy=f"--ploidy {args['ploidy']}" if args.get("ploidy", None) is not None else "", + thresholds="--thresholds={}".format(",".join(map(str, args["thresholds"]))) if len(args.get("thresholds", [])) > 0 else "", + filter=f"--filter {args['filter']}" if args.get("filter", None) is not None else "", + center=f"--center {args['center']}" if args.get("center", None) is not None else "", + center_at=f"--center-at {args['center-at']}" if args.get("center-at", None) is not None else "", + drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", + sample_sex=f"--sample-sex {args['sample-sex']}" if args.get("sample-sex", None) is not None else "", + male_reference=f"--male-reference" if args.get("male-reference", False) else "", ) -CnvkitWrapperCall(snakemake, cmd).run() +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py index a71ef8e8e..cef6277bd 100644 --- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py @@ -20,13 +20,13 @@ cmd = r""" cnvkit.py coverage --processes {snakemake.resources._cores} \ -o {snakemake.output.coverage} \ - --fasta {args[reference]} \ + --fasta {args['reference']} \ --min-mapq {args[min_mapq]} {count} \ - {snakemake.input.bam} {snakemake.input.intervals} + {args['bam']} {args['intervals']} """.format( snakemake=snakemake, args=args, - count="--count" if "count" in args else "", + count="--count" if args.get("count", False) else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py index 97387dbcb..554a87222 100644 --- a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py @@ -1,24 +1,35 @@ # -*- coding: utf-8 -*- 
"""Wrapper for cnvkit.py fix""" +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper __author__ = "Eric Blanc" __email__ = "eric.blanc@bih-charite.de" +args = snakemake.params.get("args", {}) + cmd = r""" cnvkit.py fix \ - -o {snakemake.output.coverage} \ - {cluster} {snakemake.params.sample_id} \ + -o {snakemake.output.ratios} \ + {cluster} --sample-id {args['sample-id']} \ {no_gc} {no_edge} {no_rmask} \ - {snakemake.input.target} {antitarget} {snakemake.input.reference} + {args['target']} {antitarget} {args['reference']} """.format( snakemake=snakemake, - cluster="--cluster" if snakemake.params.cluster else "", - no_gc="--no-gc" if snakemake.params.no_gc else "", - no_edge="--no-edge" if snakemake.params.no_edge else "", - no_rmask="--no-rmask" if snakemake.params.no_rmask else "", - antitarget=f"{snakemake.input.antitarget}" if "antitarget" in snakemake.input else "", + cluster="--cluster" if args.get("cluster", False) else "", + no_gc="--no-gc" if args.get("no-gc", False) else "", + no_edge="--no-edge" if args.get("no-edge", False) else "", + no_rmask="--no-rmask" if args.get("no-rmask", False) else "", + antitarget=f"{args['antitarget']}" if "antitarget" in args else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/plot/scatter/environment.yaml b/snappy_wrappers/wrappers/cnvkit/plot/scatter/environment.yaml new file mode 120000 index 000000000..8dc548583 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/plot/scatter/environment.yaml @@ -0,0 +1 @@ +../../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py new file mode 100644 index 000000000..a386a2dec --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py scatter""" + +import os +import re +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. 
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +args = snakemake.params.get("args", {}) + +if "variants" in args: + variants = r""" + --vcf {args[variants]} \ + --sample-id {args[sample-id]} --normal-id {args[normal-id]} \ + --min-variant-depth {args[min-variant-depth]} {zygocity_freq} + """.format( + snakemake=snakemake, + args=args, + zygocity_freq=f"--zygosity-freq {args['zygocity-freq']}" if args.get("zygocity-freq", None) is not None else "", + ) +else: + variants = "" + +cmd = r""" +cnvkit.py scatter \ + -o {snakemake.output.plot} \ + --segment {args[segments]} \ + {chromosome} {gene} {range_list} \ + --width {args[width]} \ + --antitarget-marker {args[antitarget-marker]} --segment-color {args[segment-color]} \ + {by_bin} {trend} --title "{args[title]}" \ + {y_min} {y_max} {fig_size} \ + {variants} \ + {args[ratios]} +""".format( + snakemake=snakemake, + args=args, + variants=variants, + chromosome=f"--chromosome {args['chromosome']}" if args.get("chromosome", None) is not None else "", + gene=f"--gene {args['gene']}" if args.get("gene", None) is not None else "", + range_list=f"--range-list {args['range-list']}" if args.get("range-list", None) is not None else "", + by_bin="--by-bin" if args.get("by-bin", False) else "", + trend="--trend" if args.get("trend", False) else "", + y_min=f"--y-min {args['y-min']}" if args.get("y-min", None) is not None else "", + y_max=f"--y-max {args['y-max']}" if args.get("y-max", None) is not None else "", + fig_size="--fig-size {}".format(" ".join(map(str, args['fig-size']))) if args.get("fig-size", None) is not None else "", +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index 82f71ce41..7d843f650 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -17,9 +17,12 @@ args = snakemake.params.get("args", {}) +target = f"--target {args['target']}" if "target" in args else "" +antitarget = f"--antitarget {args['antitarget']}" if "antitarget" in args else "" + cmd = r""" cnvkit.py reference \ - -o {snakemake.output.panel} \ + -o {snakemake.output.reference} \ --fasta {args[reference]} \ {cluster} {min_cluster_size} \ {sample_sex} {male_reference} {diploid_parx_genome} \ @@ -28,17 +31,17 @@ """.format( snakemake=snakemake, args=args, - cluster="--cluster" if "cluster" in args else "", - min_cluster_size=f"--min-cluster-size {args['min_cluster_size']}" if "cluster" in args and "min_cluster_size" in args else "", - no_gc="--no-gc" if "no_gc" in args else "", - no_edge="--no-edge" if "no_edge" in args else "", - no_rmask="--no-rmask" if "no_rmask" in args else "", - sample_sex=f"--sample-sex {args['sample_sex']}" if "sample_sex" in args else "", - male_reference="--male-reference" if "male_reference" in args else "", - diploid_parx_genome=f"--diploid_parx_genome {args['diploid_parx_genome']}" if "diploid_parx_genome" in args else "", - target=f"--target {snakemake.input.target}" if snakemake.input.get("target", None) else "", - antitarget=f"--antitarget {snakemake.input.antitarget}" if snakemake.input.get("antitarget", None) else "", - normals=" ".join(snakemake.input.normals) if snakemake.input.get("normals", None) else "", + target=target, + antitarget=antitarget, + normals=" ".join(args["normals"]) if 
len(args.get("normals", [])) > 0 else "", + cluster="--cluster" if args.get("cluster", False) else "", + male_reference="--male-reference" if args.get("male-reference", False) else "", + no_gc="--no-gc" if args.get("no-gc", False) else "", + no_edge="--no-edge" if args.get("no-edge", False) else "", + no_rmask="--no-rmask" if args.get("no-rmask", False) else "", + min_cluster_size=f"--min-cluster-size {args['min-cluster-size']}" if "min-cluster-size" in args else "", + sample_sex=f"--sample-sex {args['sample-sex']}" if "sample-sex" in args else "", + diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if "diploid-parx-genome" in args else "" ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/environment.yaml b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/environment.yaml new file mode 120000 index 000000000..8dc548583 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/environment.yaml @@ -0,0 +1 @@ +../../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py new file mode 100644 index 000000000..24a3f5a7e --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py genemetrics""" + +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +args = snakemake.params.get("args", {}) + +cmd = r""" +cnvkit.py genemetrics \ + -o {snakemake.output.report} \ + --segment {args['segments']} \ + --threshold {args['threshold']} --min-probes {args['min-probes']} \ + {drop_low_coverage} {male_reference} {sample_sex} {diploid_parx_genome} \ + {stats} \ + {args[ratios]} +""".format( + snakemake=snakemake, + args=args, + drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", + male_reference="--male-reference" if args.get("male-reference", False) else "", + sample_sex=f"--sample-sex {args['sample-sex']}" if "sample-sex" in args else "", + diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if "diploid-parx-genome" in args else "", + stats=" ".join([f"--{stat}" for stat in args["stats"]]), +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/report/metrics/environment.yaml b/snappy_wrappers/wrappers/cnvkit/report/metrics/environment.yaml new file mode 120000 index 000000000..8dc548583 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/metrics/environment.yaml @@ -0,0 +1 @@ +../../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py new file mode 100644 index 000000000..fe421a339 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py genemetrics""" + +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. 
These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +args = snakemake.params.get("args", {}) + +cmd = r""" +cnvkit.py metrics \ + -o {snakemake.output.report} \ + --segment {args['segments']} \ + {drop_low_coverage} \ + {args[ratios]} +""".format( + snakemake=snakemake, + args=args, + drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/environment.yaml b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/environment.yaml new file mode 120000 index 000000000..8dc548583 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/environment.yaml @@ -0,0 +1 @@ +../../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py new file mode 100644 index 000000000..d569588c2 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py segmetrics""" + +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +args = snakemake.params.get("args", {}) + +cmd = r""" +cnvkit.py segmetrics \ + -o {snakemake.output.report} \ + --segment {args['segments']} \ + --alpha {args['alpha']} --bootstrap {args['bootstrap']} {smooth_bootstrap} \ + {drop_low_coverage} \ + {stats} \ + {args[ratios]} +""".format( + snakemake=snakemake, + args=args, + drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", + smooth_bootstrap="--smooth-bootstrap" if args.get("smooth-bootstrap", False) else "", + stats=" ".join([f"--{stat}" for stat in args["stats"]]), +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py index 648e14a2a..2c02aa03c 100644 --- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py @@ -1,31 +1,45 @@ # -*- coding: utf-8 -*- """Wrapper vor cnvkit.py segment""" +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. 
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper -if "variants" in snakemake.input: +args = snakemake.params.get("args", {}) + +if "variants" in args: variants = r""" - ---vcf {snakemake.input.variants} \ - {snakemake.params.sample_id} {snakemake.params.normal_id} \ - {snakemake.params.min_variant_depth} {snakemake.params.zygocity_freq} + --vcf {args[variants]} \ + --sample-id {args[sample-id]} --normal-id {args[normal-id]} \ + --min-variant-depth {args[min-variant-depth]} {zygocity_freq} """.format( snakemake=snakemake, + args=args, + zygocity_freq=f"--zygosity-freq {args['zygocity-freq']}" if "zygocity-freq" in args else "" ) else: variants = "" cmd = r""" -cnvkit.py segment --processes {snakemake.params.proceses} \ +cnvkit.py segment --processes {snakemake.resources._cores} \ -o {snakemake.output.segments} --dataframe {snakemake.output.dataframe} \ - --method {snakemake.params.method} --threshold {snakemake.params.threshold} {smooth_cbs} \ - {drop_low_coverage} --drop-outliers {snakemake.params.drop_outliers} \ + --method {args[method]} --threshold {args[threshold]} {smooth_cbs} \ + {drop_low_coverage} --drop-outliers {args[drop-outliers]} \ {variants} \ - {snakemake.input.coverage} + {args[coverage]} """.format( snakemake=snakemake, - smooth_cbs="--smooth-cbs" if snakemake.params.smooth_cbs else "", - drop_low_coverage="--drop-low-coverage" if snakemake.params.drop_low_coverage else "", + args=args, variants=variants, + smooth_cbs="--smooth-cbs" if args.get("smooth-cbs", False) else "", + drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index fe08248ff..a3f72f3b7 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -18,34 +18,18 @@ args = snakemake.params.get("args", {}) -# WGS: targets are all accessible regions, WES: targets are baits -interval = snakemake.input.access if snakemake.input.get("access", None) else args["target"] - -if snakemake.input.get("avg_size", "") != "": - pattern = re.compile("^Target:[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)$") - with open(snakemake.input.avg_size) as f: - for line in f: - m = pattern.match(line) - if m: - avg_size = int(float(m.groups()[4])) - break -elif "avg_size" in args: - avg_size = args["avg_size"] -else: - avg_size = None - cmd = r""" cnvkit.py target \ -o {snakemake.output.target} \ - {avg_size} {split} {annotate} \ - {interval} + {avg_size} {split} {annotate} {short_names} \ + {args[interval]} """.format( snakemake=snakemake, args=args, - interval=interval, - avg_size=f"--avg-size {avg_size}" if avg_size is not None else "", - split=f"--split" if "split" in args and args["split"] else "", + avg_size=f"--avg-size {args['avg-size']}" if args.get("avg-size", None) is not None else "", + split="--split" if args.get("split", False) else "", annotate=f"--annotate {args['annotate']}" if "annotate" in args else "", + short_names="--short-names" if args.get("short-names", False) else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/tests/snappy_pipeline/workflows/conftest.py b/tests/snappy_pipeline/workflows/conftest.py index 1a99fed2a..4e7d9de5c 100644 --- 
a/tests/snappy_pipeline/workflows/conftest.py +++ b/tests/snappy_pipeline/workflows/conftest.py @@ -906,6 +906,41 @@ def cancer_sheet_fake_fs_path_link_in(fake_fs, cancer_sheet_tsv): return fake_fs +@pytest.fixture +def autobin_result_fake_fs(fake_fs, cancer_sheet_tsv): + """Return fake file autobin.txt""" + # Create work directory + fake_fs.fs.makedirs("/work", exist_ok=True) + # Create autobin result for the samples + tpl = "/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.autobin.txt" + for line in cancer_sheet_tsv.splitlines()[8:]: + (donor, sample, isTumor, assay, folder, libraryKit, extract) = line.split("\t") + if isTumor == "N": + library_name = f"{donor}-{sample}-{extract}1-{assay}1" + fake_fs.fs.create_file( + tpl.format(mapper="bwa", library_name=library_name), create_missing_dirs=True + ) + return fake_fs + + +@pytest.fixture +def purity_result_fake_fs(fake_fs, cancer_sheet_tsv): + """Return fake file purity.txt""" + # Create work directory + fake_fs.fs.makedirs("/SOMATIC_PURITY_PLOIDY_ESTIMATE/output", exist_ok=True) + # Create purity estimate results for the samples + tpl = "/{mapper}.{purity_tool}.{library_name}/out/{mapper}.{purity_tool}.{library_name}.txt" + for line in cancer_sheet_tsv.splitlines()[8:]: + (donor, sample, isTumor, assay, folder, libraryKit, extract) = line.split("\t") + if isTumor == "Y": + library_name = f"{donor}-{sample}-{extract}1-{assay}1" + fake_fs.fs.create_file( + tpl.format(mapper="bwa", purity_tool="ascat", library_name=library_name), + create_missing_dirs=True, + ) + return fake_fs + + @pytest.fixture def aligner_indices_fake_fs(fake_fs): """Return fake file system setup with files for aligner indices""" diff --git a/tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py new file mode 100644 index 000000000..de45fb4e1 --- /dev/null +++ b/tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py @@ -0,0 +1,580 @@ +# -*- coding: utf-8 -*- +"""Tests for the somatic_cnv_calling workflow module code""" + +import textwrap + +import pytest +import ruamel.yaml as ruamel_yaml +from snakemake.io import Wildcards + +from snappy_pipeline.workflows.somatic_cnv_calling import SomaticCnvCallingWorkflow + +from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict +from .conftest import patch_module_fs + + +@pytest.fixture(scope="module") # otherwise: performance issues +def minimal_config(): + """Return YAML parsing result for (cancer) configuration""" + yaml = ruamel_yaml.YAML() + return yaml.load( + textwrap.dedent( + r""" + static_data_config: + reference: + path: /path/to/ref.fa + cosmic: + path: /path/to/cosmic.vcf.gz + dbsnp: + path: /path/to/dbsnp.vcf.gz + features: + path: /path/to/annotations.gtf + + step_config: + ngs_mapping: + tools: + dna: ['bwa'] + bwa: + path_index: /path/to/bwa/index.fa + + somatic_variant_calling: + tools: ['mutect2'] + path_ngs_mapping: ../ngs_mapping + mutect2: + common_variants: /path/to/common/variants + + somatic_purity_ploidy_estimate: + tools: ['ascat'] + path_ngs_mapping: ../ngs_mapping + ascat: + b_af_loci: /path/to/locii.bed + + somatic_cnv_calling: + tools: + wgs: ['cnvkit'] + path_ngs_mapping: ../ngs_mapping + cnvkit: + diploid_parx_genome: GRCh38 + panel_of_normals: + enabled: False + somatic_variant_calling: + enabled: True + source: cohort + tool: mutect2 + path_somatic_variant_calling: ../somatic_variant_calling + somatic_purity_ploidy_estimate: + enabled: True + source: cohort + tool: 
ascat + segment: + threshold: 0.0001 + scatter: + enabled: true + + data_sets: + first_batch: + file: sheet.tsv + search_patterns: + - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} + search_paths: ['/path'] + type: matched_cancer + naming_scheme: only_secondary_id + """ + ).lstrip() + ) + + +@pytest.fixture +def somatic_cnv_calling_workflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + work_dir, + config_paths, + cancer_sheet_fake_fs, + autobin_result_fake_fs, + purity_result_fake_fs, + aligner_indices_fake_fs, + mocker, +): + """Return SomaticCnvCallingWorkflow object pre-configured with cancer sheet""" + # Patch out file-system to enable reading autobin output + autobin_result_fake_fs.fs.create_file( + file_path="work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.autobin.txt", + contents="Target: -1 2000\n", + create_missing_dirs=True, + ) + # Patch out file-system to enable reading purity output + purity_result_fake_fs.fs.create_file( + file_path="SOMATIC_PURITY_PLOIDY_ESTIMATE/output/bwa.ascat.P001-T1-DNA1-WGS1/out/bwa.ascat.P001-T1-DNA1-WGS1.txt", + contents="Purity/ploidy:\t0.35\t2.2\n", + create_missing_dirs=True, + ) + # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.somatic_cnv_calling", autobin_result_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) + # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we + # can obtain paths from the function as if we really had a NGSMappingPipelineStep there + dummy_workflow.globals = { + "ngs_mapping": lambda x: "NGS_MAPPING/" + x, + "somatic_variant_calling_cnvkit": lambda x: "SOMATIC_VARIANT_CALLING/" + x, + "panel_of_normals_cnvkit": lambda x: "SOMATIC_VARIANT_CALLING/" + x, + "somatic_purity_ploidy_estimate_cnvkit": lambda x: "SOMATIC_PURITY_PLOIDY_ESTIMATE/" + x, + } + # Construct the workflow object + return SomaticCnvCallingWorkflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + config_paths, + work_dir, + ) + + +# Tests for CnvkitStepPart ------------------------------------------------------------------------ + + +def test_cnvkit_step_part_get_args_access(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_access()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "access")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "access")(wildcards), + ) + expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": []} + assert actual == expected + + +def test_cnvkit_step_part_get_args_autobin(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_autobin()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", + } + ) + expected = { + "bams": ["NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam"], + "access": "work/bwa.cnvkit/out/cnvkit.access.bed", + "method": "wgs", + "bp-per-bin": 50000, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "autobin")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "autobin")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_target(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_target()""" 
+ wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", + } + ) + expected = { + "interval": "work/bwa.cnvkit/out/cnvkit.access.bed", + "avg-size": 2000, + "split": True, + "annotate": "/path/to/annotations.gtf", + "short-names": True, + + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "target")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "target")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_coverage(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_coverage()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", + "region": "target", + } + ) + expected = { + "intervals": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.bed", + "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "reference": "/path/to/ref.fa", + "min-mapq": 0, + "count": False, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "coverage")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "coverage")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_reference(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_reference()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", + "region": "target", + } + ) + expected = { + "normals": ["work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.cnn"], + "reference": "/path/to/ref.fa", + "cluster": False, + "no-gc": False, + "no-edge": True, + "no-rmask": False, + "male-reference": False, + "diploid-parx-genome": "GRCh38", + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "reference")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "reference")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_fix(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_fix()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "target": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.target.cnn", + "reference": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.reference.cnn", + "cluster": False, + "no-gc": False, + "no-edge": True, + "no-rmask": False, + "diploid-parx-genome": "GRCh38", + "sample-id": "P001-T1-DNA1-WGS1", + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "fix")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "fix")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_segment(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_segment()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "method": "cbs", + "threshold": 0.0001, + "smooth-cbs": False, + "drop-low-coverage": False, + "drop-outliers": 10, + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + "sample-id": "P001-T1-DNA1-WGS1", + "normal-id": "P001-N1-DNA1-WGS1", + "min-variant-depth": 20, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segment")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "segment")(wildcards), + 
) + assert actual == expected + + +def test_cnvkit_step_part_get_args_call(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_call()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "method": "threshold", + "thresholds": [-1.1, -0.25, 0.2, 0.7], + "drop-low-coverage": False, + "male-reference": False, + "diploid-parx-genome": "GRCh38", + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + "sample-id": "P001-T1-DNA1-WGS1", + "normal-id": "P001-N1-DNA1-WGS1", + "min-variant-depth": 20, + "purity_file": "SOMATIC_PURITY_PLOIDY_ESTIMATE/output/bwa.ascat.P001-T1-DNA1-WGS1/out/bwa.ascat.P001-T1-DNA1-WGS1.txt", + "purity": 0.35, + "ploidy": 2.2, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "call")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "call")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_bintest(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_bintest()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "alpha": 0.005, + "target": False, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "bintest")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "bintest")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_metrics(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_metrics()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "drop-low-coverage": False, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "metrics")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "metrics")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_segmetrics(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_segmetrics()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "drop-low-coverage": False, + "alpha": 0.05, + "bootstrap": 100, + "smooth-bootstrap": False, + "stats": ["mean", "median", "mode", "t-test", "stdev", "sem", "mad", "mse", "iqr", "bivar", "ci", "pi"] + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segmetrics")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "segmetrics")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_genemetrics(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_genemetrics()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + 
"segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "threshold": 0.2, + "min-probes": 3, + "drop-low-coverage": False, + "male-reference": False, + "diploid-parx-genome": "GRCh38", + "alpha": 0.05, + "bootstrap": 100, + "stats": ["mean", "median", "mode", "ttest", "stdev", "sem", "mad", "mse", "iqr", "bivar", "ci", "pi"] + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "genemetrics")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "genemetrics")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_scatter(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_scatter()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + "contig_name": "1", + } + ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "chromosome": "1", + "width": 1000000, + "antitarget-marker": "o", + "by-bin": False, + "trend": False, + "segment-color": "darkorange", + "title": "P001-T1-DNA1-WGS1 - 1", + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + "sample-id": "P001-T1-DNA1-WGS1", + "normal-id": "P001-N1-DNA1-WGS1", + "min-variant-depth": 20, + "fig-size": (6.4, 4.8), + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "scatter")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "scatter")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_parts_get_output_files(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_output_files() for all actions""" + actions = { + "access": {"access": "work/{mapper}.cnvkit/out/cnvkit.access.bed"}, + "autobin": {"result": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.autobin.txt"}, + "target": {"target": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.target.bed"}, + "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed"}, + "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}.cnn"}, + "reference": {"reference": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.reference.cnn"}, + "fix": {"ratios": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr"}, + "segment": { + "segments": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segments.cns", + "dataframe": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.rds", + }, + "call": {"calls": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns"}, + "bintest": {"tests": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.bintest.cns"}, + "metrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.metrics.tsv"}, + "genemetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.genemetrics.tsv"}, + "segmetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.segmetrics.tsv"}, + "scatter": {"plot": "work/{mapper}.cnvkit.{library_name}/plot/{mapper}.cnvkit.{library_name}.scatter.{contig_name}.jpeg"}, + } + for action, result in 
actions.items(): + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_output_files("cnvkit", action) + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_log_file() for all actions""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + actions = ("autobin", "target", "reference", "fix", "segment", "call", "bintest", "metrics", "segmetrics", "genemetrics") + base_log = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}" + for action in actions: + result = {k: base_log + f".{action}.{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", action) + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file_access(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_log_file() for access""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + base_log = "work/{mapper}.cnvkit/log/cnvkit.access" + result = {k: base_log + f".{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", "access") + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file_coverage(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_log_file() for coverage""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + base_log = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}.coverage" + result = {k: base_log + f".{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", "coverage") + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file_scatter(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_log_file() for scatter""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + base_log = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}.scatter.{contig_name}" + result = {k: base_log + f".{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", "scatter") + assert actual == expected + + +# SomaticCnvCallingWorkflow -------------------------------------------------------------------------- + + +def test_somatic_cnv_calling_workflow(somatic_cnv_calling_workflow): + """Test simple functionality of the workflow""" + # Check created sub steps + expected = ["cnvkit", "link_out"] + actual = list(sorted(somatic_cnv_calling_workflow.sub_steps.keys())) + assert actual == expected + + tumor_libraries = ("P001-T1-DNA1-WGS1", "P002-T1-DNA1-WGS1", "P002-T2-DNA1-WGS1") + + expected = [] + + # cnvkit output files + tpl = "output/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" + expected += [ + tpl.format(mapper=mapper, library_name=library_name, ext=ext) + for ext in ("cnr", "segments.cns", "cns", "bintest.cns") + for library_name in tumor_libraries + for mapper in ("bwa",) + ] + + # cnvkit log files + tpl = 
"output/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}.{step}.{ext}" + expected += [ + tpl.format(mapper=mapper, library_name=library_name, step=step, ext=ext) + for ext in ("conda_info.txt", "conda_list.txt", "log", "sh") + for step in ("fix", "segment", "call", "bintest") + for library_name in tumor_libraries + for mapper in ("bwa",) + ] + + # cnvkit report files + tpl = "output/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.{step}.{ext}" + expected += [ + tpl.format(mapper=mapper, library_name=library_name, step=step, ext=ext) + for ext in ("tsv",) + for step in ("metrics", "genemetrics", "segmetrics") + for library_name in tumor_libraries + for mapper in ("bwa",) + ] + + # cnvkit plot files + tpl = "output/{mapper}.cnvkit.{library_name}/plot/{mapper}.cnvkit.{library_name}.{step}.{contig_name}.{ext}" + expected += [ + tpl.format(mapper=mapper, library_name=library_name, step=step, contig_name=contig_name, ext=ext) + for ext in ("jpeg",) + for contig_name in ["all"] + list(map(str, range(1, 23))) + ["X", "Y"] + for step in ("scatter",) + for library_name in tumor_libraries + for mapper in ("bwa",) + ] + + # Add md5 + expected += [x + ".md5" for x in expected] + expected = list(sorted(expected)) + actual = list(sorted(somatic_cnv_calling_workflow.get_result_files())) + assert actual == expected From ba07e4b7d899068cc151fe19dc98cc1a133bba6a Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 27 Nov 2024 14:18:01 +0100 Subject: [PATCH 15/46] feat: somatic cnv calling for cnvkit - complete - partially tested --- snappy_pipeline/models/cnvkit.py | 48 +-- .../models/{library_kit.py => common.py} | 26 +- snappy_pipeline/models/purecn.py | 293 ++++++++++++++++++ .../workflows/somatic_cnv_calling/__init__.py | 259 ++++++++-------- .../workflows/somatic_cnv_calling/model.py | 111 +++---- .../wrappers/cnvkit/call/wrapper.py | 8 +- .../wrappers/cnvkit/coverage/wrapper.py | 6 +- .../wrappers/cnvkit/fix/wrapper.py | 14 +- .../wrappers/cnvkit/plot/scatter/wrapper.py | 16 +- .../cnvkit/report/genemetrics/wrapper.py | 6 +- .../wrappers/cnvkit/report/metrics/wrapper.py | 8 +- .../cnvkit/report/segmetrics/wrapper.py | 6 +- .../wrappers/cnvkit/segment/wrapper.py | 12 +- ... 
=> test_workflows_somatic_cnv_calling.py} (99%) diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py index 3e9be3f55..837f8b984 100644 --- a/snappy_pipeline/models/cnvkit.py +++ b/snappy_pipeline/models/cnvkit.py @@ -61,32 +61,6 @@ # } -class SexOrigin(enum.StrEnum): - AUTOMATIC = "auto" - """Sex determined from the data""" - SAMPLESHEET = "samplesheet" - """Donor sex obtained from sample sheet""" - CONFIG = "config" - """Donor sex obtained from the configuration (all donors have the same sex)""" - - -class SexValue(enum.StrEnum): - MALE = "male" - FEMALE = "female" - - -class Sex(SnappyModel): - source: SexOrigin = SexOrigin.AUTOMATIC - - sample_sex: SexValue | None = None - - @model_validator(mode="after") - def ensure_valid_sex_value(self): - if self.source == SexOrigin.CONFIG and self.sample_sex is None: - raise ValueError("No definition of donors' sex from the configuration") - return self - - class SegmentationMethod(enum.StrEnum): CBS = "cbs" FLASSO = "flasso" @@ -118,24 +92,24 @@ class CallingMethod(enum.StrEnum): class Access(SnappyModel): - exclude: list[str] = [] - """Regions accessible to mapping""" - min_gap_size: int | None = None """ - Minimum gap size between accessible sequence regions. Regions separated by less than this distance will be joined together. - In WGS mode, the _target_ regions are set to the accessible regions in the genome. These accessible regions can be provided by the user, or computed by the `access` module. In the latter case, the optimal bin size is computed by the `autobin` module unless this value is provided by the user. `autobin` uses the `wgs` method _only_ if the list of excluded region is empty and if the `min_gap_size` parameter remains unassigned. If any of these conditions is not met, - or if a files of accessible regions is provided by the user, then then `amplicon` method + or if a file of accessible regions is provided by the user, then the `amplicon` method is used. It is recommended to leave the excluded regions empty and not set the `min_gap_size` parameter for WGS data, unless the accessible regions are much reduced (for example excluding all intergenic regions, repeats, low complexity, ...) """ + + exclude: list[str] = [] + """Regions to exclude (not accessible to mapping)""" + min_gap_size: int | None = None + """Minimum gap size between accessible sequence regions. 
Regions separated by less than this distance will be joined together.""" class Target(SnappyModel): @@ -370,16 +344,6 @@ class CnvkitToReference(SnappyModel): drop_low_coverage: bool = False """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" - @model_validator(mode="after") - def ensure_males_for_reference(self): - if ( - self.male_reference - and self.sex.source == SexOrigin.CONFIG - and self.sex.sample_sex == SexValue.FEMALE - ): - raise ValueError("Male reference requested for female cohort") - return self - class Cnvkit(CnvkitToReference): fix: Fix = Fix() diff --git a/snappy_pipeline/models/library_kit.py b/snappy_pipeline/models/common.py similarity index 62% rename from snappy_pipeline/models/library_kit.py rename to snappy_pipeline/models/common.py index bd861aa72..714135b65 100644 --- a/snappy_pipeline/models/library_kit.py +++ b/snappy_pipeline/models/common.py @@ -1,5 +1,7 @@ +import enum + from typing import Annotated -from pydantic import Field +from pydantic import Field, model_validator from snappy_pipeline.models import SnappyModel @@ -27,3 +29,25 @@ class LibraryKitEntry(SnappyModel): class LibraryKit(SnappyModel): path_target_interval_list_mapping: list[LibraryKitEntry] = [] """Connects sample-based library kit in sample sheets with corresponding bed files""" + + +class SexValue(enum.StrEnum): + MALE = "male" + FEMALE = "female" + + +class SexOrigin(enum.StrEnum): + AUTOMATIC = "auto" + SAMPLESHEET = "samplesheet" + CONFIG = "config" + + +class Sex(SnappyModel): + source: SexOrigin = SexOrigin.AUTOMATIC + default: SexValue | None = None + + @model_validator(mode="after") + def ensure_default_value(self): + if self.source == SexOrigin.CONFIG and not self.default: + raise ValueError("Undefined default sex value in configuration file") + return self diff --git a/snappy_pipeline/models/purecn.py b/snappy_pipeline/models/purecn.py new file mode 100644 index 000000000..878083413 --- /dev/null +++ b/snappy_pipeline/models/purecn.py @@ -0,0 +1,293 @@ +import enum +from typing import Annotated + +from pydantic import Field, model_validator + +from snappy_pipeline.models import SnappyModel + + +# Parameters for each action & those shared between actions +# param_table = { +# "shared": { +# "genome": Enum, +# "seed": int, +# }, +# "IntervalFile": { +# "off_target": bool, +# "average_target_width": float, +# "min_target_width": float, +# "small_targets": Enum, +# "off_target_seqlevels": Enum, +# "min_mappability": list, +# "average_reptiming_width": float, +# }, +# "Coverage": { +# "keep_duplicates": bool, +# "remove_mapq0": bool, +# "skip_gc_norm": bool, +# }, +# "NormalDB": { +# "genomicsdb_af_field": str, +# "min_normals_position_specific_fit": float, +# }, +# "PureCN": { +# "sex": Enum, +# "min_af": float, +# "error": float, +# "base_quality_offset": int, +# "min_supporting_reads": int, +# "db_info_flag": str, +# "popaf_info_field": str, +# "cosmic_cnt_info_field": str, +# "min_cosmic_cnt": int, +# "interval_padding": int, +# "min_total_counts": int, +# "min_fraction_offtarget": float, +# "fun_segmentation": Enum, +# "alpha": float, +# "undo_sd": str, +# "changpoints_penalty": int, +# "additional_cmd_args": str, +# "max_segments": int, +# "min_logr_sdev": float, +# "min_purity": float, +# "max_purity": float, +# "min_ploidy": float, +# "max_ploidy": float, +# "max_copy_number": int, +# "post_optimize": bool, +# "bootstrap_n": int, +# "speedup_heuristics": int, +# "model_homozygous": bool, +# "model": Enum, +# 
"max_non_clonal": float, +# "max_homozygous_loss": list, +# }, +# } + + +class Genome(enum.StrEnum): + HG18 = "hg18" + HG19 = "hg19" + HG38 = "hg38" + MM9 = "mm9" + MM10 = "mm10" + RN4 = "rn4" + RN5 = "rn5" + RN6 = "rn6" + CANFAM3 = "canFam3" + + +class SmallTargets(enum.StrEnum): + RESIZE = "resize" + DROP = "drop" + + +class OffTargetSeqLevels(enum.StrEnum): + TARGETED = "targeted" + ALL = "all" + + +class FilterMethod(enum.StrEnum): + AMPDEL = "ampdel" + CN = "cn" + CI = "ci" + SEM = "sem" + + +class CallingMethod(enum.StrEnum): + THRESHOLD = "threshold" + CLONAL = "clonal" + NONE = "none" + + +class IntervalFile(SnappyModel): + off_target: bool = False + """Include off-target regions""" + average_target_width: int = 400 + """Split large targets to approximately that size""" + min_target_width: int = 100 + """Either resize or drop targets smaller than specified""" + small_targets: SmallTargets = SmallTargets.RESIZE + """Either 'resize' or 'drop' small targets""" + average_off_target_width: int = 200000 + """Bin off-target regions to approximately that size""" + off_target_seqlevels: OffTargetSeqLevels = OffTargetSeqLevels.TARGETED + """Controls how to deal with chromosomes/contigs not found in baits""" + mappability: Annotated[ + str, + Field( + examples=[ + "/data/cephfs-1/work/groups/cubi/projects/biotools/PureCN/hg38/GCA_000001405.15_GRCh38_no_alt_analysis_set_100.bw", + "/data/cephfs-1/work/groups/cubi/projects/biotools/PureCN/hg19/wgEncodeCrgMapabilityAlign75mer.bigWig", + ] + ), + ] = "" + """``rtracklayer``-parsable file with mappability scores in 1st metadata column""" + min_mappability: tuple[float, float, float] = (0.6, 0.1, 0.7) + """Minimum mappability for on-target, off-target and chrY regions""" + reptiming: Annotated[ + str, + Field( + examples=[ + "/data/cephfs-1/work/groups/cubi/projects/biotools/PureCN/hg19/wgEncodeUwRepliSeqK562WaveSignalRep1.bigWig", + "", + ] + ), + ] = "" + """``rtracklayer``-parsable file with replication timing scores in 1st metadata column""" + average_reptiming_width: int = 100000 + """Average the replication timing data into bins of the specified size""" + exclude: str | None = None + """File parsable by rtracklayer specifying baits that should be excluded from baits file""" + + +class Coverage(SnappyModel): + keep_duplicates: bool = False + """SCount reads marked as duplicates""" + remove_mapq0: bool = False + """Not count reads marked with mapping quality 0""" + skip_gc_norm: bool = False + """Skips GC-normalization""" + + +class NormalDB(SnappyModel): + genomicsdb_af_field: str = "AF" + """Info field name where the allelic fraction is stored""" + min_normals_position_specific_fit: float = 10.0 + """Only change if you know what you are doing""" + + +class PureCNBase(SnappyModel): + genome: Genome + """Genome version. One of hg18, hg19, hg38, mm9, mm10, rn4, rn5, rn6, canFam3""" + seed: int | None = None + """Seed for random number generator""" + + +class PureCNPon(PureCNBase): + intervals: IntervalFile = IntervalFile() + normaldb: NormalDB = NormalDB() + coverage: Coverage = Coverage() + + +class Variant(SnappyModel): + min_af: float = 0.03 + """minimum allelic fraction""" + snp_blacklist: str | None = None + """File parsable by rtracklayer that defines blacklisted regions""" + error: float = 0.001 + """Estimated default sequencing error rate for artifact filtering. 
Can be overriden by base quality scores""" + base_quality_offset: int = 1 + """Subtracts the specified value from the base quality score""" + min_supporting_reads: int | None = None + """Instead of calculating the min. number of supporting reads, use specified one""" + db_info_flag: str = "DB" + """VCF INFO flag indicating presence in common germline databases""" + popaf_info_field: str = "POP_AF" + """VCF INFO field providing population allele frequency""" + cosmic_cnt_info_field: str = "Cosmic.CNT" + """VCF INFO field providing counts in the Cosmic database""" + cosmic_vcf_file: str | None = None + """Adds a Cosmic.CNT INFO annotation using a Cosmic VCF. Added for convenience, we recommend adding annotations upstream""" + min_cosmic_cnt: int = 6 + """Min number of COSMIC hits""" + interval_padding: int = 50 + """Keep variants in the flanking region of specified size""" + + +class IntervalFilter(SnappyModel): + min_total_counts: int = 100 + """Keep only intervals with at least that many counts in both tumor and (tanget) normal""" + min_fraction_offtarget: float = 0.05 + """Ignore off-target internals when only the specified fraction of all intervals are off-target intervals""" + + +class SegmentationMethod(enum.StrEnum): + CBS = "CBS" + PSCBS = "PSCBS" + GATK4 = "GATK4" + HCLUST = "Hclust" + + +class Segmentation(SnappyModel): + enabled: bool = True + method: SegmentationMethod = SegmentationMethod.CBS + alpha: float = 0.005 + """Significance of breakpoints""" + undo_sd: str | None = None + """DNAcopy undo.SD argument. If None, tries to find a sensible default""" + changepoints_penalty: float | None = None + """GATK4 ModelSegments --number-of-changepoints-penalty-factor argument. If NULL, tries to find a sensible default""" + additional_cmd_args: str = "" + """Used in GATK4 segmentation function to add additional ModelSegments arguments""" + max_segments: int = 300 + """Flag noisy samples with many segments""" + min_logr_sdev: float = 0.15 + """Set minimum log-ratio standard deviation to this value. 
Useful when uncorrected biases exceed the log-ratio noise""" + + seg_file: str | None = None + """External segmentation file (from cnvkit, for example)""" + + @model_validator(mode="after") + def ensure_args_gatk4(self): + if self.changepoints_penalty or self.additional_cmd_args: + if self.method != SegmentationMethod.GATK4: + raise ValueError( + "Segmentation method 'GATK4' must be selected when parameters 'changepoints_penalty' or 'additional_cmd_args' are set" + ) + return self + + @model_validator(mode="after") + def ensure_segmentation(self): + if self.enabled and self.seg_file is not None: + raise ValueError("Segmentation cannot be enabled when a segmentation file is provided") + if not self.enabled and not self.seg_file: + raise ValueError("Segmentation must be either enabled or provided using 'seg_file'") + return self + + +class Model(enum.StrEnum): + BETA = "beta" + BETABIN = "betabin" + + +class PureCN(PureCNBase): + min_mapq: int = 0 + """Minimum mapping quality score (phred scale 0-60) to count a read for coverage depth.""" + min_purity: float = 0.15 + """Minimum considered purity""" + max_purity: float = 0.95 + """Maximum considered purity""" + min_ploidy: float = 1.4 + """Minimum considered ploidy""" + max_ploidy: float = 6.0 + """Maximum considered ploidy""" + max_copy_number: int = 7 + """Maximum allele-specific integer copy number""" + post_optimize: bool = False + """Post-optimization""" + bootstrap_n: int = 0 + """Number of bootstrap replicates""" + speedup_heuristics: float = 2.0 + """Tries to avoid spending computation time on unlikely local optima""" + homozygous_model: bool = False + """Model homozygous variants in very pure samples. Should be 'model_homozygous', but model_* doesn't play well with pydantic""" + fit_model: Model = Model.BETA + """Model used to fit variants. Either beta or betabin. Should be 'model', but model_* doesn't play well with pydantic""" + log_ratio_calibration: float = 0.1 + """Parameter defining the extent to which log-ratios might be miscalibrated""" + max_non_clonal: float = 0.2 + """Maximum genomic fraction assigned to a subclonal copy number state""" + max_homozygous_loss: tuple[float, float] = (0.05, 10000000.0) + """Maximum genomic fraction assigned to a complete loss and maximum size of a loss in bp""" + + log_ratio_file: str | None = None + """External log2 copy number ratio file""" + + # TODO: allow PureCN to merge all tumors from the same donor + additional_tumors: list[str] = [] + """Tumor coverages from additional biopsies from the SAME patient, GC-normalized""" + + interval_filter: IntervalFilter = IntervalFilter() + segmentation: Segmentation = Segmentation() diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py index 7397391ef..84eba37a8 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py @@ -2,13 +2,12 @@ """Implementation of the ``somatic_cnv_calling`` step This step allows for the detection of CNV events for cancer samples from targeted sequenced (e.g., -exomes or large panels) or whole genome sequencing. +exomes or large panels) or whole genome sequencing. Panel sequencing is not implemented yet; it may come in a later release. The wrapped tools start from the aligned reads (thus off ``ngs_mapping``) and generate CNV calls for somatic variants. The wrapped tools implement different strategies. 
diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
index 7397391ef..84eba37a8 100644
--- a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
+++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
@@ -2,13 +2,12 @@
 """Implementation of the ``somatic_cnv_calling`` step
 
 This step allows for the detection of CNV events for cancer samples from targeted sequenced (e.g.,
-exomes or large panels) or whole genome sequencing.
+exomes or large panels) or whole genome sequencing. Panel sequencing is not implemented yet; it might be added in a later release.
 
 The wrapped tools start from the aligned reads (thus off ``ngs_mapping``) and generate CNV
 calls for somatic variants.
 
 The wrapped tools implement different strategies. Some work "reference free" and just use the
 somatic BAM files for their input, some work in "matched cancer normal mode" and need the cancer
-and normal BAM files, others again cancer BAM files, and additionally a
-set of non-cancer BAM files for their background (the panel of normals).
+and normal BAM files, and finally others use the cancer BAM files together with a set of
+non-cancer BAM files as background (the panel of normals).
 
 Some tools may also use germline & somatic variants to estimate allele-specific copy number changes,
 and resolve loss-of-heterozygocity. In this case, the small variants need to be computed separately
 from the ``somatic_variant_calling`` step.
@@ -25,45 +24,53 @@
 
 Tools that use panel of normals can obtain their input in two different ways:
 
-- A static file, from another cohort or from public datasets.
+- A static file or files, from another cohort or from public datasets.
   In this case, the user is responsible to make sure that the data & methods used to create the panel
   are compatible to the cohort's.
 - The ``panel_of_normals`` step.
-  The panel will be created if necessary, using the same conditions that for the cohort (genome release, exome kit assignment, ...)
+  The panel will be created if necessary, using data from normal samples from the cohort.
 
-When requested, the optional germline and somatic small variant calls are created using a modified version of the ``somatic_variant_calling`` step.
-The ``somatic__cnv_calling`` step generates the small variants (TODO: how exactly) and stores them (TODO: where exactly).
+When requested, the optional germline and somatic small variant calls are created in the ``somatic_variant_calling`` step.
+Once again, it is the responsibility of the user to make sure that variants created in that way are suitable for CNV calling.
 
 Likewise, purity estimations can be automatically computed by the ``somatic__cnv_calling`` step,
-to supplement or replace the estimations that may be provided in the samplesheet.
+when estimations are not provided in the samplesheet or configuration file.
 
 ===========
 Step Output
 ===========
 
 TODO: The whole section of output needs revision. Main question is: what is the best format to encode CNAs?
+``vcf`` is a possibility: the benefit is a (more or less) well-defined format, but the major drawback is
+that (as far as I know) most CNV analysis tools (from ``R`` in particular) don't recognize this format (for CNAs).
 
-There is no widely used standard to report copy number alterations.
-In absence of a better solution, all CNV tools implemented in somatic pipeline output the segmentation table loosely following the `DNAcopy format `_.`
-The copy number call may or may not be present, and the chromosome number is replaced by its name.
-The segmentation output is in file ``output/../out/.._dnacopy.seg``.
+Currently, the only implemented tool is ``cnvkit``. Therefore, the ``cnvkit`` output is left as produced by the software.
+
+------
+cnvkit
+------
+
+The structure of the output is:
+
+::
 
     output/
     +-- bwa.cnvkit.P001-N1-DNA1-WES1
     |   |-- out
-    |   |   |-- bwa.cnvkitP001-N1-DNA1-WES1_dnacopy.seg
+    |   |   |-- bwa.cnvkit.P001-N1-DNA1-WES1.
     [...]
 
-Note that tool ``cnvetti`` doesn't follow the snappy convention above:
-the tool name is followed by an underscore & the action, where the action is one of ``coverage``, ``segment`` and ``postprocess``.
-For example, the output directory would contain a directory named ``bwa.cnvetti_coverage.P002-T1-DNA1-WES1``.
+There are 4 main outputs:
 
-.. note:: Tool-Specific Output
+- The ratios (extension ``.cnr``) contain the ratio between the tumor coverage and the expected coverage from the reference
+  in each bin, on a logarithmic scale.
+  This can be used to examine the data or experiment with different segmentation algorithms.
+- The segments (extension ``.segments.cns``) contain the output of the segmentation. A single log2 ratio value is
+  attributed to each segment.
+  The segmentation covers most of the genome accessible to mapping.
+- The calls (extension ``.calls.cns``) contain only the non-diploid segments, called after thresholding.
+- The bin-level tests (extension ``.bintest.cns``) contain the results of differential coverage tests by bin.
+  Only significant tests are listed.
 
-   Each tool produces its own set of outputs, generally not in standard format.
-   Some of these files are linked from ``work`` to ``output``, but not necessarily all of them.
-   Some tools (for example ``cnvkit``) also produces a report, with tables and figures.
+Reports & plots are also available at the user's request; they are found in the ``report`` and ``plot`` sub-directories.
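All four outputs are plain tab-separated tables, so they can be inspected or post-processed outside of the pipeline. A minimal sketch (the file name is hypothetical, and the standard ``cnvkit`` column layout with ``chromosome``, ``start``, ``end`` & ``log2`` columns is assumed): ::

    import pandas as pd

    # Per-bin log2 ratios as written by `cnvkit.py fix` (path is hypothetical)
    ratios = pd.read_csv("bwa.cnvkit.P001-T1-DNA1-WES1.cnr", sep="\t")

    # Bins whose log2 ratio is compatible with a single-copy gain in a
    # diploid genome: log2(3/2) is approximately 0.585
    gains = ratios[ratios["log2"] > 0.4]
    print(gains[["chromosome", "start", "end", "log2"]].head())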
+
 
 =====================
@@ -72,27 +79,24 @@
 
 The default configuration is as follows.
 
-.. include:: DEFAULT_CONFIG_somatic_targeted_seq_cnv_calling.rst
+.. include:: DEFAULT_CONFIG_somatic_cnv_calling.rst
 
 =====================================
 Available Somatic Targeted CNV Caller
 =====================================
 
 - ``cnvkit`` (for both WGS & WES)
-- ``sequenza`` (only WES)
-- ``purecn`` (only WES)
-- ``Control-FREEC`` (only WGS - this tools might not be supported)
 
-================================
-Logic of the step for ``cnvkit``
-================================
+===========================================
+Description of the step for ``cnvkit`` tool
+===========================================
 
 --------
 Overview
 --------
 
 ``cnvkit`` was designed to call CNV on whole exome data. It has the concept of _targets_ (the regions enriched by the exome kit),
-and the _antitargets_ (those regions outside of enrichment).
+and the _antitargets_ (those regions accessible for CNV calling, but outside of enrichment).
 The coverage of _targets_ and _antitargets_ are expected to be very different,
 but there is still information to be gained in the _antitarget_ regions,
 albeit at a much lower resolution than for _target_ regions.
@@ -100,67 +104,93 @@
 ``cnvkit`` was later used with some success on whole genome data.
 WGS data was defined as _target_ regions covering the whole genome, with empty _antitarget_ regions.
 
-------------------------
-Sample-independent files
-------------------------
-
-``cvnkit`` allows the user to define _accessible_ regions (_via_ the ``access`` bed file).
-This excludes repeats, low complexity or PAR regions, that cannot be properly mapped, and therefore used for CNV calling.
-
-For exome data, the _target_ regions are supposed to be well curated, so they are not affected by the _access_ regions.
-The _antitarget_ regions, however, are only defined within _accessible_ regions.
-For WGS data, the _antitarget_ regions are empty, and the _target_ regions are set to the _accessible_ regions, when present.
-Even in the absence of user-defined _accessible_ regions, the _target_ and _antitarget_ regions will not contain long ``N`` sequences.
-
-Finally, the pipeline builds separates ``bed`` files for _target_ and _antitarget_ regions, for each exome kit present in the cohort,
-and for WGS data if there is any.
-
----------
-Reference
----------
-
-The ``cnvkit`` authors recommend to use a panel of normals to normalize the coverage over bins.
-This is usually created by running the ``panel_of_normals`` step.
-The ``somatic_cnv_calling`` step will create a reference (panel of normals) if requested.
-Otherwise, it is possible to use references created for different cohorts, but the user
-must ensure that the data & methods used for the current cohort and to create the reference are compatible.
-In particular, the exome enrichment kit must be identical, and the sex of the donors should be
-similar (not to use a female-only reference for a male cohort, for example).
-
-If there are not enough normal samples to create such a reference, the corresponding normal sample
-can be used, in a normal/tumor pair setting similar to the somatic small variant calling situation.
+.. tip:: For Agilent kits, the ``cnvkit`` authors recommend using the baits as targets. The baits are in the ``*_Covered.bed`` file.
+
+---------------------------------
+Regions accessible to CNV calling
+---------------------------------
+
+``cnvkit`` needs to know about the regions of the genome accessible to CNV calling.
+Typically, regions masked with ``N`` are excluded, but a user may also want to exclude
+repeats, segmental duplications, or low complexity regions.
+
+There are multiple ways to get this information:
+
+1. The user can provide a ``bed`` file detailing accessible regions, using the ``path_access`` option in the ``access`` part of the configuration.
+2. The user can specifically exclude regions using the ``exclude`` option in the ``access`` part of the configuration.
+   In this case, the pipeline will create an accessible regions file from the whole genome and the excluded parts.
+3. Otherwise, the pipeline creates the accessible regions file from the reference genome sequence, by removing
+   only parts masked with ``N``.
+
+-----------------------
+The reference coverage
+-----------------------
+
+``cnvkit`` builds a reference coverage to compensate for locus-specific effects when assessing coverage changes in tumor samples.
+Because this reference is best constructed from multiple normal samples, the pipeline implements it under the
+``panel_of_normals`` section.
+The pipeline offers 4 different ways to build this reference:
+
+1. ``cohort``: the reference is taken from the pipeline's ``panel_of_normals`` step.
+   If it isn't there, it will be created, according to its configuration (which must be present).
+   The ``cnvkit`` authors suggest that `10 to 20 normal samples `_
+   are sufficient to build a good reference. When there are more samples, the selected ones should be taken from those with average file size.
+2. ``file``: the reference is taken from another panel of normals, possibly from another cohort or from public data.
+   Besides the reference coverage itself, the target and antitarget bed files must also be provided.
+   Note that it is the user's responsibility to make sure that the panel of normals is suitable for the cohort.
+3. ``paired``: the reference is built only from one normal sample, paired with the tumor.
+   It is _not_ recommended by the ``cnvkit`` authors, but can be beneficial in some circumstances
+   (for example experimental designs with treated cell lines).
+4. ``flat``: a _flat_ reference is computed, which discards locus-specific effects.
+   It should only be used when neither normals nor a suitable panel are available, or for benchmarking purposes.
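Internally, these four choices collapse into a pair of booleans that drive the rest of the step (the ``_set_cnvkit_pipeline_logic`` method further down sets them); a simplified sketch of that decision, with a hypothetical ``PonSource`` enum standing in for ``PanelOfNormalsOrigin``: ::

    import enum

    class PonSource(enum.StrEnum):  # hypothetical stand-in for PanelOfNormalsOrigin
        COHORT = "cohort"
        FILE = "file"
        PAIRED = "paired"
        FLAT = "flat"

    def reference_logic(pon_source: PonSource) -> dict[str, bool]:
        # A reference must be built locally for the paired & flat choices,
        # and is taken from an existing panel of normals otherwise.
        paired = pon_source == PonSource.PAIRED
        return {"paired": paired, "build_ref": paired or pon_source == PonSource.FLAT}

    assert reference_logic(PonSource.COHORT) == {"paired": False, "build_ref": False}
    assert reference_logic(PonSource.PAIRED) == {"paired": True, "build_ref": True}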
+
+------------------
+WGS-specific notes
+------------------
+
+In WGS mode, the _antitarget_ regions (defined as accessible regions without target regions) are empty.
+
+But the _target_ regions need to be carefully selected: the size of the bins used to identify CNAs must be
+chosen so that the discovery of focal events remains possible.
+When the reference is taken from the current cohort, or from another panel of normals, the target regions
+are taken from that panel of normals, and no other consideration is necessary.
+Otherwise, the average bin size must either be provided by the user (configuration option ``avg_size`` in the ``target`` section),
+or computed from the available data.
+
+In the latter case, the algorithm is as follows (see the sketch below):
+
+- When no normal sample data is available, then ``cnvkit``'s default value is used.
+- Otherwise, the value is computed by ``cnvkit``'s ``autobin`` module.
+  It is run in ``wgs`` mode if the access regions cover the complete genome
+  (no access file provided, no exclude files, and the ``min_gap_size`` parameter not set),
+  else it is run in ``amplicon`` mode.
+  Note that the latter should only be used when user-defined accessible regions are quite restricted
+  (limited to protein-coding exons devoid of repeats or low complexity regions, for example).
+
+-----------------------------
+Other notes on implementation
+-----------------------------
+
+.. note:: CNA calling on panel data is not implemented yet, even though ``cnvkit`` allows it in principle.
+
+.. note:: The current pipeline tries to replicate the behaviour of the ``batch`` module of ``cnvkit``,
+    while keeping the flexibility to diverge from it.
+    In particular, the possibility of obtaining the reference coverage from the paired normal is implemented.
+
+.. note:: The current implementation doesn't allow mixing multiple exome enrichment kits.
+    Future versions will hopefully lift this restriction. However, mixing WES, WGS & possibly panel data is
+    more challenging, and is not on the roadmap for future improvements.
 
-In case no normals are available at all, a flat prior can be used.
-
-------------
-Calling CNVs
-------------
-
-The _target_ and _antitarget_ ``bed`` files created in the earlier sub-steps are used as input,
-based on the exome kit (or WGS status).
-
-The coverage is computed for the tumor sample, and normalised using the reference.
-As seen previously, the reference can be either exome kit-based, or sample-specific.
-
-The normalised coverage is the segmented, and copy numbers are called, optionally using
-small variants and/or purity estimates.
-
-If B-allele fractions are used, the pipeline will create the small variants, only for samples
-with a corresponding normal.
-If purity is used, the user can choose to override the values in the sample sheet (when present)
-with the output of the tool of her choice.
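To make the bin-size selection concrete, here is a small sketch of the mode decision described in the WGS-specific notes above (a simplified, hypothetical helper; in the step itself the flags are derived from the ``access`` part of the configuration): ::

    def autobin_method(path_access: str | None, exclude: list[str], min_gap_size: int | None) -> str:
        # `wgs` mode is only appropriate when the accessible regions are
        # effectively the whole genome: no user-provided access file,
        # nothing excluded, and no gap filtering.
        plain_access = not path_access and not exclude and min_gap_size is None
        return "wgs" if plain_access else "amplicon"

    assert autobin_method(None, [], None) == "wgs"
    assert autobin_method("access.bed", [], None) == "amplicon"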
""" import os import os.path import re -from copy import deepcopy -from enum import Enum -from typing import Callable, Iterator, Iterable, NamedTuple, Any +from typing import Callable, NamedTuple, Any -from biomedsheets.models import BioEntity, BioSample, TestSample, NGSLibrary -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background -from biomedsheets.io_tsv.base import LIBRARY_TYPES, LIBRARY_TO_EXTRACTION, EXTRACTION_TYPE_DNA +from biomedsheets.models import NGSLibrary +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions +from biomedsheets.io_tsv.base import LIBRARY_TO_EXTRACTION, EXTRACTION_TYPE_DNA from snakemake.io import OutputFiles, Wildcards, InputFiles from snappy_pipeline.utils import dictify @@ -172,37 +202,18 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from snappy_pipeline.models.common import Sex, SexOrigin, SexValue from snappy_pipeline.models.cnvkit import SegmentationMethod as CnvkitSegmentationMethod from .model import SomaticCnvCalling as SomaticCnvCallingConfigModel from .model import Cnvkit as CnvkitConfig -from .model import Sex, SexOrigin, SexValue, PanelOfNormalsOrigin, PurityOrigin, VariantOrigin +from .model import PanelOfNormalsOrigin, PurityOrigin, VariantOrigin __author__ = "Eric Blanc " #: Default configuration for the somatic_targeted_seq_cnv_calling step DEFAULT_CONFIG = SomaticCnvCallingConfigModel.default_config_yaml_string() -#: JSON key for "isCancer" -KEY_IS_CANCER = "isCancer" - -#: Value for "libraryType" is whole exome sequencing -VALUE_WES = "WES" - -#: Value for "libraryType" is panel sequencing -VALUE_PANEL = "Panel-seq" - -#: Values for targeted sequencing -VALUES_TARGETED_SEQ = (VALUE_WES, VALUE_PANEL) - -#: Standard key/extension values for BCF files -BCF_KEY_EXTS = ( - ("bcf", ".bcf"), - ("bcf_md5", ".bcf.md5"), - ("bcf_csi", ".bcf.csi"), - ("bcf_csi_md5", ".bcf.csi.md5"), -) - class SomaticCnvCallingStepPart(BaseStepPart): """Shared code for all caller classes in somatic_targeted_seq_cnv_calling""" @@ -210,19 +221,6 @@ class SomaticCnvCallingStepPart(BaseStepPart): def __init__(self, parent: "SomaticCnvCallingWorkflow"): super().__init__(parent) - @staticmethod - @dictify - def _get_log_file_from_prefix(prefix: str) -> Iterator[dict[str, str]]: - key_ext = ( - ("log", ".log"), - ("sh", ".sh"), - ("conda_info", ".conda_info.txt"), - ("conda_list", ".conda_list.txt"), - ) - for key, ext in key_ext: - yield key, prefix + ext - yield key + "_md5", prefix + ext + ".md5" - class CnvKitStepPart(SomaticCnvCallingStepPart): """Perform somatic targeted CNV calling using cnvkit""" @@ -273,9 +271,7 @@ def __init__(self, parent: SomaticCnvCallingStepPart): self.tumors = {x.library.name: x for x in self.parent.tumors[self.libraryKit]} self.cfg: CnvkitConfig = self.config.get(self.name) - self.pon_source = ( - self.cfg.panel_of_normals.source if self.cfg.panel_of_normals.enabled else None - ) + self.pon_source = self.cfg.panel_of_normals.source self._set_cnvkit_pipeline_logic() @@ -311,16 +307,16 @@ def _set_cnvkit_pipeline_logic(self): Flat: based on targets & antitargets only Cohort: from panel_of_normals step File: from another cohort or public data (reference + target + antitarget [WES only]) - Paired (panel of normal disabled): reference built from the target & antitarget coverage of one normal sample only (paired with the tumor) + Paired: reference built from the target & antitarget coverage of one normal sample only (paired with the tumor) Therefore, a 
reference must be created for flat & paired choices (one reference per normal sample in the latter case). The logic to create the reference is (panel of normal is pon): - - access created if path_access is missing or average target size estimated + - access created if path_access is missing or average target size must be estimated - average target size estimated if value not in config and dataset is WGS - target created always - antitarget created when dataset is WES """ - self.paired = not self.cfg.panel_of_normals.enabled + self.paired = self.pon_source == PanelOfNormalsOrigin.PAIRED self.build_ref = self.paired or self.pon_source == PanelOfNormalsOrigin.FLAT self.compute_avg_target_size = ( self.is_wgs and self.paired and self.cfg.target.avg_size is None @@ -420,7 +416,7 @@ def get_output_files(self, action: str): "dataframe": self.base_out_lib + "rds", } case "call": - output_files = {"calls": self.base_out_lib + "cns"} + output_files = {"calls": self.base_out_lib + "calls.cns"} case "bintest": output_files = {"tests": self.base_out_lib + "bintest.cns"} case "metrics": @@ -502,7 +498,7 @@ def get_result_files(self, library_name: str, mapper: str) -> list[str]: result_files = [] - for suffix in ("cnr", "segments.cns", "cns", "bintest.cns"): + for suffix in ("cnr", "segments.cns", "calls.cns", "bintest.cns"): result_files.append(base_out_lib + suffix) actions_to_log = ("fix", "segment", "call", "bintest") @@ -1058,7 +1054,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) "libraryKit", ) - self.matched_normal = self._match_normals() + self.matched_normal = self._match_normals(self.valid_dna_libraries) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -1099,10 +1095,10 @@ def get_result_files(self) -> OutputFiles: return OutputFiles(fns) - def _match_normals(self): + def _match_normals(self, valid_dna_libraries: list[LibraryInfo]) -> dict[str, str]: normals = SomaticCnvCallingWorkflow._split_by( SomaticCnvCallingWorkflow._filter_by( - self.valid_dna_libraries.values(), "is_tumor", lambda x: not x + valid_dna_libraries.values(), "is_tumor", lambda x: not x ), "libraryKit", ) @@ -1121,11 +1117,14 @@ def _match_normals(self): len(normal) < 2 ), f"Muliple valid donor samples for tumor library {sample.library.name}" if normal: + assert ( + normal[0].sex == sample.sex + ), f"Normal & tumor samples {normal[0].library.name} & {sample.library.name} from donor {donor} have different sex" normal_library = normal[0].library matched_normal[sample.library.name] = normal_library.name return matched_normal - def _optionally_register_subworkflow(self, subworkflow): + def _optionally_register_subworkflow(self, subworkflow: str): for tool in set(self.config.tools.wgs + self.config.tools.wes): assert self.config.get(tool) is not None, f"Requested tool '{tool}' not configured" cfg = self.config.get(tool) @@ -1159,7 +1158,7 @@ def _get_dna_libraries(sheet) -> dict[str, LibraryInfo]: purity = bio_sample.extra_infos.get("purity", None) ploidy = bio_sample.extra_infos.get("ploidy", 2) else: - purity = None + purity = 0 ploidy = 2 for test_sample in bio_sample.test_samples.values(): if ( @@ -1189,9 +1188,9 @@ def _get_dna_libraries(sheet) -> dict[str, LibraryInfo]: is_tumor, libraryType, libraryKit, + sex, purity, ploidy, - sex, ) return valid_dna_libraries diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_cnv_calling/model.py index f86472ba4..2d3aef2d9 100644 --- 
a/snappy_pipeline/workflows/somatic_cnv_calling/model.py +++ b/snappy_pipeline/workflows/somatic_cnv_calling/model.py @@ -1,11 +1,15 @@ import enum import typing from typing import Annotated -from pydantic import Field, model_validator # , validator +from pydantic import Field, model_validator from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel from snappy_pipeline.models.cnvkit import Cnvkit as CnvkitGeneric -from snappy_pipeline.models.library_kit import LibraryKitEntry +from snappy_pipeline.models.purecn import IntervalFilter +from snappy_pipeline.models.purecn import Segmentation as PureCNSegmentation +from snappy_pipeline.models.purecn import PureCN as PureCNBase +from snappy_pipeline.models.purecn import Variant as PureCNVariantParams +from snappy_pipeline.models.common import LibraryKitEntry, Sex class WgsCaller(enum.StrEnum): @@ -33,28 +37,6 @@ class SequencingMethod(enum.StrEnum): WGS = "wgs" -class SexValue(enum.StrEnum): - MALE = "male" - FEMALE = "female" - - -class SexOrigin(enum.StrEnum): - AUTOMATIC = "auto" - SAMPLESHEET = "samplesheet" - CONFIG = "config" - - -class Sex(SnappyModel): - source: SexOrigin = SexOrigin.AUTOMATIC - default: SexValue | None = None - - @model_validator(mode="after") - def ensure_default_value(self): - if self.source == SexOrigin.CONFIG and not self.default: - raise ValueError("Undefined default sex value in configuration file") - return self - - class PanelOfNormalsOrigin(enum.StrEnum): COHORT = "cohort" """Use (& possibly create) a panel of normals from the current cohort of normals in the panel_of_normals step""" @@ -65,23 +47,19 @@ class PanelOfNormalsOrigin(enum.StrEnum): FLAT = "flat" """Use a flat panel of normal (no panel of normals, actually)""" + PAIRED = "paired" + """Use the paired normal as reference (no panel of normal, actually)""" + class PanelOfNormals(SnappyModel): enabled: bool = False """Use panel of normals during CNV calling""" - source: PanelOfNormalsOrigin = PanelOfNormalsOrigin.FILE - """Which type of panel of normals should be used""" - - path_panel_of_normals: str = "" - """ - Path to panel of normals. - - The panel of normals can be either a file (typically from another project, or from the software's own data), - or the path to the pipeline's ```panel_of _normals``` step, depending on the choice of source. + source: PanelOfNormalsOrigin = PanelOfNormalsOrigin.COHORT + """Which type of panel of normals should be used, cohort is generally recommended""" - Note that there is no test that the panel of normals is suitable for that cohort. - """ + path_panel_of_normals: str = "../panel_of_normals" + """Path to panel of normals (used for cohort & file sources)""" @model_validator(mode="after") def ensure_panel_of_normals_path(self): @@ -111,13 +89,22 @@ class Variant(SnappyModel): """Use variants (somatic &/or germline) to improve CNV calling""" source: VariantOrigin = VariantOrigin.FILE - """Where are the variants obrained from""" + """Where are the variants obtained from""" - path_somatic_variant_calling: str = "" + path_somatic_variant_calling: Annotated[ + str, + Field( + examples=[ + "../somatic_variant_calling", + "../somatic_variant_calling_for_CNV", + "/public_data/common_variants.vcf.gz", + ] + ), + ] = "" """ Path to the variants to use for CNV calling. 
- The path can be either to the ```somatic_variant_calling``` step in the pipeline, if "cohort" is selected, + The path can be either to the ``somatic_variant_calling`` step in the pipeline, if "cohort" is selected, or to the vcf file with the variants when "file" is selected as source. """ @@ -145,7 +132,11 @@ class ControlFreec(SnappyModel): pass -class PureCn(SnappyModel): +class VariantPureCN(Variant, PureCNVariantParams): + pass + + +class PureCN(PureCNBase): panel_of_normals: PanelOfNormals = PanelOfNormals() """ Panel of normals created by the NormalDB.R script. @@ -159,42 +150,16 @@ def restrict_pon_mode(self) -> typing.Self: return self path_target_interval_list_mapping: list[LibraryKitEntry] = [] - """ - Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect) - """ + """Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect)""" - somatic_variant_calling: Variant = Variant() - - mappability: str = "" - """ - GRCh38: - /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw - """ - - reptiming: str = "" - """Nothing for GRCh38""" + sample_sex: Sex = Sex() - seed: int = 1234567 - extra_commands: typing.Dict[str, typing.Any] = { - "model": "betabin", - "fun-segmentation": "PSCBS", - "post-optimize": "", - } - """Recommended extra arguments for PureCN, extra_commands: {} to clear them all""" + somatic_variant_calling: VariantPureCN = VariantPureCN() path_container: Annotated[ str, Field(examples=["../panel_of_normals/work/containers/out/purecn.simg"]) - ] - """Conda installation not working well, container is required""" - - path_intervals: Annotated[ - str, - Field( - examples=[ - "../panel_of_normals/output/purecn/out/_.list" - ] - ), - ] + ] = "" + """Conda installation not working well, container is required. 
When missing the container is downloaded""" class PurityTool(enum.StrEnum): @@ -240,6 +205,8 @@ def ensure_valid_params_for_source(self): class PanelOfNormalsCnvkit(PanelOfNormals): + enabled: bool = True + """Reset enabled value, cnvkit always needs a panel of normal (even flat or paired)""" path_targets: str | None = None """Path to target file (used only when pon is obtained from file, taken from pipeline step otherwise)""" path_antitargets: str | None = None @@ -247,7 +214,7 @@ class PanelOfNormalsCnvkit(PanelOfNormals): @model_validator(mode="after") def ensure_paths_target_antitarget(self): - if self.enabled and self.source == PanelOfNormalsOrigin.FILE: + if self.source == PanelOfNormalsOrigin.FILE: if self.path_targets is None or self.path_antitargets is None: raise ValueError( "When using a previous pon, target & antitarget files must be defined" @@ -294,6 +261,6 @@ class SomaticCnvCalling(SnappyStepModel): """Tools for WGS & WES data""" cnvkit: Cnvkit | None = None - purecn: PureCn | None = None + purecn: PureCN | None = None sequenza: Sequenza | None = None control_freec: ControlFreec | None = None diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py index ca0769c0b..e6bfbd2ae 100644 --- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py @@ -21,8 +21,8 @@ if "variants" in args: variants = r""" ---vcf {args[variants]} \ - --sample-id {args['sample-id']} --normal-id {args['normal-id']} \ - --min-variant-depth {args['min-variant-depth']} {zygocity_freq} + --sample-id {args[sample-id]} --normal-id {args[normal-id]} \ + --min-variant-depth {args[min-variant-depth]} {zygocity_freq} """.format( snakemake=snakemake, args=args, @@ -34,12 +34,12 @@ cmd = r""" cnvkit.py call \ -o {snakemake.output.calls} \ - --method {args['method']} {thresholds} \ + --method {args[method]} {thresholds} \ {filter} \ {center} {center_at} {drop_low_coverage} {sample_sex} {male_reference} \ {purity} {ploidy} \ {variants} \ - {args['segments']} + {args[segments]} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py index cef6277bd..303127160 100644 --- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py @@ -20,9 +20,9 @@ cmd = r""" cnvkit.py coverage --processes {snakemake.resources._cores} \ -o {snakemake.output.coverage} \ - --fasta {args['reference']} \ - --min-mapq {args[min_mapq]} {count} \ - {args['bam']} {args['intervals']} + --fasta {args[reference]} \ + --min-mapq {args[min-mapq]} {count} \ + {args[bam]} {args[intervals]} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py index 554a87222..a5b56c8e4 100644 --- a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py @@ -17,19 +17,25 @@ args = snakemake.params.get("args", {}) +# Fix requires empty antitarget file in WGS & Panel modes +create_dummy_antitarget = "" +if args.get("antitarget", "") == "": + args["antitarget"] = "$TMPDIR/antitarget.bed" + create_dummy_antitarget = f"touch {args['antitarget']} ; " + cmd = r""" cnvkit.py fix \ -o {snakemake.output.ratios} \ - {cluster} --sample-id {args['sample-id']} \ + {cluster} --sample-id {args[sample-id]} \ {no_gc} {no_edge} {no_rmask} \ - {args['target']} {antitarget} 
{args['reference']} + {args[target]} {args[antitarget]} {args[reference]} """.format( snakemake=snakemake, + args=args, cluster="--cluster" if args.get("cluster", False) else "", no_gc="--no-gc" if args.get("no-gc", False) else "", no_edge="--no-edge" if args.get("no-edge", False) else "", no_rmask="--no-rmask" if args.get("no-rmask", False) else "", - antitarget=f"{args['antitarget']}" if "antitarget" in args else "", ) -CnvkitWrapper(snakemake, cmd).run() +CnvkitWrapper(snakemake, create_dummy_antitarget + cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py index a386a2dec..c4b475f1b 100644 --- a/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py @@ -8,7 +8,7 @@ # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its # own conda environment which cannot see the snappy_pipeline installation. -base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")) sys.path.insert(0, base_dir) from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper @@ -18,8 +18,8 @@ if "variants" in args: variants = r""" ---vcf {args[variants]} \ - --sample-id {args['sample-id']} --normal-id {args['normal-id']} \ - --min-variant-depth {args['min-variant-depth']} {zygocity_freq} + --sample-id {args[sample-id]} --normal-id {args[normal-id]} \ + --min-variant-depth {args[min-variant-depth]} {zygocity_freq} """.format( snakemake=snakemake, args=args, @@ -31,14 +31,14 @@ cmd = r""" cnvkit.py scatter \ -o {snakemake.output.plot} \ - --segment {args['segments']} \ + --segment {args[segments]} \ {chromosome} {gene} {range_list} \ - --width {args['width']} \ - --antitarget-marker {args['antitarget-marker']} --segment-color {args['segment-color']} \ - {by_bin} {trend} --title {args['title']} \ + --width {args[width]} \ + --antitarget-marker {args[antitarget-marker]} --segment-color {args[segment-color]} \ + {by_bin} {trend} --title "{args[title]}" \ {y_min} {y_max} {fig_size} \ {variants} \ - {args['ratios']} + {args[ratios]} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py index 24a3f5a7e..a50a32e7e 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py @@ -7,7 +7,7 @@ # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its # own conda environment which cannot see the snappy_pipeline installation. 
-base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")) sys.path.insert(0, base_dir) from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper @@ -17,8 +17,8 @@ cmd = r""" cnvkit.py genemetrics \ -o {snakemake.output.report} \ - --segment {args['segments']} \ - --threshold {args['threshold']} --min-probes {args['min-probes']} \ + --segment {args[segments]} \ + --threshold {args[threshold]} --min-probes {args[min-probes]} \ {drop_low_coverage} {male_reference} {sample_sex} {diploid_parx_genome} \ {stats} \ {args[ratios]} diff --git a/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py index fe421a339..d4a2fdc92 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py @@ -7,23 +7,25 @@ # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its # own conda environment which cannot see the snappy_pipeline installation. -base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")) sys.path.insert(0, base_dir) from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper args = snakemake.params.get("args", {}) +# As segment files can appear multiple times, the `--segment` argument must be the last one cmd = r""" cnvkit.py metrics \ -o {snakemake.output.report} \ - --segment {args['segments']} \ {drop_low_coverage} \ - {args[ratios]} + {args[ratios]} \ + --segment {segments} """.format( snakemake=snakemake, args=args, drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", + segments=" ".join(args["segments"]), ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py index d569588c2..76c6831e0 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py @@ -7,7 +7,7 @@ # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its # own conda environment which cannot see the snappy_pipeline installation. 
-base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")) sys.path.insert(0, base_dir) from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper @@ -17,8 +17,8 @@ cmd = r""" cnvkit.py segmetrics \ -o {snakemake.output.report} \ - --segment {args['segments']} \ - --alpha {args['alpha']} --bootstrap {args['bootstrap']} {smooth_bootstrap} \ + --segment {args[segments]} \ + --alpha {args[alpha]} --bootstrap {args[bootstrap]} {smooth_bootstrap} \ {drop_low_coverage} \ {stats} \ {args[ratios]} diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py index 2c02aa03c..240cdfa28 100644 --- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py @@ -16,9 +16,9 @@ if "variants" in args: variants = r""" - ---vcf {args['variants']} \ - --sample-id {args['sample-id']} --normal-id {args['normal-id']} \ - {args['min-variant-depth']} {zygocity_freq} + ---vcf {args[variants]} \ + --sample-id {args[sample-id]} --normal-id {args[normal-id]} \ + {args[min-variant-depth]} {zygocity_freq} """.format( snakemake=snakemake, args=args, @@ -30,10 +30,10 @@ cmd = r""" cnvkit.py segment --processes {snakemake.resources._cores} \ -o {snakemake.output.segments} --dataframe {snakemake.output.dataframe} \ - --method {args['method']} --threshold {args['threshold']} {smooth_cbs} \ - {drop_low_coverage} --drop-outliers {args['drop-outliers']} \ + --method {args[method]} --threshold {args[threshold]} {smooth_cbs} \ + {drop_low_coverage} --drop-outliers {args[drop-outliers]} \ {variants} \ - {args[coverage]} + {args[ratios]} """.format( snakemake=snakemake, args=args, diff --git a/tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py similarity index 99% rename from tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py rename to tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py index de45fb4e1..fe954b333 100644 --- a/tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py @@ -56,7 +56,7 @@ def minimal_config(): cnvkit: diploid_parx_genome: GRCh38 panel_of_normals: - enabled: False + source: paired somatic_variant_calling: enabled: True source: cohort @@ -464,7 +464,7 @@ def test_cnvkit_step_parts_get_output_files(somatic_cnv_calling_workflow): "segments": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segments.cns", "dataframe": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.rds", }, - "call": {"calls": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns"}, + "call": {"calls": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.calls.cns"}, "bintest": {"tests": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.bintest.cns"}, "metrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.metrics.tsv"}, "genemetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.genemetrics.tsv"}, @@ -537,7 +537,7 @@ def test_somatic_cnv_calling_workflow(somatic_cnv_calling_workflow): tpl = "output/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" expected += [ 
tpl.format(mapper=mapper, library_name=library_name, ext=ext) - for ext in ("cnr", "segments.cns", "cns", "bintest.cns") + for ext in ("cnr", "segments.cns", "calls.cns", "bintest.cns") for library_name in tumor_libraries for mapper in ("bwa",) ] From 8bc46f8d320d38a7d0b4e0fd14040edba49b805c Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Thu, 28 Nov 2024 18:27:20 +0100 Subject: [PATCH 16/46] refactor: common functions to ignore contigs and for library kits --- snappy_pipeline/models/cnvkit.py | 8 +-- .../workflows/ngs_mapping/model.py | 2 +- snappy_wrappers/tools/genome_windows.py | 62 +++++++++++++++++++ tests/snappy_pipeline/workflows/conftest.py | 13 +++- 4 files changed, 79 insertions(+), 6 deletions(-) diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py index 837f8b984..8f9de72df 100644 --- a/snappy_pipeline/models/cnvkit.py +++ b/snappy_pipeline/models/cnvkit.py @@ -105,7 +105,7 @@ class Access(SnappyModel): parameter for WGS data, unless the accessible regions are much reduced (for example excluding all intergenic regions, repeats, low complexity, ...) """ - + exclude: list[str] = [] """Regions accessible to mapping""" min_gap_size: int | None = None @@ -115,7 +115,7 @@ class Access(SnappyModel): class Target(SnappyModel): split: bool = True """Split large tiled intervals into smaller, consecutive targets.""" - avg_size: float | None = None + avg_size: int | None = None """ Average size of split target bins (results are approximate). @@ -131,9 +131,9 @@ class Target(SnappyModel): class Antitarget(SnappyModel): - avg_size: float = 150000 + avg_size: int = 150000 """Average size of split antitarget bins (results are approximate)""" - min_size: float | None = None + min_size: int | None = None """Minimum size of antitarget bins (smaller regions are dropped). When missing, 1/16 avg size""" diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py index 2d3db5724..281d83c85 100644 --- a/snappy_pipeline/workflows/ngs_mapping/model.py +++ b/snappy_pipeline/workflows/ngs_mapping/model.py @@ -7,7 +7,7 @@ from pydantic import Field, field_validator, model_validator from snappy_pipeline.models import EnumField, SizeString, SnappyModel, SnappyStepModel -from snappy_pipeline.models.library_kit import LibraryKit +from snappy_pipeline.models.common import LibraryKit class DnaMapper(Enum): diff --git a/snappy_wrappers/tools/genome_windows.py b/snappy_wrappers/tools/genome_windows.py index 906d32dc7..2236d6c42 100644 --- a/snappy_wrappers/tools/genome_windows.py +++ b/snappy_wrappers/tools/genome_windows.py @@ -12,8 +12,12 @@ import csv import fnmatch import os +import re import sys +from pathlib import Path + + # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its # own conda environment which cannot see the snappy_pipeline installation. 
@@ -32,6 +36,12 @@
 #: Allowed values for ``--format``
 CHOICES_FORMAT = ("regions", "bed")
 
+#: Regular expression patterns to parse *.fai, *.genome, *.dict & fasta files
+PATTERN_FAI = re.compile(r"^([^\s]+)\t([0-9]+)\t([0-9]+)\t([0-9]+)\t([0-9]+)\s*$")
+PATTERN_GENOME = re.compile(r"^([^\s]+)\t([0-9]+)\s*$")
+PATTERN_DICT = re.compile(r"^@SQ\tSN:([^\s]+)\tLN:([0-9]+).*$")
+PATTERN_FASTA = re.compile(r"^\s*>\s*([^\s]+).*$")
+
 
 def matches_any(query, patterns):
     for pattern in patterns:
@@ -78,6 +88,58 @@ def yield_regions(fai_file, window_size, subtract_end=0, ignore_chroms=None, pad
             begin = end
 
 
+def ignore_chroms(path_ref: str, ignored: set[str] = [], return_ignored: bool = False):
+    path_ref = Path(path_ref).resolve()
+    if Path(str(path_ref) + ".fai").exists():
+        contigs = _parse_index(Path(str(path_ref) + ".fai"), PATTERN_FAI)
+    elif Path(str(path_ref) + ".genome").exists():
+        contigs = _parse_index(Path(str(path_ref) + ".genome"), PATTERN_GENOME)
+    elif path_ref.with_suffix(".dict").exists():
+        contigs = _parse_index(path_ref.with_suffix(".dict"), PATTERN_DICT, True)
+    else:
+        contigs = _read_fasta(path_ref)
+    for contig_name, contig_length in contigs:
+        m = matches_any(contig_name, ignored)
+        if (m and return_ignored) or (not m and not return_ignored):
+            yield contig_name, contig_length
+
+
+def _parse_index(filename: Path, pattern: re.Pattern, allow_mismatch: bool = False):
+    with open(filename, "rt") as f:
+        for line in f:
+            line = line.strip()
+            if len(line) == 0 or line.startswith("#"):
+                continue
+            m = pattern.match(line)
+            if m:
+                groups = m.groups()
+                yield groups[0], int(groups[1])
+            else:
+                if not allow_mismatch:
+                    raise ValueError(f"Unexpected record '{line}' in reference file '{filename}'")
+
+
+def _read_fasta(filename: Path):
+    contig_name = None
+    contig_length = None
+    with open(filename, "rt") as f:
+        for line in f:
+            line = line.strip()
+            if len(line) == 0 or line.startswith("#"):
+                continue
+            m = PATTERN_FASTA.match(line)
+            if m:
+                if contig_name:
+                    yield contig_name, contig_length
+                groups = m.groups()
+                contig_name = groups[0]
+                contig_length = 0
+            else:
+                contig_length += len(line)
+    assert contig_name is not None, f"No contig found in reference file {filename}"
+    yield contig_name, contig_length
+
+
 def run(args):
     """Main entry point after parsing command line arguments"""
     yielded = 0
diff --git a/tests/snappy_pipeline/workflows/conftest.py b/tests/snappy_pipeline/workflows/conftest.py
index 4e7d9de5c..dabd97704 100644
--- a/tests/snappy_pipeline/workflows/conftest.py
+++ b/tests/snappy_pipeline/workflows/conftest.py
@@ -907,7 +907,7 @@ def cancer_sheet_fake_fs_path_link_in(fake_fs, cancer_sheet_tsv):
 
 
 @pytest.fixture
-def autobin_result_fake_fs(fake_fs, cancer_sheet_tsv):
+def autobin_result_calling_fake_fs(fake_fs, cancer_sheet_tsv):
     """Return fake file autobin.txt"""
     # Create work directory
     fake_fs.fs.makedirs("/work", exist_ok=True)
@@ -923,6 +923,17 @@ def autobin_result_calling_fake_fs(fake_fs, cancer_sheet_tsv):
     return fake_fs
 
 
+@pytest.fixture
+def autobin_result_pon_fake_fs(fake_fs, cancer_sheet_tsv):
+    """Return fake file autobin.txt"""
+    # Create work directory
+    fake_fs.fs.makedirs("/work", exist_ok=True)
+    # Create autobin result for the samples
+    tpl = "/{mapper}.cnvkit/out/{mapper}.cnvkit.autobin.txt"
+    fake_fs.fs.create_file(tpl.format(mapper="bwa"), create_missing_dirs=True)
+    return fake_fs
+
+
 @pytest.fixture
 def purity_result_fake_fs(fake_fs, cancer_sheet_tsv):
     """Return fake file purity.txt"""
From fac4cba1c69b7fac315004dc99818b7038591a18 Mon Sep 17
00:00:00 2001 From: Eric Blanc Date: Thu, 28 Nov 2024 18:30:31 +0100 Subject: [PATCH 17/46] feat: improved logic for cnvkit panel of normals & somatic cnv calling, with updated wrappers --- .../workflows/panel_of_normals/Snakefile | 52 +- .../workflows/panel_of_normals/__init__.py | 669 +++++++++--------- .../workflows/panel_of_normals/model.py | 133 +--- .../workflows/somatic_cnv_calling/__init__.py | 52 +- .../workflows/somatic_cnv_calling/model.py | 8 +- .../wrappers/cnvkit/access/wrapper.py | 15 +- .../wrappers/cnvkit/antitarget/wrapper.py | 4 +- .../wrappers/cnvkit/autobin/wrapper.py | 4 +- .../wrappers/cnvkit/call/wrapper.py | 3 +- .../wrappers/cnvkit/reference/wrapper.py | 6 +- .../cnvkit/report/genemetrics/wrapper.py | 4 +- .../wrappers/cnvkit/sex/environment.yaml | 1 + .../wrappers/cnvkit/sex/wrapper.py | 33 + .../wrappers/cnvkit/target/wrapper.py | 4 +- .../test_workflows_panel_of_normals.py | 575 +++++---------- .../test_workflows_somatic_cnv_calling.py | 23 +- 16 files changed, 670 insertions(+), 916 deletions(-) create mode 120000 snappy_wrappers/wrappers/cnvkit/sex/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/sex/wrapper.py diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index 923431644..e311589c3 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -101,30 +101,10 @@ rule panel_of_normals_mutect2_create_panel: # Panel of normals (cnvkit) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Write out access file (if required, must be run prior to the cnvkit panel of normals) - - -rule panel_of_normals_access_run: - output: - **wf.get_output_files("access", "run"), - resources: - time=wf.get_resource("access", "run", "time"), - memory=wf.get_resource("access", "run", "memory"), - partition=wf.get_resource("access", "run", "partition"), - log: - **wf.get_log_file("access", "run"), - params: - **wf.get_args("access", "run"), - wrapper: - wf.wrapper_path("cnvkit/access") - - # Write out the normals-only results for the normals -------------------------- rule panel_of_normals_cnvkit_access: - input: - unpack(wf.get_input_files("cnvkit", "access")), output: **wf.get_output_files("cnvkit", "access"), threads: wf.get_resource("cnvkit", "access", "threads") @@ -233,22 +213,22 @@ rule panel_of_normals_cnvkit_create_panel: wf.wrapper_path("cnvkit/reference") -# rule panel_of_normals_cnvkit_report: -# input: -# unpack(wf.get_input_files("cnvkit", "report")), -# output: -# **wf.get_output_files("cnvkit", "report"), -# threads: wf.get_resource("cnvkit", "report", "threads") -# resources: -# time=wf.get_resource("cnvkit", "report", "time"), -# memory=wf.get_resource("cnvkit", "report", "memory"), -# partition=wf.get_resource("cnvkit", "report", "partition"), -# log: -# **wf.get_log_file("cnvkit", "report"), -# params: -# **{"args": wf.get_args("cnvkit", "report")}, -# wrapper: -# wf.wrapper_path("cnvkit/report") +rule panel_of_normals_cnvkit_sex: + input: + unpack(wf.get_input_files("cnvkit", "sex")), + output: + **wf.get_output_files("cnvkit", "sex"), + threads: wf.get_resource("cnvkit", "sex", "threads") + resources: + time=wf.get_resource("cnvkit", "sex", "time"), + memory=wf.get_resource("cnvkit", "sex", "memory"), + partition=wf.get_resource("cnvkit", "sex", "partition"), + log: + **wf.get_log_file("cnvkit", "sex"), + params: + **{"args": wf.get_args("cnvkit", "sex")}, + wrapper: + 
wf.wrapper_path("cnvkit/sex")


# Panel of normals (PureCN) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py
index 240643323..b919fd480 100644
--- a/snappy_pipeline/workflows/panel_of_normals/__init__.py
+++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py
@@ -169,9 +169,12 @@
 """
 
+import re
+
 from enum import StrEnum
 
 from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions
+from snakemake.io import Wildcards, InputFiles
 
 from snappy_pipeline.utils import dictify, listify
 from snappy_pipeline.workflows.abstract import (
@@ -182,12 +185,16 @@
 )
 from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow
 
+from snappy_pipeline.models.common import SexOrigin, SexValue
+
 from .model import PanelOfNormals as PanelOfNormalsConfigModel
+from .model import PureCn as PureCnConfig
+from .model import CnvKit as CnvkitConfig
 
 __author__ = "Manuel Holtgrewe "
 
 #: Names of the tools that might use panel of normals
-TOOLS = ("mutect2", "cnvkit", "access", "purecn")
+TOOLS = ("mutect2", "cnvkit", "purecn")
 
 #: Default configuration for the somatic_variant_calling schema
 DEFAULT_CONFIG = PanelOfNormalsConfigModel.default_config_yaml_string()
@@ -214,18 +221,26 @@ def __init__(self, parent):
         super().__init__(parent)
         # Build shortcut from cancer bio sample name to matched cancer sample
         known_libraries = self._get_normal_libraries()
-        self.normal_libraries = list(known_libraries.keys())
+        self.normal_libraries = known_libraries
         if self.name and (cfg := self.config.get(self.name)):
             if path := cfg.get("path_normals_list"):
-                self.normal_libraries = []
+                self.normal_libraries = {}
                 with open(path, "rt") as f:
                     for line in f:
                         if line.startswith("#"):
                             continue
-                        self.normal_libraries.append(line.strip())
-        self.libraryType, self.libraryKit = self._validate_normal_libraries(known_libraries)
-
-    def _get_normal_libraries(self):
+                        library_name = line.strip()
+                        assert (
+                            library_name in known_libraries.keys()
+                        ), f"Unknown requested library {library_name}"
+                        self.normal_libraries[library_name] = known_libraries[library_name]
+        self.libraryType, self.libraryKit = self._validate_normal_libraries()
+
+        self.ignored = []
+        if len(self.config.get("ignore_chroms", [])) > 0:
+            self.ignored += self.config.ignore_chroms
+
+    def _get_normal_libraries(self) -> dict[str, dict[str, str]]:
         normal_libraries = {}
         for sheet in self.parent.shortcut_sheets:
             for donor in sheet.donors:
@@ -239,34 +254,36 @@ def __init__(self, parent):
                 normal_libraries[library.name] = self._get_extra_info(library)
         return normal_libraries
 
-    def _validate_normal_libraries(self, known_libraries):
+    def _validate_normal_libraries(self) -> tuple[str, str]:
+        libraries = self.normal_libraries
         libraryType = None
         libraryKit = None
-        for library in self.normal_libraries:
-            assert (
-                library in known_libraries
-            ), f"Unknown normal library {library} requested to build pon"
+        for library in libraries:
             assert (
-                libraryType is None or libraryType == known_libraries[library]["libraryType"]
+                libraryType is None or libraryType == libraries[library]["libraryType"]
             ), "Panel of normal cannot be built from multiple library types"
-            libraryType = known_libraries[library]["libraryType"]
+            libraryType = libraries[library]["libraryType"]
             if libraryType == LibraryType.WES:
                 assert (
-                    libraryKit is None or libraryKit == known_libraries[library]["libraryKit"]
+                    libraryKit is None or libraryKit ==
libraries[library]["libraryKit"] ), "Panel of normal cannot be built from multiple library kits" - libraryKit = known_libraries[library]["libraryKit"] + libraryKit = libraries[library]["libraryKit"] return (libraryType, libraryKit) @staticmethod - def _get_extra_info(library): + def _get_extra_info(library) -> dict[str, str]: extra_info = {} assert "libraryType" in library.extra_infos, f"Undefined type of library {library.name}" - extra_info["libraryType"] = library.extra_infos.get("libraryType", "Illumina") + assert ( + library.extra_infos.get("libraryType") in LibraryType + ), f"Unknown library type {library.extra_infos.get('libraryType')}" + extra_info["libraryType"] = library.extra_infos.get("libraryType") if extra_info["libraryType"] == LibraryType.WES: assert ( "libraryKit" in library.extra_infos ), f"Undefined exome kit for library {library.name}" extra_info["libraryKit"] = library.extra_infos.get("libraryKit", "__default__") + extra_info["sex"] = library.parent.parent.parent.extra_infos.get("sex", None) return extra_info @staticmethod @@ -292,26 +309,10 @@ class PureCnStepPart(PanelOfNormalsStepPart): #: Resources resource_usage = { - "install": ResourceUsage( - threads=1, - time="01:00:00", - memory="24G", - ), - "prepare": ResourceUsage( - threads=1, - time="04:00:00", # 4 hours - memory="24G", - ), - "coverage": ResourceUsage( - threads=1, - time="04:00:00", # 4 hours - memory="24G", - ), - "create_panel": ResourceUsage( - threads=1, - time="12:00:00", # 12 hours - memory="32G", - ), + "install": ResourceUsage(threads=1, time="01:00:00", memory="24G"), + "prepare": ResourceUsage(threads=1, time="04:00:00", memory="24G"), + "coverage": ResourceUsage(threads=1, time="04:00:00", memory="24G"), + "create_panel": ResourceUsage(threads=1, time="12:00:00", memory="32G"), } def get_input_files(self, action): @@ -330,7 +331,7 @@ def _get_input_files_coverage(self, wildcards): yield ( "intervals", "work/purecn/out/{}_{}.list".format( - self.config.purecn.enrichment_kit_name, + self.config.purecn.path_target_interval_list_mapping[0].name, self.config.purecn.genome_name, ), ) @@ -345,7 +346,7 @@ def _get_input_files_create(self, wildcards): "normals", [ tpl.format(mapper=wildcards.mapper, library_name=lib) - for lib in self.normal_libraries + for lib in self.normal_libraries.keys() ], ) @@ -358,7 +359,7 @@ def get_output_files(self, action): return {"container": "work/containers/out/purecn.simg"} if action == "prepare": base_out = "{}_{}".format( - self.config.purecn.enrichment_kit_name, + self.config.purecn.path_target_interval_list_mapping[0].name, self.config.purecn.genome_name, ) return { @@ -391,7 +392,7 @@ def get_log_file(self, action): tpls = { "install": "work/containers/log/purecn", "prepare": "work/purecn/log/{}_{}".format( - self.config.purecn.enrichment_kit_name, + self.config.purecn.path_target_interval_list_mapping[0].name, self.config.purecn.genome_name, ), "coverage": "work/{mapper}.purecn/log/{mapper}.purecn.{library_name,.+-DNA[0-9]+-WES[0-9]+}", @@ -412,16 +413,8 @@ class Mutect2StepPart(PanelOfNormalsStepPart): #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). 
resource_usage = { - "prepare_panel": ResourceUsage( - threads=2, - time="3-00:00:00", # 3 days - memory="8G", - ), - "create_panel": ResourceUsage( - threads=2, - time="48:00:00", # 48 hours - memory="30G", - ), + "prepare_panel": ResourceUsage(threads=2, time="3-00:00:00", memory="8G"), + "create_panel": ResourceUsage(threads=2, time="48:00:00", memory="30G"), } def check_config(self): @@ -454,7 +447,7 @@ def _get_input_files_create_panel(self, wildcards): """Helper wrapper function for merging individual results & panel creation""" paths = [] tpl = "work/{mapper}.{tool}/out/{mapper}.{tool}.{normal_library}.prepare.vcf.gz" - for normal in self.normal_libraries: + for normal in self.normal_libraries.keys(): paths.append(tpl.format(normal_library=normal, tool=self.name, **wildcards)) return {"normals": paths} @@ -503,318 +496,347 @@ class CnvkitStepPart(PanelOfNormalsStepPart): "antitarget", "coverage", "create_panel", - "report", + "sex", ) - #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). - resource_usage = { - "target": ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="8G", - ), - "antitarget": ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="8G", - ), - "coverage": ResourceUsage( - threads=8, - time="02:00:00", # 2 hours - memory="16G", - ), - "create_panel": ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="16G", - ), - "report": ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="16G", - ), - } + # Overwrite defaults + default_resource_usage = ResourceUsage(threads=1, time="03:59:59", memory="7680M") # 4h + resource_usage = {"coverage": ResourceUsage(threads=8, time="11:59:59", memory="7680M")} def __init__(self, parent): super().__init__(parent) - def check_config(self): - if self.name not in self.config.tools: - return # cnvkit not enabled, skip - self.parent.ensure_w_config( - ("static_data_config", "reference", "path"), - "Path to reference FASTA not configured but required for %s" % (self.name,), + if self.name in self.config.tools: + self.is_wgs = self.libraryType == LibraryType.WGS + self.is_wes = self.libraryType == LibraryType.WES + + self.cfg: CnvkitConfig = self.config.get(self.name) + + self.ignored += self.cfg.ignore_chroms + self.ignored = set(self.ignored) + + self._set_cnvkit_pipeline_logic() + + self.path_baits = self._get_path_baits() + + self.base_out = "work/{mapper}.cnvkit/out/{mapper}.cnvkit." + self.base_out_lib = ( + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}." 
+ ) + + def _set_cnvkit_pipeline_logic(self): + """ + Creates instance variables to choose path in cnvkit pipeline + + Access: regions accessible for CNV calling (unmasked) + path_access or when missing build from genome reference + optional list of excluded region + + Target: regions of good coverage + From baits (WES) or accessible regions (WGS) + estimate of target size from config or autobin step + + Antitarget: regions of low coverage + antitarget = access - target, only WES, otherwise empty + + Reference: + Flat: based on targets & antitargets only + Cohort: from panel_of_normals step + File: from another cohort or public data (reference + target + antitarget [WES only]) + Paired: reference built from the target & antitarget coverage of one normal sample only (paired with the tumor) + """ + self.compute_avg_target_size = self.is_wgs and self.cfg.target.avg_size is None + self.create_access = not self.cfg.path_access + self.plain_access = ( + not self.cfg.path_access + and len(self.cfg.access.exclude) == 0 + and self.cfg.access.min_gap_size is None ) + def _get_cohort_sex(self) -> SexValue | None: + match self.cfg.sample_sex.source: + case SexOrigin.CONFIG: + return self.cfg.sample_sex.default + case SexOrigin.AUTOMATIC: + return None + case SexOrigin.SAMPLESHEET: + sex = None + for library, extra_info in self.normal_libraries.items(): + if extra_info.get("sex", None) is None: + assert sex is None, f"Sex of library {library} not defined in samplesheet" + else: + if sex is None: + sex = SexValue(extra_info.get("sex")) + else: + assert sex == SexValue( + extra_info.get("sex") + ), "Multiple sex in the cohort, use 'auto' in sex source" + return sex + + def _get_path_baits(self) -> str | None: + if not self.is_wes: + return None + default = None + for item in self.cfg.path_target_interval_list_mapping: + if item.name == self.libraryKit: + return item.path + elif item.name == "__default__": + default = item.path + if default is None: + raise ValueError(f"Missing library kit definition for {self.libraryKit}") + return default + def get_input_files(self, action): """Return input files for cnvkit panel of normals creation""" # Validate action self._validate_action(action) - mapping = { - "access": self._get_input_files_access, - "autobin": self._get_input_files_autobin, - "target": self._get_input_files_target, - "antitarget": self._get_input_files_antitarget, - "coverage": self._get_input_files_coverage, - "create_panel": self._get_input_files_create_panel, - } - return mapping[action] + return getattr(self, "_get_input_files_{}".format(action.replace("/", "_"))) def get_args(self, action): - """Return panel of normal files""" - if action == "access": - return self._get_args_access - elif action == "autobin": - return self._get_args_autobin - elif action == "target": - return self._get_args_target - elif action == "antitarget": - return self._get_args_antitarget - elif action == "coverage": - return self._get_args_coverage - elif action == "create_panel": - return self._get_args_create_panel - else: - self._validate_action(action) + """Return parameters input function, dependent on rule""" + # Validate action + self._validate_action(action) + return getattr(self, "_get_args_{}".format(action.replace("/", "_"))) + @dictify def get_output_files(self, action): - """Return panel of normal files""" - output_files = None - if action == "access": - output_files = self._get_output_files_access() - elif action == "autobin": - output_files = self._get_output_files_autobin() - elif action == "target": 
- output_files = self._get_output_files_target() - elif action == "antitarget": - output_files = self._get_output_files_antitarget() - elif action == "coverage": - output_files = self._get_output_files_coverage() - elif action == "create_panel": - output_files = self._get_output_files_create_panel() + """Return panel of normal output files""" + self._validate_action(action) + output_files = {} + match action: + case "access": + output_files = {"access": self.base_out + "access.bed"} + case "autobin": + output_files = {"result": self.base_out + "autobin.txt"} + case "target": + output_files = {"target": self.base_out + "target.bed"} + case "antitarget": + output_files = {"antitarget": self.base_out + "antitarget.bed"} + case "coverage": + output_files = {"coverage": self.base_out_lib + "{region,(target|antitarget)}.cnn"} + case "create_panel": + output_files = {"reference": self.base_out + "panel_of_normals.cnn"} + case "sex": + output_files = {"sex": self.base_out + "sex.tsv"} + + for k, v in output_files.items(): + yield k, v + yield k + "_md5", v + ".md5" + + @dictify + def get_log_file(self, action): + """Return panel of normal log files""" + # Validate action + self._validate_action(action) + + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit." + base_log_lib = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}." + if action in ("access", "autobin", "target", "antitarget", "create_panel", "sex"): + tpl = base_log + action + elif action in ("coverage",): + tpl = base_log_lib + "{region,(target|antitarget)}" else: - self._validate_action(action) - return dict( - zip( - list(output_files.keys()) + [k + "_md5" for k in output_files.keys()], - list(output_files.values()) + [v + ".md5" for v in output_files.values()], - ) - ) + raise ValueError(f"Logs of action '{action}' not implemented yet") - @classmethod - def get_log_file(cls, action): - """Return panel of normal files""" - tpls = { - "access": "work/{mapper}.cnvkit/log/cnvkit.access", - "autobin": "work/{mapper}.cnvkit/log/cnvkit.autobin", - "target": "work/{mapper}.cnvkit/log/cnvkit.target", - "antitarget": "work/{mapper}.cnvkit/log/cnvkit.antitarget", - "coverage": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage", - "create_panel": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals", - } - assert action in cls.actions - return cls._get_log_file(tpls[action], has_sh=True) + for key, ext in ( + ("conda_list", ".conda_list.txt"), + ("conda_info", ".conda_info.txt"), + ("log", ".log"), + ("sh", ".sh"), + ): + yield key, tpl + ext + yield key + "_md5", tpl + ext + ".md5" + + @listify + def get_result_files(self) -> list[str]: + if self.name not in self.config.tools: + return [] + + result_files = [] + + result_files += list(self.get_output_files("create_panel").values()) + result_files += list(self.get_log_file("create_panel").values()) + + result_files += list(self.get_output_files("target").values()) + result_files += list(self.get_log_file("target").values()) + + if self.libraryType == LibraryType.WES: + result_files += list(self.get_output_files("antitarget").values()) + result_files += list(self.get_log_file("antitarget").values()) + + result_files += list(self.get_output_files("sex").values()) + result_files += list(self.get_log_file("sex").values()) - def _get_input_files_access(self, wildcards): + return filter(lambda x: not x.endswith(".md5"), result_files) + + def _get_input_files_access(self, wildcards: Wildcards) -> dict[str, str]: + assert self.create_access, 
"Access shouldn't be created, already available" return {} - def _get_args_access(self, wildcards): - return { + def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + assert self.create_access, "Access shouldn't be created, already available" + return dict(input) | { "reference": self.w_config.static_data_config.reference.path, - "min_gap_size": self.config.cnvkit.min_gap_size, + "min-gap-size": self.cfg.access.min_gap_size, + "exclude": self.cfg.access.exclude, + "ignore_chroms": list(self.ignored), } - def _get_output_files_access(self): - return {"access": "work/{mapper}.cnvkit/out/cnvkit.access.bed"} - - def _get_input_files_autobin(self, wildcards): + def _get_input_files_autobin(self, wildcards: Wildcards) -> dict[str, str]: assert ( - self.libraryType == LibraryType.WGS + self.compute_avg_target_size ), "Trying to estimate average target size for non-WGS samples" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" - return { + input_files = { "bams": [ ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) - for x in self.normal_libraries - ], - "access": "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards), + for x in self.normal_libraries.keys() + ] } + if self.create_access: + if self.plain_access: + input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + else: + input_files["target"] = self.base_out.format(**wildcards) + "access.bed" + return input_files - def _get_args_autobin(self, wildcards): + def _get_args_autobin(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: assert ( - self.libraryType == LibraryType.WGS + self.compute_avg_target_size ), "Trying to estimate average target size for non-WGS samples" - return {"method": "wgs", "bp_per_bin": 50000} - - def _get_output_files_autobin(self): - return {"result": "work/{mapper}.cnvkit/out/cnvkit.autobin.txt"} + args = dict(input) | {"bp-per-bin": 50000} + if self.plain_access: + args["method"] = "wgs" + else: + args["method"] = "amplicon" + if "target" not in args: + args["target"] = self.cfg.path_access + return args - def _get_input_files_target(self, wildcards): + def _get_input_files_target(self, wildcards: Wildcards) -> dict[str, str]: """Helper wrapper function to estimate target average size in wgs mode""" input_files = {} - if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": - input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) - if self.config.cnvkit.get("target_avg_size", None) is None: - input_files["avg_size"] = "work/{mapper}.cnvkit/out/cnvkit.autobin.txt".format( - **wildcards - ) + if self.is_wgs: + if self.create_access: + input_files["interval"] = self.base_out.format(**wildcards) + "access.bed" + if self.compute_avg_target_size: + input_files["avg-size"] = self.base_out.format(**wildcards) + "autobin.txt" return input_files - def _get_args_target(self, wildcards): - params = {} - if self.name in self.config.tools: - if self.libraryType == LibraryType.WES: - params["target"] = self.config.cnvkit.path_target_regions - if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") != "": - params["target"] = self.config.cnvkit.get("access") - if self.w_config.static_data_config.get("features", None): - params["annotate"] = self.w_config.static_data_config.features.path - if self.config.cnvkit.get("split", True): - params["split"] = True - if 
self.config.cnvkit.get("target_avg_size", None): - params["avg_size"] = self.config.cnvkit.get("target_avg_size") - return params - - def _get_output_files_target(self): - return {"target": "work/{mapper}.cnvkit/out/cnvkit.target.bed"} - - def _get_input_files_antitarget(self, wildcards): - """Helper wrapper function for computing antitarget locations""" - if self.libraryType == LibraryType.WGS: - return {} - return { - "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed".format(**wildcards), - } - - def _get_args_antitarget(self, wildcards): - params = {} - if self.name in self.config.tools: - params = { - "avg_size": self.config.cnvkit.antitarget_avg_size, - "min_size": self.config.cnvkit.min_size, + def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + if self.libraryType == LibraryType.WES: + args = { + "avg-size": self.cfg.target.avg_size, + "split": self.cfg.target.split, + "interval": self.path_baits, } - if self.config.cnvkit.get("access", "") != "": - params["access"] = self.config.cnvkit.get("access") - return params + else: + assert self.is_wgs, "Panel not implemented yet" + args = dict(input) | {"split": self.cfg.target.split} + if args.get("avg-size", None) is not None: + args["avg-size"] = self._read_autobin_output(args["avg-size"]) + elif self.cfg.target.avg_size is not None: + args["avg-size"] = self.cfg.target.avg_size + else: + args["avg-size"] = 5000 + if self.w_config.static_data_config.get("features", None): + args["annotate"] = self.w_config.static_data_config.features.path + args["short-names"] = self.cfg.target.short_names + return args + + def _get_input_files_antitarget(self, wildcards: Wildcards) -> dict[str, str]: + input_files = {"target": self.base_out.format(**wildcards) + "target.bed"} + if self.create_access: + input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + return input_files - def _get_output_files_antitarget(self): - return {"antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed"} + def _get_args_antitarget(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "avg-size": self.cfg.antitarget.avg_size, + "min-size": self.cfg.antitarget.min_size, + } + if "access" not in args: + args["access"] = self.cfg.path_access + return args - def _get_input_files_coverage(self, wildcards): + def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: """Helper wrapper function for computing coverage""" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" + tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}.bam" bam = ngs_mapping(tpl.format(**wildcards)) return { - "intervals": "work/{mapper}.cnvkit/out/cnvkit.{interval}.bed".format(**wildcards), + "intervals": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{region}.bed".format( + **wildcards + ), "bam": bam, "bai": bam + ".bai", } - def _get_args_coverage(self, wildcards): - params = {} - if self.name in self.config.tools: - params = { - "reference": self.w_config.static_data_config.reference.path, - "min_mapq": self.config.cnvkit.min_mapq, - } - if self.config.cnvkit.get("count", False): - params["count"] = True - return params - - def _get_output_files_coverage(self): - return { - "coverage": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn", + def _get_args_coverage(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + return dict(input) | { + "reference": 
self.w_config.static_data_config.reference.path, + "min-mapq": self.cfg.coverage.min_mapq, + "count": self.cfg.coverage.count, } - def _get_input_files_create_panel(self, wildcards): - tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn" + def _get_input_files_create_panel(self, wildcards: Wildcards) -> dict[str, str]: + tpl = self.base_out_lib + "target.cnn" targets = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries + tpl.format(mapper=wildcards["mapper"], library_name=x) + for x in self.normal_libraries.keys() ] if self.libraryType == LibraryType.WES: - tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" + tpl = self.base_out_lib + "antitarget.cnn" antitargets = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x) - for x in self.normal_libraries + tpl.format(mapper=wildcards["mapper"], library_name=x) + for x in self.normal_libraries.keys() ] else: antitargets = [] return {"normals": targets + antitargets} - def _get_args_create_panel(self, wildcards): - params = {} - if self.name in self.config.tools: - params = { - "reference": self.w_config.static_data_config.reference.path, - } - if self.config.cnvkit.get("cluster", False): - params["cluster"] = True - params["min_cluster_size"] = self.config.cnvkit.min_cluster_size - if self.config.cnvkit.get("sample_sex"): - params["sample_sex"] = self.config.cnvkit.sample_sex - if self.config.cnvkit.get("male_reference", False): - params["male_reference"] = True - if self.config.cnvkit.get("diploid_parx_genome", None): - params["diploid_parx_genome"] = self.config.cnvkit.get("diploid_parx_genome") - if not self.config.cnvkit.get("gc_correction", True): - params["no_gc"] = True - if not self.config.cnvkit.get("rmask_correction", True): - params["no_rmask"] = True - if self.config.cnvkit.get("edge_correction", None) is None: - if self.libraryType != LibraryType.WES: - params["no_edge"] = True - elif not self.config.cnvkit.get("edge_correction"): - params["no_edge"] = True - return params - - def _get_output_files_create_panel(self): - return {"panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn"} - - -class AccessStepPart(PanelOfNormalsStepPart): - """Utility to create access file for cnvkit""" - - name = "access" - actions = ("run",) - - def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: - # Validate action - self._validate_action(action) - return ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="8G", - ) - - def get_input_files(self, action): - # Validate action - self._validate_action(action) - return None - - def get_output_files(self, action): - # Validate action - self._validate_action(action) - tpl = "work/access/out/access.bed" - return {"access": tpl, "access_md5": tpl + ".md5"} + def _get_args_create_panel(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "reference": self.w_config.static_data_config.reference.path, + "cluster": self.cfg.cluster, + "no-gc": not self.cfg.gc, + "no-rmask": not self.cfg.rmask, + "no-edge": not self.cfg.get("edge", self.is_wes), + "diploid-parx-genome": self.cfg.diploid_parx_genome, + } + if self.cfg.cluster: + args["min-cluster-size"] = self.cfg.min_cluster_size + sample_sex = self._get_cohort_sex() + if sample_sex is not None: + args["sample-sex"] = str(sample_sex) + args["male-reference"] = self.cfg.male_reference + return args + + def _get_input_files_sex(self, wildcards: Wildcards) -> 
dict[str, str]: + tpl = self.base_out_lib + "target.cnn" + coverages = [ + tpl.format(mapper=wildcards["mapper"], library_name=x) + for x in self.normal_libraries.keys() + ] + if self.is_wes: + tpl = self.base_out_lib + "antitarget.cnn" + coverages += [ + tpl.format(mapper=wildcards["mapper"], library_name=x) + for x in self.normal_libraries.keys() + ] + return {"coverages": coverages} - def get_args(self, action): - # Validate action - self._validate_action(action) - if self.name in self.config.tools: - return { - "reference": self.w_config.static_data_config.reference.path, - "min_gap_size": self.config.access.min_gap_size, - "exclude": self.config.access.exclude, - } - return {} + def _get_args_sex(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + return dict(input) | {"diploid-parx-genome": self.cfg.diploid_parx_genome} - @classmethod - def get_log_file(cls, action): - """Return log files""" - assert action in cls.actions - return cls._get_log_file("work/access/log/access", has_sh=True) + def _read_autobin_output(self, filename: str) -> int: + nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)" + pattern = re.compile("^Target:[ \t]+" + nb + "[ \t]+" + nb + "$") + with open(filename) as f: + for line in f: + m = pattern.match(line) + if m: + return int(float(m.groups()[4])) + return -1 class PanelOfNormalsWorkflow(BaseStep): @@ -852,7 +874,6 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ( Mutect2StepPart, CnvkitStepPart, - AccessStepPart, PureCnStepPart, LinkOutStepPart, ) @@ -879,39 +900,10 @@ def get_result_files(self): result_files.extend(self._expand_result_files(tpl, log_ext_list)) if "cnvkit" in set(self.config.tools) & set(TOOLS): - tpls = [ - ("output/{mapper}.cnvkit/out/cnvkit.target.{ext}", ("bed",)), - ("output/{mapper}.cnvkit/out/cnvkit.antitarget.{ext}", ("bed",)), - ( - "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}", - ("cnn",), - ), - # ( - # "output/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.{ext}", - # ("tsv", "tsv.md5"), - # ), - # ( - # "output/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.{ext}", - # ("tsv", "tsv.md5"), - # ), - ] - for tpl, ext_list in tpls: - result_files.extend(self._expand_result_files(tpl, ext_list)) - tpls = [ - "output/{mapper}.cnvkit/log/cnvkit.target.{ext}", - "output/{mapper}.cnvkit/log/cnvkit.antitarget.{ext}", - "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.{ext}", - ] - for tpl in tpls: - result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh"])) - # tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{ext}" - # result_files.extend(self._expand_result_files(tpl, ("", ".md5"))) - - if "access" in set(self.config.tools) & set(TOOLS): - tpl = "output/access/out/access.bed" - result_files.extend([tpl + md5 for md5 in ("", ".md5")]) - tpl = "output/access/log/access.{ext}" - result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh"])) + cnvkit_files = self.sub_steps["cnvkit"].get_result_files() + for work in cnvkit_files: + output = work.replace("work/", "output/", 1) + result_files.extend(self._expand_result_files(output, ("",))) if "purecn" in set(self.config.tools) & set(TOOLS): tpl = "output/{mapper}.purecn/out/{mapper}.purecn.panel_of_normals.{ext}" @@ -923,13 +915,14 @@ def get_result_files(self): tpl = "output/{mapper}.purecn/log/{mapper}.purecn.panel_of_normals.{ext}" result_files.extend(self._expand_result_files(tpl, log_ext_list)) tpl = "output/purecn/out/{}_{}.{{ext}}".format( - 
self.config.purecn.enrichment_kit_name, + # TODO: select enrichment kit + self.config.purecn.path_target_interval_list_mapping[0].name, self.config.purecn.genome_name, ) ext_list = ("list", "bed.gz", "bed.gz.tbi") result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/purecn/log/{}_{}.{{ext}}".format( - self.config.purecn.enrichment_kit_name, + self.config.purecn.path_target_interval_list_mapping[0].name, self.config.purecn.genome_name, ) result_files.extend(self._expand_result_files(tpl, log_ext_list)) diff --git a/snappy_pipeline/workflows/panel_of_normals/model.py b/snappy_pipeline/workflows/panel_of_normals/model.py index 802b5e417..59684d40d 100644 --- a/snappy_pipeline/workflows/panel_of_normals/model.py +++ b/snappy_pipeline/workflows/panel_of_normals/model.py @@ -3,7 +3,10 @@ from pydantic import Field -from snappy_pipeline.models import EnumField, KeepTmpdir, SnappyModel, SnappyStepModel, validators +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, Parallel, validators +from snappy_pipeline.models.common import LibraryKitEntry, Sex +from snappy_pipeline.models.cnvkit import CnvkitToReference as CnvkitGeneric +from snappy_pipeline.models.mutect2 import Mutect2 as Mutect2Generic class Tool(enum.StrEnum): @@ -13,122 +16,32 @@ class Tool(enum.StrEnum): access = "access" -class Mutect2(SnappyModel): +class Mutect2(Parallel, Mutect2Generic): path_normals_list: str = "" - - germline_resource: str + """Optional file listing libraries to include in panel""" java_options: str = " -Xmx16g" + """Optional java run-time options""" - num_cores: int = 2 - """number of cores to use locally""" - - window_length: int = 100000000 - """split input into windows of this size, each triggers a job""" - - num_jobs: int = 500 - """number of windows to process in parallel""" - - use_profile: bool = True - """use Snakemake profile for parallel processing""" - - restart_times: int = 5 - """number of times to re-launch jobs in case of failure""" - - max_jobs_per_second: int = 2 - """throttling of job creation""" - - max_status_checks_per_second: int = 10 - """throttling of status checks""" - - debug_trunc_tokens: int = 0 - """truncation to first N tokens (0 for none)""" - - keep_tmpdir: KeepTmpdir = KeepTmpdir.never - """keep temporary directory, {always, never, onerror}""" - job_mult_memory: float = 1 - """memory multiplier""" - - job_mult_time: float = 1 - """running time multiplier""" - - merge_mult_memory: float = 1 - """memory multiplier for merging""" - - merge_mult_time: float = 1 - """running time multiplier for merging""" - - -class CnvkitSex(enum.StrEnum): - MALE = "male" - FEMALE = "female" - - -class CnvKit(SnappyModel): - path_normals_list: str = "" +class CnvKit(CnvkitGeneric): + path_normals_list: str | None = None """Optional file listing libraries to include in panel""" - path_target_regions: str = "" - """Bed files of targetted regions (Missing when creating a panel of normals for WGS data)""" - - access: str = "" - """Access bed file (output/cnvkit.access/out/cnvkit.access.bed when create_cnvkit_acces was run)""" - - min_gap_size: int = 5000 - """[access] Minimum gap size between accessible regions""" - - target_avg_size: int | None = None - """[target] Average size of split target bins (None: use default value, or use autobin for wgs)""" - - split: bool = True - """[target] Split large intervals into smaller ones""" - - bp_per_bin: int = 50000 - """[autobin] Expected base per bin""" - - antitarget_avg_size: int = 0 - """[antitarget] Average 
size of antitarget bins (0: use default value)""" - - min_size: int = 0 - """[antitarget] Min size of antitarget bins (0: use default value)""" - - min_mapq: int = 0 - """[coverage] Mininum mapping quality score to count a read for coverage depth""" - - count: bool = False - """[coverage] Alternative couting algorithm""" - - min_cluster_size: int = 0 - """[reference] Minimum cluster size to keep in reference profiles. 0 for no clustering""" - - sample_sex: CnvkitSex | None = None - """[reference] Specify the chromosomal sex of all given samples as male or female. Guess when missing""" - - male_reference: bool = False - """[reference & sex] Create male reference""" - - gc_correction: bool = True - """[reference] Use GC correction""" - - edge_correction: bool | None = None - """[reference] Use edge correction (automatic when None, edge correction for WES only)""" - - rmask_correction: bool = True - """[reference] Use rmask correction""" - - drop_low_coverage: bool = False - """[metrics] Drop very-low-coverage bins before calculations""" - + path_target_interval_list_mapping: list[LibraryKitEntry] = [] + """ + Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect), + recommended by PureCN author + """ -class Access(SnappyModel): - """Creates access file for cnvkit, based on genomic sequence & excluded regions (optionally)""" + sample_sex: Sex = Sex() + """Sets the sex of all normals used in the panel""" - exclude: list[str] = [] - """[access] Bed file of regions to exclude (mappability, blacklisted, ...)""" + path_access: str | None = None + """Overrides access when not None""" - min_gap_size: int = 0 - """[access] Minimum gap size between accessible sequence regions (0: use default value)""" + ignore_chroms: list[str] = [] + """Additional contigs to ignore""" class GenomeName(enum.StrEnum): @@ -147,7 +60,7 @@ class PureCn(SnappyModel): path_normals_list: str = "" """Optional file listing libraries to include in panel""" - path_bait_regions: str + # targets_definition: list[LibraryKitEntry] = [] """ Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect), recommended by PureCN author @@ -161,7 +74,7 @@ class PureCn(SnappyModel): EnumField(GenomeName, json_schema_extra={"options": {"unknown"}}), ] = "unknown" - enrichment_kit_name: str = "unknown" + path_target_interval_list_mapping: list[LibraryKitEntry] = [] """For filename only...""" mappability: str = "" @@ -212,6 +125,4 @@ class PanelOfNormals(SnappyStepModel, validators.ToolsMixin): cnvkit: CnvKit | None = None - access: Access | None = None - purecn: PureCn | None = None diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py index 84eba37a8..1e5a4d097 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py @@ -221,6 +221,10 @@ class SomaticCnvCallingStepPart(BaseStepPart): def __init__(self, parent: "SomaticCnvCallingWorkflow"): super().__init__(parent) + self.ignored = [] + if len(self.config.get("ignore_chroms", [])) > 0: + self.ignored += self.config.ignore_chroms + class CnvKitStepPart(SomaticCnvCallingStepPart): """Perform somatic targeted CNV calling using cnvkit""" @@ -248,6 +252,10 @@ class CnvKitStepPart(SomaticCnvCallingStepPart): # Overwrite defaults default_resource_usage = ResourceUsage(threads=1, time="03:59:59", memory="7680M") # 4h + resource_usage = { + "coverage": ResourceUsage(threads=8, time="11:59:59", memory="7680M"), + 
"segment": ResourceUsage(threads=8, time="11:59:59", memory="7680M"), + } def __init__(self, parent: SomaticCnvCallingStepPart): super().__init__(parent) @@ -273,6 +281,9 @@ def __init__(self, parent: SomaticCnvCallingStepPart): self.cfg: CnvkitConfig = self.config.get(self.name) self.pon_source = self.cfg.panel_of_normals.source + self.ignored += self.cfg.ignore_chroms + self.ignored = set(self.ignored) + self._set_cnvkit_pipeline_logic() self.path_baits = self._get_path_baits() @@ -285,7 +296,7 @@ def __init__(self, parent: SomaticCnvCallingStepPart): [x.purity is None for x in self.tumors.values()] ), "Missing purity value from samplesheet" - self.base_out = "work/{mapper}.cnvkit/out/cnvkit." + self.base_out = "work/{mapper}.cnvkit/out/{mapper}.cnvkit." self.base_out_lib = ( "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}." ) @@ -440,7 +451,7 @@ def get_log_file(self, action): # Validate action self._validate_action(action) - base_log = "work/{mapper}.cnvkit/log/cnvkit." + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit." base_log_lib = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}." if action in ("access", "antitarget"): @@ -484,6 +495,9 @@ def get_log_file(self, action): def get_result_files(self, library_name: str, mapper: str) -> list[str]: """Files to symlink to output""" + if not (self.is_wes or self.is_wgs): + return [] + base_out_lib = ( "output/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}." ).format(mapper=mapper, library_name=library_name) @@ -554,6 +568,7 @@ def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, "reference": self.w_config.static_data_config.reference.path, "min-gap-size": self.cfg.access.min_gap_size, "exclude": self.cfg.access.exclude, + "ignore_chroms": list(self.ignored), } # ----- Autobin (never used directly, only to compute target size in WGS settings) ------------ @@ -666,7 +681,10 @@ def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: # BAM/BAI file ngs_mapping = self.parent.sub_workflows["ngs_mapping"] base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) - input_files = {"bam": ngs_mapping(base_path + ".bam")} + input_files = { + "bam": ngs_mapping(base_path + ".bam"), + "bai": ngs_mapping(base_path + ".bam.bai"), + } # Region (target or antitarget) file if self.build_ref: @@ -684,7 +702,9 @@ def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: input_files["intervals"] = self.base_out.format(**wildcards) + "{region}.bed" elif self.pon_source == PanelOfNormalsOrigin.COHORT: panel_of_normals = self.parent.sub_workflows["panel_of_normals_cnvkit"] - base_path = "output/{mapper}.cnvkit/out/cnvkit.{region}.bed" + base_path = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{region}.bed".format( + **wildcards + ) input_files["intervals"] = panel_of_normals(base_path) return input_files @@ -727,11 +747,10 @@ def _get_args_reference(self, wildcards: Wildcards, input: InputFiles) -> dict[s "no-gc": not self.cfg.gc, "no-rmask": not self.cfg.rmask, "no-edge": not self.cfg.get("edge", self.is_wes), + "diploid-parx-genome": self.cfg.diploid_parx_genome, } if self.cfg.cluster: args["min-cluster-size"] = self.cfg.min_cluster_size - if self.cfg.diploid_parx_genome: - args["diploid-parx-genome"] = self.cfg.diploid_parx_genome sample_sex = self._get_sample_sex(wildcards.get("library_name", None)) if sample_sex is not None: args["sample-sex"] = str(sample_sex) @@ -760,7 +779,9 @@ def 
_get_input_files_fix(self, wildcards: Wildcards) -> dict[str, str]: input_files["reference"] = self.base_out.format(**wildcards) + "reference.cnn" elif self.pon_source == PanelOfNormalsOrigin.COHORT: panel_of_normals = self.parent.sub_workflows["panel_of_normals_cnvkit"] - base_path = "output/{mapper}.cnvkit/out/cnvkit.panel_of_normals.cnn" + base_path = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn".format( + **wildcards + ) input_files["reference"] = panel_of_normals(base_path) return input_files @@ -770,10 +791,9 @@ def _get_args_fix(self, wildcards: Wildcards, input: InputFiles) -> dict[str, st "no-gc": not self.cfg.gc, "no-rmask": not self.cfg.rmask, "no-edge": not self.cfg.get("edge", self.is_wes), + "diploid-parx-genome": self.cfg.diploid_parx_genome, } args["sample-id"] = wildcards.library_name - if self.cfg.diploid_parx_genome: - args["diploid-parx-genome"] = self.cfg.diploid_parx_genome if "reference" not in args: args["reference"] = self.cfg.panel_of_normals.path_panel_of_normals return args @@ -848,14 +868,13 @@ def _get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, s "thresholds": self.cfg.call.thresholds, "drop-low-coverage": self.cfg.drop_low_coverage, "male-reference": self.cfg.male_reference, + "diploid-parx-genome": self.cfg.diploid_parx_genome, } if self.cfg.call.center_at is not None: args["center-at"] = self.cfg.call.center_at else: if self.cfg.call.center is not None: args["center"] = self.cfg.call.center - if self.cfg.diploid_parx_genome: - args["diploid-parx-genome"] = self.cfg.diploid_parx_genome if self.cfg.somatic_variant_calling.enabled: args |= self._variants_args(wildcards, input) if "variants" not in args: @@ -968,9 +987,8 @@ def _get_args_genemetrics(self, wildcards: Wildcards, input: InputFiles) -> dict "alpha": self.cfg.genemetrics.alpha, "bootstrap": self.cfg.genemetrics.bootstrap, "stats": [x.replace("t-test", "ttest") for x in self.cfg.genemetrics.stats], + "diploid-parx-genome": self.cfg.diploid_parx_genome, } - if self.cfg.diploid_parx_genome: - args["diploid-parx-genome"] = self.cfg.diploid_parx_genome sample_sex = self._get_sample_sex(wildcards.library_name) if sample_sex is not None: args["sample-sex"] = str(sample_sex) @@ -1126,14 +1144,16 @@ def _match_normals(self, valid_dna_libraries: list[LibraryInfo]) -> dict[str, st def _optionally_register_subworkflow(self, subworkflow: str): for tool in set(self.config.tools.wgs + self.config.tools.wes): - assert self.config.get(tool) is not None, f"Requested tool '{tool}' not configured" cfg = self.config.get(tool) - subworkflow_config = cfg.get(subworkflow) + subworkflow_config = cfg.get(subworkflow, None) if ( - subworkflow_config + subworkflow_config is not None and subworkflow_config.enabled and str(subworkflow_config.source) == "cohort" ): + assert ( + self.w_config.step_config.get(subworkflow, None) is not None + ), f"Upstream step {subworkflow} not configured" self.register_sub_workflow( subworkflow, subworkflow_config.get(f"path_{subworkflow}"), diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_cnv_calling/model.py index 2d3aef2d9..970c55fa1 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/model.py +++ b/snappy_pipeline/workflows/somatic_cnv_calling/model.py @@ -5,8 +5,6 @@ from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel from snappy_pipeline.models.cnvkit import Cnvkit as CnvkitGeneric -from snappy_pipeline.models.purecn import IntervalFilter -from 
snappy_pipeline.models.purecn import Segmentation as PureCNSegmentation from snappy_pipeline.models.purecn import PureCN as PureCNBase from snappy_pipeline.models.purecn import Variant as PureCNVariantParams from snappy_pipeline.models.common import LibraryKitEntry, Sex @@ -252,6 +250,9 @@ def ensure_purity_not_auto(self): path_access: str | None = None """Overrides access when not None""" + ignore_chroms: list[str] = [] + """List of contig name patterns to ignore for processing""" + class SomaticCnvCalling(SnappyStepModel): path_ngs_mapping: str = "../ngs_mapping" @@ -264,3 +265,6 @@ class SomaticCnvCalling(SnappyStepModel): purecn: PureCN | None = None sequenza: Sequenza | None = None control_freec: ControlFreec | None = None + + ignore_chroms: list[str] = [] + """List of contig name patterns to ignore for processing""" diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index 2c2d8faf8..4afbbaf99 100644 --- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -10,6 +10,7 @@ base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) sys.path.insert(0, base_dir) +from snappy_wrappers.tools.genome_windows import ignore_chroms from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper __author__ = "Eric Blanc" @@ -17,6 +18,18 @@ args = snakemake.params.get("args", {}) +prefix = "" + +# Add the "ignore_chrom" contents to the excluded regions +if len(args.get("ignore_chroms", [])) > 0: + ignored_contigs = ignore_chroms(args["reference"], args["ignore_chroms"], return_ignored=True) + lines = ["cat << __EOF > $TMPDIR/ignore_chroms.bed"] + for (contig_name, contig_length) in ignored_contigs: + lines.append(f"{contig_name}\t0\t{contig_length}") + lines.append("__EOF") + prefix = "\n".join(lines) + "\n" + args["exclude"].append("$TMPDIR/ignore_chroms.bed") + cmd = r""" cnvkit.py access \ -o {snakemake.output.access} \ @@ -29,4 +42,4 @@ exclude=" ".join([f"--exclude {x}" for x in args["exclude"]]), ) -CnvkitWrapper(snakemake, cmd).run() +CnvkitWrapper(snakemake, prefix + cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index c7009727f..5fb01d78b 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -20,8 +20,8 @@ cmd = r""" cnvkit.py antitarget \ -o {snakemake.output.antitarget} \ - --avg-size {args[avg-size]} {min_size} - --access {files[access]} \ + --avg-size {args[avg-size]} {min_size} \ + --access {args[access]} \ {args[target]} """.format( snakemake=snakemake, diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py index 9711475ed..5020ac227 100644 --- a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py @@ -29,8 +29,8 @@ args=args, out_target=f"--target-output-bed {snakemake.output.target}" if snakemake.output.get("target", "") != "" else "", out_antitarget=f"--antitarget-output-bed {snakemake.output.antitarget}" if snakemake.output.get("antitarget", "") != "" else "", - access=f"--access {args['access']}" if "access" in args else "", - target=f"--targets {args['target']}" if "target" in args else "", + access=f"--access {args['access']}" if args.get("access", None) is not None else "", + target=f"--targets {args['target']}" if args.get("target", None) 
is not None else "",
 )
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
index e6bfbd2ae..7cf317aa6 100644
--- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
@@ -36,7 +36,7 @@
     -o {snakemake.output.calls} \
     --method {args[method]} {thresholds} \
     {filter} \
-    {center} {center_at} {drop_low_coverage} {sample_sex} {male_reference} \
+    {center} {center_at} {drop_low_coverage} {sample_sex} {male_reference} {diploid_parx_genome} \
     {purity} {ploidy} \
     {variants} \
     {args[segments]}
@@ -53,6 +53,7 @@
     drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "",
     sample_sex=f"--sample-sex {args['sample-sex']}" if args.get("sample-sex", None) is not None else "",
     male_reference=f"--male-reference" if args.get("male-reference", False) else "",
+    diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if args.get("diploid-parx-genome", None) is not None else "",
 )
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py
index 7d843f650..6a33406da 100644
--- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py
@@ -39,9 +39,9 @@
     no_gc="--no-gc" if args.get("no-gc", False) else "",
     no_edge="--no-edge" if args.get("no-edge", False) else "",
     no_rmask="--no-rmask" if args.get("no-rmask", False) else "",
-    min_cluster_size=f"--min-cluster-size {args['min-cluster-size']}" if "min-cluster-size" in args else "",
-    sample_sex=f"--sample-sex {args['sample-sex']}" if "sample-sex" in args else "",
-    diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if "diploid-parx-genome" in args else ""
+    min_cluster_size=f"--min-cluster-size {args['min-cluster-size']}" if args.get("min-cluster-size", None) is not None else "",
+    sample_sex=f"--sample-sex {args['sample-sex']}" if args.get("sample-sex", None) is not None else "",
+    diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if args.get("diploid-parx-genome", None) is not None else ""
 )
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py
index a50a32e7e..f9c370b36 100644
--- a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py
@@ -27,8 +27,8 @@
     args=args,
     drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "",
     male_reference="--male-reference" if args.get("male-reference", False) else "",
-    sample_sex=f"--sample-sex {args['sample-sex']}" if "sample-sex" in args else "",
-    diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if "diploid-parx-genome" in args else "",
+    sample_sex=f"--sample-sex {args['sample-sex']}" if args.get("sample-sex", None) is not None else "",
+    diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if args.get("diploid-parx-genome", None) is not None else "",
     stats=" ".join([f"--{stat}" for stat in args["stats"]]),
 )
 
diff --git a/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml b/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml
new file mode 120000
index 000000000..2e107ac86
--- /dev/null
+++ b/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml
@@ -0,0 +1 @@
+../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py b/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py new file mode 100644 index 000000000..1869d8040 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py sex""" + +import os +import re +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" + +args = snakemake.params.get("args", {}) + +cmd = r""" +cnvkit.py sex \ + -o {snakemake.output.sex} \ + {diploid_parx_genome} \ + {coverages} +""".format( + snakemake=snakemake, + args=args, + diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if args.get('diploid-parx-genome', None) is not None else "", + coverages=" ".join(args["coverages"]), +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index a3f72f3b7..37e1bd9c2 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -26,9 +26,9 @@ """.format( snakemake=snakemake, args=args, - avg_size=f"--avg-size {args['avg-size']}" if args['avg-size'] is not None else "", + avg_size=f"--avg-size {args['avg-size']}" if args.get("avg-size", None) is not None else "", split=f"--split" if args.get("split", False) else "", - annotate=f"--annotate {args['annotate']}" if "annotate" in args else "", + annotate=f"--annotate {args['annotate']}" if args.get("annotate", None) is not None else "", short_names="--short-names" if args.get("short-names", False) else "", ) diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index 86870f4c8..5ce31759d 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -38,21 +38,25 @@ def minimal_config(): path_index: /path/to/bwa/index.fa panel_of_normals: - tools: ['mutect2', 'cnvkit', 'access', 'purecn'] + tools: ['mutect2', 'cnvkit', 'purecn'] + ignore_chroms: [GL*] path_ngs_mapping: ../ngs_mapping mutect2: germline_resource: /path/to/germline_resource.vcf path_normals_list: "" cnvkit: - path_target_regions: "" + ignore_chroms: [MT] + path_target_interval_list_mapping: [] path_normals_list: "" + diploid_parx_genome: GRCh38 purecn: path_normals_list: "" - path_bait_regions: /path/to/baits/regions.bed + path_target_interval_list_mapping: + - name: panel + pattern: panel + path: /path/to/baits.bed path_genomicsDB: /path/to/mutect2/genomicsDB genome_name: "unknown" - access: - exclude: [/path/to/exclude.bed] data_sets: first_batch: @@ -76,12 +80,20 @@ def panel_of_normals_workflow( config_paths, cancer_sheet_fake_fs, aligner_indices_fake_fs, + autobin_result_pon_fake_fs, mocker, ): """Return PanelOfNormalsWorkflow object pre-configured with germline sheet""" + # Patch out file-system to enable reading autobin output + autobin_result_pon_fake_fs.fs.create_file( + 
file_path="work/bwa.cnvkit/out/bwa.cnvkit.autobin.txt", + contents="Target: -1 2000\n", + create_missing_dirs=True, + ) # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.panel_of_normals", autobin_result_pon_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep there dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} @@ -199,357 +211,6 @@ def test_mutect2_step_part_get_resource_usage(panel_of_normals_workflow): assert actual == expected, msg_error -# Tests for CnvkitStepPart ------------------------------------------------------------------------ - - -def test_cnvkit_step_part_get_input_files_access(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_access()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - actual = panel_of_normals_workflow.get_input_files("cnvkit", "access")(wildcards) - assert actual == {} - - -def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_access()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "bams": [ - "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", - ], - "access": "work/bwa.cnvkit/out/cnvkit.access.bed", - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_target()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "access": "work/bwa.cnvkit/out/cnvkit.access.bed", - "avg_size": "work/bwa.cnvkit/out/cnvkit.autobin.txt", - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_antitarget()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - actual = panel_of_normals_workflow.get_input_files("cnvkit", "antitarget")(wildcards) - assert actual == {} - - -def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_coverage()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - "normal_library": "P001-N1-DNA1-WGS1", - "interval": "target", - } - ) - expected = { - "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "intervals": "work/bwa.cnvkit/out/cnvkit.target.bed", - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_input_files_create_panel()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "normals": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", - 
"work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", - ], - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_access(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_access()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"reference": "/path/to/ref.fa", "min_gap_size": 5000} - actual = panel_of_normals_workflow.get_args("cnvkit", "access")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_autobin(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_autobin()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"method": "wgs", "bp_per_bin": 50000} - actual = panel_of_normals_workflow.get_args("cnvkit", "autobin")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_target()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"annotate": "/path/to/annotations.gtf", "split": True} - actual = panel_of_normals_workflow.get_args("cnvkit", "target")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_antitarget(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_antitarget()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"avg_size": 0, "min_size": 0} - actual = panel_of_normals_workflow.get_args("cnvkit", "antitarget")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_coverage()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"reference": "/path/to/ref.fa", "min_mapq": 0} - actual = panel_of_normals_workflow.get_args("cnvkit", "coverage")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_create_panel(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_create_panel()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"reference": "/path/to/ref.fa", "no_edge": True} - actual = panel_of_normals_workflow.get_args("cnvkit", "create_panel")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_target()""" - expected = { - "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed", - "target_md5": "work/{mapper}.cnvkit/out/cnvkit.target.bed.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "target") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_antitarget()""" - expected = { - "antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed", - "antitarget_md5": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "antitarget") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_coverage()""" - expected = { - "coverage": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn", - "coverage_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn.md5", - } - actual = 
panel_of_normals_workflow.get_output_files("cnvkit", "coverage") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_create_panel()""" - expected = { - "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", - "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "create_panel") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_target()""" - base_name_out = "work/{mapper}.cnvkit/log/cnvkit.target" - expected = get_expected_log_files_dict(base_out=base_name_out) - expected["sh"] = "work/{mapper}.cnvkit/log/cnvkit.target.sh" - expected["sh_md5"] = expected["sh"] + ".md5" - actual = panel_of_normals_workflow.get_log_file("cnvkit", "target") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_antitarget()""" - base_name_out = "work/{mapper}.cnvkit/log/cnvkit.antitarget" - expected = get_expected_log_files_dict(base_out=base_name_out) - expected["sh"] = "work/{mapper}.cnvkit/log/cnvkit.antitarget.sh" - expected["sh_md5"] = expected["sh"] + ".md5" - actual = panel_of_normals_workflow.get_log_file("cnvkit", "antitarget") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_coverage()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage" - expected = get_expected_log_files_dict(base_out=base_name_out) - expected["sh"] = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage.sh" - expected["sh_md5"] = expected["sh"] + ".md5" - actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_create_panel()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" - expected = get_expected_log_files_dict(base_out=base_name_out) - expected["sh"] = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.sh" - expected["sh_md5"] = expected["sh"] + ".md5" - actual = panel_of_normals_workflow.get_log_file("cnvkit", "create_panel") - assert actual == expected - - -def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): - """Tests CvnkitStepPart.get_resource_usage()""" - # Define expected: default defined workflow.abstract - target_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "8G", - "partition": "medium", - } - antitarget_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "8G", - "partition": "medium", - } - coverage_expected_dict = { - "threads": 8, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } - reference_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } - - # Evaluate action `target` - for resource, expected in target_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'target'." 
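# Note: a hedged sketch of how per-action resource overrides (such as the
# `resource_usage` dict added to CnvKitStepPart in this patch) can resolve
# against a step-wide default; `ResourceUsage` is simplified here and the real
# resolution logic in snappy_pipeline may differ.
from dataclasses import dataclass

@dataclass
class ResourceUsage:
    threads: int
    time: str
    memory: str

default_resource_usage = ResourceUsage(threads=1, time="03:59:59", memory="7680M")
resource_usage = {
    "coverage": ResourceUsage(threads=8, time="11:59:59", memory="7680M"),
    "segment": ResourceUsage(threads=8, time="11:59:59", memory="7680M"),
}

def get_resource_usage(action: str) -> ResourceUsage:
    # Fall back to the step-wide default when no per-action override exists
    return resource_usage.get(action, default_resource_usage)

assert get_resource_usage("coverage").threads == 8
assert get_resource_usage("call").threads == 1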
- actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource)() - assert actual == expected, msg_error - - # Evaluate action `antitarget` - for resource, expected in antitarget_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'antitarget'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource)() - assert actual == expected, msg_error - - # Evaluate action `coverage` - for resource, expected in coverage_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'coverage'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource)() - assert actual == expected, msg_error - - # Evaluate action `create_panel` - for resource, expected in reference_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'create_panel'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource)() - assert actual == expected, msg_error - - -# Tests for AccessStepPart ------------------------------------------------------------------------- - - -def test_access_step_part_get_input_files_run(panel_of_normals_workflow): - """Tests AccessStepPart._get_input_files_run()""" - assert panel_of_normals_workflow.get_input_files("access", "run") is None - - -def test_access_step_part_get_args_run(panel_of_normals_workflow): - """Tests AccessStepPart._get_args_run()""" - expected = { - "reference": "/path/to/ref.fa", - "exclude": ["/path/to/exclude.bed"], - "min_gap_size": 0 - } - actual = panel_of_normals_workflow.get_args("access", "run") - assert actual == expected - - -def test_access_step_part_get_output_files_run(panel_of_normals_workflow): - """Tests AccessStepPart._get_output_files_run()""" - expected = { - "access": "work/access/out/access.bed", - "access_md5": "work/access/out/access.bed.md5", - } - actual = panel_of_normals_workflow.get_output_files("access", "run") - assert actual == expected - - -def test_access_step_part_get_log_file_run(panel_of_normals_workflow): - """Tests AccessStepPart._get_log_file_run()""" - expected = get_expected_log_files_dict(base_out="work/access/log/access") - expected["sh"] = "work/access/log/access.sh" - expected["sh_md5"] = expected["sh"] + ".md5" - actual = panel_of_normals_workflow.get_log_file("access", "run") - assert actual == expected - - -def test_access_step_part_get_resource_usage(panel_of_normals_workflow): - """Tests AccessStepPart.get_resource_usage()""" - # Define expected: default defined workflow.abstract - expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "8G", - "partition": "medium", - } - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'run'." 
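# Note: a self-contained check of the autobin-output parsing performed by
# `_read_autobin_output` in this patch; the sample line below matches the fake
# fixture contents ("Target: -1 2000") used by the tests in this file.
import re

nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)"
pattern = re.compile("^Target:[ \t]+" + nb + "[ \t]+" + nb + "$")

m = pattern.match("Target: -1 2000")
assert m is not None
# groups()[4] is the second number, i.e. the average bin size chosen by autobin
assert int(float(m.groups()[4])) == 2000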
- actual = panel_of_normals_workflow.get_resource("access", "run", resource)() - assert actual == expected, msg_error - - # Tests for PureCnStepPart ------------------------------------------------------------------------- @@ -577,12 +238,12 @@ def test_purecn_step_part_get_input_files_prepare(panel_of_normals_workflow): def test_purecn_step_part_get_output_files_prepare(panel_of_normals_workflow): """Tests PureCnStepPart._get_output_files_prepare()""" expected = { - "intervals": "work/purecn/out/unknown_unknown.list", - "optimized": "work/purecn/out/unknown_unknown.bed.gz", - "tbi": "work/purecn/out/unknown_unknown.bed.gz.tbi", - "intervals_md5": "work/purecn/out/unknown_unknown.list.md5", - "optimized_md5": "work/purecn/out/unknown_unknown.bed.gz.md5", - "tbi_md5": "work/purecn/out/unknown_unknown.bed.gz.tbi.md5", + "intervals": "work/purecn/out/panel_unknown.list", + "optimized": "work/purecn/out/panel_unknown.bed.gz", + "tbi": "work/purecn/out/panel_unknown.bed.gz.tbi", + "intervals_md5": "work/purecn/out/panel_unknown.list.md5", + "optimized_md5": "work/purecn/out/panel_unknown.bed.gz.md5", + "tbi_md5": "work/purecn/out/panel_unknown.bed.gz.tbi.md5", } actual = panel_of_normals_workflow.get_output_files("purecn", "prepare") assert actual == expected @@ -590,7 +251,7 @@ def test_purecn_step_part_get_output_files_prepare(panel_of_normals_workflow): def test_purecn_step_part_get_log_file_prepare(panel_of_normals_workflow): """Tests PureCnStepPart._get_log_file_prepare()""" - expected = get_expected_log_files_dict(base_out="work/purecn/log/unknown_unknown") + expected = get_expected_log_files_dict(base_out="work/purecn/log/panel_unknown") actual = panel_of_normals_workflow.get_log_file("purecn", "prepare") assert actual == expected @@ -605,7 +266,7 @@ def test_purecn_step_part_get_input_files_coverage(panel_of_normals_workflow): ) expected = { "container": "work/containers/out/purecn.simg", - "intervals": "work/purecn/out/unknown_unknown.list", + "intervals": "work/purecn/out/panel_unknown.list", "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", } actual = panel_of_normals_workflow.get_input_files("purecn", "coverage")(wildcards) @@ -681,13 +342,162 @@ def test_purecn_step_part_get_resource_usage(panel_of_normals_workflow): assert actual == value +# Tests for CnvkitStepPart ------------------------------------------------------------------------ + + +def test_cnvkit_step_part_get_args_access(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_access()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + actual = panel_of_normals_workflow.get_args("cnvkit", "access")( + wildcards, + panel_of_normals_workflow.get_input_files("cnvkit", "access")(wildcards), + ) + if actual.get("ignore_chroms", None) is not None: + actual["ignore_chroms"].sort() + expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": [], "ignore_chroms": ["GL*", "MT"]} + assert actual == expected + + +def test_cnvkit_step_part_get_args_autobin(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_access()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + expected = { + "bams": [ + "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", + ], + "access": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", + "method": "wgs", + "bp-per-bin": 50000, + } + actual = panel_of_normals_workflow.get_args("cnvkit", "autobin")( + wildcards, + 
panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_target()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + expected = { + "interval": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", + "avg-size": 2000, + "split": True, + "annotate": "/path/to/annotations.gtf", + "short-names": True, + + } + actual = panel_of_normals_workflow.get_args("cnvkit", "target")( + wildcards, + panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_coverage()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", + "region": "target", + } + ) + expected = { + "intervals": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed", + "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "reference": "/path/to/ref.fa", + "min-mapq": 0, + "count": False, + } + actual = panel_of_normals_workflow.get_args("cnvkit", "coverage")( + wildcards, + panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_reference(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_create_panel()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + expected = { + "normals": [ + "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.cnn", + "work/bwa.cnvkit.P002-N1-DNA1-WGS1/out/bwa.cnvkit.P002-N1-DNA1-WGS1.target.cnn", + ], + "reference": "/path/to/ref.fa", + "cluster": False, + "no-gc": False, + "no-edge": True, + "no-rmask": False, + "male-reference": False, + "diploid-parx-genome": "GRCh38", + } + actual = panel_of_normals_workflow.get_args("cnvkit", "create_panel")( + wildcards, + panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_parts_get_output_files(panel_of_normals_workflow): + """Tests CnvkitStepPart.get_output_files() for all actions""" + actions = { + "access": {"access": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.access.bed"}, + "autobin": {"result": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.autobin.txt"}, + "target": {"target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed"}, + "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed"}, + "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}.cnn"}, + "create_panel": {"reference": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn"}, + "sex": {"sex": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.sex.tsv"}, + } + for action, result in actions.items(): + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = panel_of_normals_workflow.get_output_files("cnvkit", action) + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file(panel_of_normals_workflow): + """Tests CnvkitStepPart.get_log_file() for all actions""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + actions = ("autobin", "target", "create_panel", "sex") + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit" + for action in 
actions: + result = {k: base_log + f".{action}.{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = panel_of_normals_workflow.get_log_file("cnvkit", action) + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file_access(panel_of_normals_workflow): + """Tests CnvkitStepPart.get_log_file() for access""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.access" + result = {k: base_log + f".{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = panel_of_normals_workflow.get_log_file("cnvkit", "access") + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart.get_log_file() for coverage""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + base_log = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}" + result = {k: base_log + f".{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage") + assert actual == expected + + # PanelOfNormalsWorkflow -------------------------------------------------------------------------- def test_panel_of_normals_workflow(panel_of_normals_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["access", "cnvkit", "link_out", "mutect2", "purecn"] + expected = ["cnvkit", "link_out", "mutect2", "purecn"] actual = list(sorted(panel_of_normals_workflow.sub_steps.keys())) assert actual == expected @@ -711,37 +521,20 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): expected += get_expected_log_files_dict(base_out=tpl.format(mapper=mapper)).values() # Now for basic cnvkit files (panel of normal only) - tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}" + tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{substep}.{ext}{chksum}" expected += [ - tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) + tpl.format(mapper=mapper, substep=substep, ext=ext, chksum=chksum) + for chksum in ("", ".md5") + for (substep, ext) in (("panel_of_normals", "cnn"), ("sex", "tsv"), ("target", "bed")) + for mapper in ("bwa",) ] - tpl = "output/{mapper}.cnvkit/out/cnvkit.{substep}.{ext}" - for substep in ("target", "antitarget"): - expected += [ - tpl.format(substep=substep, mapper=mapper, ext=ext) - for ext in ("bed", "bed.md5") - for mapper in ("bwa",) - ] # add log files - tpl = "output/{mapper}.cnvkit/log/cnvkit.{substep}" - for substep in ("target", "antitarget"): + tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" + for substep in ("create_panel", "sex", "target"): for mapper in ("bwa",): base_out = tpl.format(mapper=mapper, substep=substep) expected += get_expected_log_files_dict(base_out=base_out).values() expected += [base_out + ".sh", base_out + ".sh.md5"] - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" - for mapper in ("bwa",): - base_out = tpl.format(mapper=mapper, substep=substep) - expected += get_expected_log_files_dict(base_out=base_out).values() - expected += [base_out + ".sh", base_out + ".sh.md5"] - - # Access - tpl = "output/access/out/access.{ext}" - expected += [tpl.format(ext=ext) for ext in ("bed", 
"bed.md5")] - expected += get_expected_log_files_dict( - base_out="output/access/log/access" - ).values() - expected += ["output/access/log/access.sh", "output/access/log/access.sh.md5"] # PureCN tpl = "output/{mapper}.purecn/out/{mapper}.purecn.panel_of_normals.rds{chksum}" @@ -755,14 +548,14 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): expected += get_expected_log_files_dict( base_out="output/{mapper}.purecn/log/{mapper}.purecn.panel_of_normals".format(mapper="bwa") ).values() - tpl = "output/purecn/out/unknown_unknown.{ext}{chksum}" + tpl = "output/purecn/out/panel_unknown.{ext}{chksum}" expected += [ tpl.format(ext=ext, chksum=chksum) for ext in ("list", "bed.gz", "bed.gz.tbi") for chksum in ("", ".md5") ] expected += get_expected_log_files_dict( - base_out="output/purecn/log/unknown_unknown".format() + base_out="output/purecn/log/panel_unknown".format() ).values() expected = list(sorted(expected)) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py index fe954b333..e9e7daa42 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py @@ -50,10 +50,12 @@ def minimal_config(): b_af_loci: /path/to/locii.bed somatic_cnv_calling: + ignore_chroms: [GL*] tools: wgs: ['cnvkit'] path_ngs_mapping: ../ngs_mapping cnvkit: + ignore_chroms: [MT] diploid_parx_genome: GRCh38 panel_of_normals: source: paired @@ -92,14 +94,14 @@ def somatic_cnv_calling_workflow( work_dir, config_paths, cancer_sheet_fake_fs, - autobin_result_fake_fs, + autobin_result_calling_fake_fs, purity_result_fake_fs, aligner_indices_fake_fs, mocker, ): """Return PanelOfNormalsWorkflow object pre-configured with germline sheet""" # Patch out file-system to enable reading autobin output - autobin_result_fake_fs.fs.create_file( + autobin_result_calling_fake_fs.fs.create_file( file_path="work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.autobin.txt", contents="Target: -1 2000\n", create_missing_dirs=True, @@ -111,7 +113,7 @@ def somatic_cnv_calling_workflow( create_missing_dirs=True, ) # Patch out file-system related things in abstract (the crawling link in step is defined there) - patch_module_fs("snappy_pipeline.workflows.somatic_cnv_calling", autobin_result_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.somatic_cnv_calling", autobin_result_calling_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we @@ -146,7 +148,9 @@ def test_cnvkit_step_part_get_args_access(somatic_cnv_calling_workflow): wildcards, somatic_cnv_calling_workflow.get_input_files("cnvkit", "access")(wildcards), ) - expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": []} + if actual.get("ignore_chroms", None) is not None: + actual["ignore_chroms"].sort() + expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": [], "ignore_chroms": ["GL*", "MT"]} assert actual == expected @@ -160,7 +164,7 @@ def test_cnvkit_step_part_get_args_autobin(somatic_cnv_calling_workflow): ) expected = { "bams": ["NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam"], - "access": "work/bwa.cnvkit/out/cnvkit.access.bed", + "access": 
"work/bwa.cnvkit/out/bwa.cnvkit.access.bed", "method": "wgs", "bp-per-bin": 50000, } @@ -180,7 +184,7 @@ def test_cnvkit_step_part_get_args_target(somatic_cnv_calling_workflow): } ) expected = { - "interval": "work/bwa.cnvkit/out/cnvkit.access.bed", + "interval": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", "avg-size": 2000, "split": True, "annotate": "/path/to/annotations.gtf", @@ -206,6 +210,7 @@ def test_cnvkit_step_part_get_args_coverage(somatic_cnv_calling_workflow): expected = { "intervals": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.bed", "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", "reference": "/path/to/ref.fa", "min-mapq": 0, "count": False, @@ -453,10 +458,10 @@ def test_cnvkit_step_part_get_args_scatter(somatic_cnv_calling_workflow): def test_cnvkit_step_parts_get_output_files(somatic_cnv_calling_workflow): """Tests CnvkitStepPart.get_output_files() for all actions""" actions = { - "access": {"access": "work/{mapper}.cnvkit/out/cnvkit.access.bed"}, + "access": {"access": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.access.bed"}, "autobin": {"result": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.autobin.txt"}, "target": {"target": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.target.bed"}, - "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed"}, + "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed"}, "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}.cnn"}, "reference": {"reference": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.reference.cnn"}, "fix": {"ratios": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr"}, @@ -492,7 +497,7 @@ def test_cnvkit_step_parts_get_log_file(somatic_cnv_calling_workflow): def test_cnvkit_step_parts_get_log_file_access(somatic_cnv_calling_workflow): """Tests CnvkitStepPart.get_log_file() for access""" exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) - base_log = "work/{mapper}.cnvkit/log/cnvkit.access" + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.access" result = {k: base_log + f".{v}" for k, v in exts} expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", "access") From 964c408baba6517824bb294716c54abd0ae756f7 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Thu, 28 Nov 2024 18:32:21 +0100 Subject: [PATCH 18/46] tests: removed cnvkit from older cnv calling steps --- .../somatic_cnv_checking/__init__.py | 27 ++--- .../workflows/somatic_cnv_checking/model.py | 23 +--- .../test_workflows_somatic_cnv_checking.py | 29 +++-- ...kflows_somatic_targeted_seq_cnv_calling.py | 102 +++++++++--------- .../test_workflows_somatic_wgs_cnv_calling.py | 88 +++++++-------- 5 files changed, 125 insertions(+), 144 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py index febda8197..807a2f91d 100644 --- a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py @@ -72,10 +72,7 @@ ResourceUsage, ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow -from 
snappy_pipeline.workflows.somatic_targeted_seq_cnv_calling import ( - SomaticTargetedSeqCnvCallingWorkflow, -) -from snappy_pipeline.workflows.somatic_wgs_cnv_calling import SomaticWgsCnvCallingWorkflow +from snappy_pipeline.workflows.somatic_cnv_calling import SomaticCnvCallingWorkflow from .model import SomaticCnvChecking as SomaticCnvCheckingConfigModel @@ -294,23 +291,14 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) workdir, config_model_class=SomaticCnvCheckingConfigModel, previous_steps=( - SomaticTargetedSeqCnvCallingWorkflow, - SomaticWgsCnvCallingWorkflow, + SomaticCnvCallingWorkflow, NgsMappingWorkflow, ), ) - if self.config.path_cnv_calling and self.config.cnv_assay_type: - if self.config.cnv_assay_type == "WES": - cnv_calling = "somatic_targeted_seq_cnv_calling" - elif self.config.cnv_assay_type == "WES": - cnv_calling = "somatic_wgs_cnv_calling" - else: - raise InvalidConfiguration( - "Illegal cnv_assay_type {}, must be either WES or WGS".format( - self.config.cnv_assay_type - ) - ) - self.register_sub_workflow(cnv_calling, self.config.path_cnv_calling, "cnv_calling") + if self.config.path_cnv_calling: + self.register_sub_workflow( + "somatic_cnv_calling", self.config.path_cnv_calling, "cnv_calling" + ) self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -367,8 +355,9 @@ def get_result_files(self): ext = {"out": [".vcf.gz", ".vcf.gz.tbi"]} if self.config.path_cnv_calling: # CNV avaliable + # TODO: make the tool library-dependent (supporting both wes & wgs) name_pattern = "{mapper}.{caller}.{library_name}" - callers = self.w_config.step_config["somatic_targeted_seq_cnv_calling"].tools + callers = self.w_config.step_config["somatic_cnv_calling"].tools.wgs ext["out"] += [".tsv"] ext["report"] = (".cnv.pdf", ".locus.pdf", ".segment.pdf") ext["log"] = [ diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/model.py b/snappy_pipeline/workflows/somatic_cnv_checking/model.py index f7ec574a5..704b9aceb 100644 --- a/snappy_pipeline/workflows/somatic_cnv_checking/model.py +++ b/snappy_pipeline/workflows/somatic_cnv_checking/model.py @@ -1,27 +1,14 @@ -import enum from typing import Annotated -from pydantic import Field, model_validator +from pydantic import Field from snappy_pipeline.models import SnappyStepModel -class CnvAssayType(enum.StrEnum): - WES = "WES" - WGS = "WGS" - - class SomaticCnvChecking(SnappyStepModel): path_ngs_mapping: str = "../ngs_mapping" - path_cnv_calling: Annotated[str, Field(examples=["../somatic_targeted_seq_cnv_calling"])] = "" - - cnv_assay_type: CnvAssayType | None = None - """ - Empty: no CNV, - WES for somatic_targeted_seq_snv_calling step, - WGS for somatic_wgs_cnv_calling step - """ + path_cnv_calling: Annotated[str, Field(examples=["../somatic_cnv_calling"])] = "" excluded_regions: str = "" """Bed file of regions to be excluded""" @@ -34,9 +21,3 @@ class SomaticCnvChecking(SnappyStepModel): min_baf: Annotated[float, Field(0.4, ge=0, le=0.5)] """Maximum BAF to consider variant as heterozygous (between 0 & 1/2)""" - - @model_validator(mode="after") - def ensure_cnv_assay_type_is_specified(self): - if self.path_cnv_calling and not self.cnv_assay_type: - raise ValueError("CNV assay type must be specified") - return self diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py index 733b69284..ed024080d 
100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py @@ -12,7 +12,7 @@ from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict from .conftest import patch_module_fs -__author__ = "Manuel Holtgrewe " +__author__ = "Eric Blanc " @pytest.fixture(scope="module") # otherwise: performance issues @@ -35,17 +35,28 @@ def minimal_config(): bwa: path_index: /path/to/bwa/index.fa - somatic_targeted_seq_cnv_calling: - tools: ["cnvkit"] + somatic_cnv_calling: + tools: + wgs: ["cnvkit"] cnvkit: - path_target: DUMMY - path_antitarget: DUMMY - path_panel_of_normals: DUMMY + diploid_parx_genome: GRCh38 + panel_of_normals: + source: paired + somatic_variant_calling: + enabled: False + source: cohort + tool: mutect2 + path_somatic_variant_calling: ../somatic_variant_calling + somatic_purity_ploidy_estimate: + enabled: False + source: cohort + tool: ascat + segment: + threshold: 0.0001 somatic_cnv_checking: path_ngs_mapping: ../ngs_mapping - path_cnv_calling: ../somatic_targeted_seq_cnv_calling - cnv_assay_type: WES + path_cnv_calling: ../somatic_cnv_calling data_sets: first_batch: @@ -71,7 +82,7 @@ def somatic_cnv_checking_workflow( aligner_indices_fake_fs, mocker, ): - """Return SomaticTargetedSeqCnvCallingWorkflow object pre-configured with germline sheet""" + """Return SomaticCnvCallingWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py index e49b5350a..815acdf73 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py @@ -41,14 +41,14 @@ def minimal_config(): somatic_targeted_seq_cnv_calling: tools: - cnvetti_on_target - - cnvkit + # - cnvkit - copywriter - sequenza - purecn - cnvkit: - path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed - path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed - path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn + # cnvkit: + # path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + # path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + # path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn purecn: path_container: /path/to/purecn/container path_intervals: /path/to/interval/list @@ -1107,52 +1107,52 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call ) ] # cnvkit - tpl = f"output/{name_pattern}/out/{name_pattern}".format(method="cnvkit") + "{ext}{md5}" - expected += [ - tpl.format(i=i, t=t, ext=ext, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ( - ".cnr", - "_dnacopy.seg", - ".bed.gz", - ".bed.gz.tbi", - ".seg", - ".vcf.gz", - ".vcf.gz.tbi", - ) - for md5 in ("", ".md5") - ] - tpl = ( - f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - 
+ ".{plot}.{ext}{md5}" - ) - expected += [ - tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) - for md5 in ("", ".md5") - ] - tpl = ( - f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - + ".{plot}.chr{chrom}.{ext}{md5}" - ) - expected += [ - tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) - for chrom in chain(range(1, 23), ("X", "Y")) - for md5 in ("", ".md5") - ] - tpl = ( - f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - + ".{report}.txt{md5}" - ) - expected += [ - tpl.format(i=i, t=t, report=report, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") - for md5 in ("", ".md5") - ] + # tpl = f"output/{name_pattern}/out/{name_pattern}".format(method="cnvkit") + "{ext}{md5}" + # expected += [ + # tpl.format(i=i, t=t, ext=ext, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for ext in ( + # ".cnr", + # "_dnacopy.seg", + # ".bed.gz", + # ".bed.gz.tbi", + # ".seg", + # ".vcf.gz", + # ".vcf.gz.tbi", + # ) + # for md5 in ("", ".md5") + # ] + # tpl = ( + # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") + # + ".{plot}.{ext}{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) + # for md5 in ("", ".md5") + # ] + # tpl = ( + # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") + # + ".{plot}.chr{chrom}.{ext}{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) + # for chrom in chain(range(1, 23), ("X", "Y")) + # for md5 in ("", ".md5") + # ] + # tpl = ( + # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") + # + ".{report}.txt{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, report=report, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + # for md5 in ("", ".md5") + # ] # copywriter tpl = f"output/{name_pattern}/out/{name_pattern}".format(method="copywriter") + "_{ext}{md5}" expected += [ diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py index 40038a846..e09b8a349 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py @@ -47,17 +47,17 @@ def minimal_config(): - canvas - cnvetti - control_freec - - cnvkit + # - cnvkit tools_ngs_mapping: - bwa canvas: path_reference: /path/to/reference.fasta path_filter_bed: /path/to/filter.bed path_genome_folder: /path/to/genome/folder - cnvkit: - path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed - path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed - path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn + # cnvkit: + # path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + # 
path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + # path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn cnvetti: {} control_freec: path_chrlenfile: /path/to/chrlenfile @@ -806,45 +806,45 @@ def test_somatic_cnv_calling_workflow(somatic_wgs_cnv_calling_workflow): for mapper in ("bwa",) for cnv_caller in ("control_freec",) ] - # -- add files from cnvkit - tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{md5}" - expected += [ - tpl.format(i=i, t=t, ext=ext, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ("cnr", "cns", "bed", "seg", "vcf.gz", "vcf.gz.tbi") - for md5 in ("", ".md5") - ] - tpl = ( - "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" - "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.{ext}{md5}" - ) - expected += [ - tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) - for md5 in ("", ".md5") - ] - tpl = ( - "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" - "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.chr{chrom}.{ext}{md5}" - ) - expected += [ - tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) - for chrom in chain(range(1, 23), ("X", "Y")) - for md5 in ("", ".md5") - ] - tpl = ( - "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" - "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{report}.txt{md5}" - ) - expected += [ - tpl.format(i=i, t=t, report=report, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") - for md5 in ("", ".md5") - ] + # # -- add files from cnvkit + # tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{md5}" + # expected += [ + # tpl.format(i=i, t=t, ext=ext, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for ext in ("cnr", "cns", "bed", "seg", "vcf.gz", "vcf.gz.tbi") + # for md5 in ("", ".md5") + # ] + # tpl = ( + # "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + # "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.{ext}{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) + # for md5 in ("", ".md5") + # ] + # tpl = ( + # "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + # "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.chr{chrom}.{ext}{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) + # for chrom in chain(range(1, 23), ("X", "Y")) + # for md5 in ("", ".md5") + # ] + # tpl = ( + # "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + # "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{report}.txt{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, report=report, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + # for md5 in ("", ".md5") + # ] # Perform the comparison expected = list(sorted(expected)) actual = list(sorted(somatic_wgs_cnv_calling_workflow.get_result_files())) From 46eb954e3877f8bb51ac4e78f840483097c1e423 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Thu, 5 Dec 2024 18:38:02 +0100 Subject: [PATCH 19/46] 
refactor: all files moved to snakemake.input, filenames in sync with cnvkit (as much as possible), added to documentation --- snappy_pipeline/models/cnvkit.py | 20 +- .../workflows/panel_of_normals/Snakefile | 38 + .../workflows/panel_of_normals/__init__.py | 264 ++++-- .../workflows/panel_of_normals/model.py | 16 +- .../workflows/somatic_cnv_calling/__init__.py | 242 +++-- .../somatic_cnv_calling/cnvkit.rules | 21 + snappy_wrappers/tools/chromosome_lengths.py | 850 ++++++++++++++++++ snappy_wrappers/tools/genome_windows.py | 75 +- .../wrappers/cnvkit/access/wrapper.py | 20 +- .../wrappers/cnvkit/antitarget/wrapper.py | 5 +- .../wrappers/cnvkit/autobin/wrapper.py | 6 +- .../wrappers/cnvkit/bintest/wrapper.py | 4 +- .../wrappers/cnvkit/call/wrapper.py | 12 +- .../wrappers/cnvkit/coverage/wrapper.py | 4 +- .../wrappers/cnvkit/fix/wrapper.py | 13 +- .../wrappers/cnvkit/ignore/environment.yaml | 1 + .../wrappers/cnvkit/ignore/wrapper.py | 33 + .../wrappers/cnvkit/plot/scatter/wrapper.py | 36 +- .../wrappers/cnvkit/reference/wrapper.py | 11 +- .../cnvkit/report/genemetrics/wrapper.py | 4 +- .../wrappers/cnvkit/report/metrics/wrapper.py | 6 +- .../cnvkit/report/segmetrics/wrapper.py | 4 +- .../wrappers/cnvkit/segment/wrapper.py | 8 +- .../wrappers/cnvkit/sex/wrapper.py | 2 +- .../wrappers/cnvkit/target/wrapper.py | 4 +- .../test_workflows_panel_of_normals.py | 164 ++-- .../test_workflows_somatic_cnv_calling.py | 326 ++++--- 27 files changed, 1766 insertions(+), 423 deletions(-) create mode 100644 snappy_wrappers/tools/chromosome_lengths.py create mode 120000 snappy_wrappers/wrappers/cnvkit/ignore/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/ignore/wrapper.py diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py index 8f9de72df..2063d52f7 100644 --- a/snappy_pipeline/models/cnvkit.py +++ b/snappy_pipeline/models/cnvkit.py @@ -182,8 +182,12 @@ class Call(SnappyModel): When this parameter is set, the centering method should be left empty. """ - filter: FilterMethod | None = None - """Merge segments flagged by the specified filter(s) with the adjacent segment(s).""" + filter: list[FilterMethod] | None = None + """ + Merge segments flagged by the specified filter(s) with the adjacent segment(s). + + When ``None``, ``segmetrics`` enabled & ``smooth_bootstrap`` is None, the behaviour is identical to ``batch``: filtering is done using ``ci``. + """ @model_validator(mode="after") def ensure_center_without_center_at(self) -> Self: @@ -234,8 +238,8 @@ class PlotScatter(Plot): """y-axis upper limit.""" y_min: float | None = None """y-axis lower limit.""" - fig_size: tuple[float, float] = (6.4, 4.8) - """Width and height of the plot in inches.""" + fig_size: tuple[float, float] = (12.256, 16.192) + """Width and height of the plot in centimeters (might depend on the locale).""" @model_validator(mode="after") def ensure_range_list_with_gene(self) -> Self: @@ -268,8 +272,12 @@ class ReportSegmetrics(Report): """Level to estimate confidence and prediction intervals; use with --ci and --pi.""" bootstrap: int = 100 """Number of bootstrap iterations to estimate confidence interval; use with --ci.""" - smooth_bootstrap: bool = False - """Apply Gaussian noise to bootstrap samples, a.k.a. smoothed bootstrap, to estimate confidence interval""" + smooth_bootstrap: bool = True + """ + Apply Gaussian noise to bootstrap samples, a.k.a. smoothed bootstrap, to estimate confidence interval + + This is _NOT_ the ``cnvkit`` default, but it is automatically set in ``batch`` mode. 
+ """ stats: list[ReportStats] = [ ReportStats.MEAN, ReportStats.MEDIAN, diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index e311589c3..fb5c8b6a0 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -104,7 +104,27 @@ rule panel_of_normals_mutect2_create_panel: # Write out the normals-only results for the normals -------------------------- +rule panel_of_normals_cnvkit_ignore: + input: + unpack(wf.get_input_files("cnvkit", "ignore")), + output: + **wf.get_output_files("cnvkit", "ignore"), + threads: wf.get_resource("cnvkit", "ignore", "threads") + resources: + time=wf.get_resource("cnvkit", "ignore", "time"), + memory=wf.get_resource("cnvkit", "ignore", "memory"), + partition=wf.get_resource("cnvkit", "ignore", "partition"), + log: + **wf.get_log_file("cnvkit", "ignore"), + params: + **{"args": wf.get_args("cnvkit", "ignore")}, + wrapper: + wf.wrapper_path("cnvkit/ignore") + + rule panel_of_normals_cnvkit_access: + input: + unpack(wf.get_input_files("cnvkit", "access")), output: **wf.get_output_files("cnvkit", "access"), threads: wf.get_resource("cnvkit", "access", "threads") @@ -231,6 +251,24 @@ rule panel_of_normals_cnvkit_sex: wf.wrapper_path("cnvkit/sex") +rule panel_of_normals_cnvkit_metrics: + input: + unpack(wf.get_input_files("cnvkit", "metrics")), + output: + **wf.get_output_files("cnvkit", "metrics"), + threads: wf.get_resource("cnvkit", "metrics", "threads") + resources: + time=wf.get_resource("cnvkit", "metrics", "time"), + memory=wf.get_resource("cnvkit", "metrics", "memory"), + partition=wf.get_resource("cnvkit", "metrics", "partition"), + log: + **wf.get_log_file("cnvkit", "metrics"), + params: + **{"args": wf.get_args("cnvkit", "metrics")}, + wrapper: + wf.wrapper_path("cnvkit/report/metrics") + + # Panel of normals (PureCN) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index b919fd480..4538a2ad7 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -44,10 +44,25 @@ Notes on the ``cnvkit`` workflow ================================ +-------- +Overview +-------- + ``cnvkit`` is a set of tools originally designed to call somatic copy number alterations from exome data. Its design is modular, which enables its use for whole genome and amplicon data. -Provided that sufficient normal samples are available, the ``cnvkit`` `documentation `_ +Because it was designed primarily for whole exome data. It has the concept of _targets_ (the regions enriched by the exome kit), +and the _antitargets_ (those regions accessible for CNV calling, but outside of enrichment). +The coverage of _targets_ and _antitargets_ are expected to be very different, +but there is still information to be gained in the _antitarget_ regions, +albeit at a much lower resolution than for _target_ regions. + +For WGS data, the _target_ regions generally cover the whole accessible genome, with empty _antitarget_ regions. + +.. tip:: For Agilent kits, the ``cnvkit`` authors recommend to use baits as targets. Baits are in the ``*_Covered.bed`` file. 
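+
+For example, a configuration declaring such baits as targets could look as follows.
+This is an illustrative sketch only: the exact nesting of the
+``path_target_interval_list_mapping`` option and all paths & kit names are placeholders,
+not part of this patch.
+
+.. code-block:: yaml
+
+    panel_of_normals:
+        tools: [cnvkit]
+        cnvkit:
+            path_target_interval_list_mapping:
+                - name: Agilent_SureSelect_Human_All_Exon_V8   # hypothetical kit name
+                  pattern: "SureSelect Human All Exon V8"      # hypothetical pattern
+                  path: /path/to/Agilent/SureSelect_V8_Covered.bed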
+
+Provided that sufficient normal samples are available (`10 to 20 are considered sufficient `_),
+the ``cnvkit`` `documentation `_
 recommends the creation of a panel of normal (called ``reference``) for exome and whole genome data.
 
 .. note::
 
@@ -56,56 +71,48 @@
 The actual workflow to generate this reference is slightly different between exome and whole genome data.
 The current implementation recapitulates the common practice, while still dispatching computations on multiple cluster nodes.
 
------------
-Access file
------------
-
-``cnvkit`` can use a bed file describing the accessible regions for coverage computations.
-The ``cnvkit`` distribution provides it for the ``GRCh37`` human genome release, but incompletely only for ``GRCh38``.
-Therefore, a tentative ``access`` tool has been added, to generate this bed file when the user knows which locii should be excluded from coverage.
-Its output (``output/cnvkit.access/out/cnvkit.access.bed``) is optional, but its presence impacts of the way the target and antitarget regions are computed in whole genome mode.
-
-.. note::
-
-    In a nutshell, for exome data, the accessibility file is only used to create antitarget regions.
-    These regions are essentially the accessible regions minus the target regions (with edge effect correction).
+---------------------------------
+Regions accessible to CNV calling
+---------------------------------
 
-Access files can be generated from the genome reference ``fasta`` file, and optionally ``bed`` file(s) containing regions to exclude from further computations.
-In this case, the user must proceed in two steps:
+``cnvkit`` needs to know about the regions of the genome accessible to CNV calling.
+Typically, regions masked with ``N`` are excluded, but a user may also want to exclude
+repeats, segmental duplications, or low complexity regions.
 
-First, she needs to run the ``access`` tool to create the desired access file
+There are multiple ways to get this information (a configuration sketch is given at the end of this section):
 
-.. code-block:: yaml
+1. The user can provide a ``bed`` file detailing accessible regions, using the ``path_access`` option in the ``access`` part of the configuration.
+2. The user can specifically exclude regions using the ``exclude`` option in the ``access`` part of the configuration.
+   In this case, the pipeline will create an accessible regions file from the whole genome and the excluded parts.
+3. Otherwise, the pipeline creates the accessible regions file from the reference genome sequence, by removing
+   only parts masked with ``N``.
 
-    panel_of_normals:
-        tools: [access]
-        access:
-            exclude:
+.. note:: An additional constraint of option 3 is that the ``min_gap_size`` parameter cannot be set.
 
-This will create ``output/cnvkit.access/out/cnvkit.access.bed`` from the genomic sequence & excluded regions.
+.. note::
+    Some short contigs (the mitochondrion, unplaced & unlocalized contigs, decoys, viral sequences, ...) are too short
+    for a reliable estimation of copy number changes.
+    ``snappy`` provides a generic way to ignore those contigs during processing (post-mapping), through the ``ignore_chroms`` configuration option.
+    This parameter is generally set at the _step_ level, typically ignoring decoys, HLA variants and viral sequences.
+    This is suitable for small variant calling (calling variants for genes on unplaced/unlocalized contigs is fine),
+    but not for CNA calling.
+    Therefore, ``ignore_chroms`` can also be set at the _tool_ level in the configuration. In that case, contigs from both options will be ignored.
+    Contigs ignored during panel of normals creation will _not_ be assessed during calling.
+
+.. note:: In WES mode, using ``ignore_chroms`` options is generally not necessary, unless the baits definition includes small contigs.
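+
+Similarly, the access-related options above could be combined as follows. Again, this is a
+hypothetical sketch: the paths, contig patterns and values are placeholders.
+
+.. code-block:: yaml
+
+    panel_of_normals:
+        tools: [cnvkit]
+        cnvkit:
+            ignore_chroms: [MT, GL*]  # tool-level list, merged with the step-level one
+            access:
+                # Option 2: accessible regions = whole genome minus the excluded regions
+                exclude: [/path/to/repeats.bed]
+                min_gap_size: 5000  # cannot be used with option 3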
 
 ------------------------
 Panel of normal creation
 ------------------------
 
-If the user wants to create her own access file, then the panel of normal can only be created after the ``access`` tool has been run.
-If she decides that the access file provided in the ``cnvkit`` distribution is suitable (no excluded region),
-then she can skip the ``access`` tool step and directly creates her panel of normals.
-
-In both cases, the configuration might read:
+By default, the panel is built using all normal samples in the cohort.
+However, it is possible to select a sub-set of samples using the ``path_normals_list`` configuration option.
+This is the path to a file with one library name per line.
 
-.. code-block:: yaml
+The current implementation doesn't allow for mixing WES & WGS data, nor mixing multiple exome enrichment kits.
+The selection of enrichment kit is done through the ``path_target_interval_list_mapping`` option.
 
-    panel_of_normals:
-        tools: [cnvkit]  # , access]
-        access:  # Even when created by the ``access`` tool.
-        path_target_regions:  # Keep empty for WGS data
-        path_normals_list:  # Keep empty to use all available normals
+TODO: implement support for cohorts collected using different enrichment kits (mixing WGS & WES is more difficult, and may never happen)
 
-Note that there is no provision (yet) to automatically create separate panel of normals for males & females.
-If the number of samples collected in the same fashion is large enough, it is nevertheless the way to achieve best results.
 
 -------
 Reports
@@ -135,12 +142,54 @@
 To create the target regions from the baits (or from the accessible regions),
 the target average bin size must be set.
 There is a reasonable default value for exome data, but an additional ``autobin`` step is required for the whole genome data.
 In ``batch`` mode, this value is computed from the coverage over the full genome
+
 .. note::
 
     The ``cnvkit batch`` command also allows the creation of a flat reference, when there are no normal samples.
     This is not implemented in the ``panel_of_normals`` step, for obvious reasons.
    Using a flat reference for CNV computations is nevertheless possible, it is implemented in the ``somatic_cnv_calling`` step.
 
+``cnvkit`` can infer the sex of a donor from a sample's coverage over sex chromosomes.
+The decision is based on ratios of _G_ test statistics computed from the autosome & sex chromosome coverages.
+More precisely, the _G_ statistic is computed (with Yates continuity correction) from the contingency table built using
+the number of bins with coverage higher or lower than the grand median, from the autosomes and the sex chromosome.
+
+.. math::
+    G(x) = G \\left( \\begin{array}{cc} N(c_a > m) & N(c_x > m) \\\\ N(c_a < m) & N(c_x < m) \\end{array} \\right)
+
+where :math:`c_a` is the coverage over the autosomes, :math:`c_x` the coverage over the sex chromosome `x`, & :math:`m = \\text{median}(c_a, c_x)`.
+The coverages are defined as the base-2 logarithm of the coverage depth.
+
+The final score is obtained after shifting the coverages by 0, 1 or 3, depending on the case.
+
+.. math::
+    \\text{score} = G(X)/G(X+1) \\cdot G(Y+3)/G(Y)
+
+When the score is higher than 1, the sex is inferred as male.
+
+For each sample, the scores are computed on target & antitarget regions separately. When the inferred sexes are different, the antitarget result is selected.
+
+.. warning:: The sex inference results are questionable. We have observed similar behaviour as described `here `_.
+
+For validation, the _G_ statistic can be obtained from the ``*.cnn`` output using the following:
+
+.. code-block:: R
+
+    coverage <- read.table("work/mapper.cnvkit.library/out/mapper.cnvkit.library.cnn", sep="\t", header=1)
+    median_test <- function(x, X=TRUE, shift=0, prefix=c("chr", "")) {
+        auto <- x$log2[x$chromosome %in% sprintf("%s%d", prefix, 1:22)]
+        if (X) sex <- x$log2[x$chromosome == sprintf("%sX", prefix)] + shift
+        else sex <- x$log2[x$chromosome == sprintf("%sY", prefix)] + shift
+        m <- median(c(auto, sex), na.rm=TRUE)
+        # Contingency table of bins above/below the grand median,
+        # for the autosomes (first column) & the sex chromosome (second column);
+        # the G statistic (with Yates continuity correction) is computed from this table
+        contingency <- cbind(
+            c(sum(auto>m, na.rm=TRUE), sum(auto<m, na.rm=TRUE)),
+            c(sum(sex>m, na.rm=TRUE), sum(sex<m, na.rm=TRUE))
+        )
+        contingency
+    }
@@ ... @@ def get_result_files(self) -> list[str]:
         result_files = []
-        result_files += list(self.get_output_files("create_panel").values())
-        result_files += list(self.get_log_file("create_panel").values())
-
-        result_files += list(self.get_output_files("target").values())
-        result_files += list(self.get_log_file("target").values())
-
+        actions = ["create_panel", "target", "sex", "metrics"]
         if self.libraryType == LibraryType.WES:
-            result_files += list(self.get_output_files("antitarget").values())
-            result_files += list(self.get_log_file("antitarget").values())
+            actions.append("antitarget")
 
-        result_files += list(self.get_output_files("sex").values())
-        result_files += list(self.get_log_file("sex").values())
+        for action in actions:
+            result_files += list(self.get_output_files(action).values())
+            result_files += list(self.get_log_file(action).values())
 
         return filter(lambda x: not x.endswith(".md5"), result_files)
 
-    def _get_input_files_access(self, wildcards: Wildcards) -> dict[str, str]:
-        assert self.create_access, "Access shouldn't be created, already available"
-        return {}
+    def _get_input_files_ignore(self, wildcards: Wildcards) -> dict[str, str]:
+        assert self.create_access, "No need for ignored chroms, access already exists"
+        return {"reference": self.w_config.static_data_config.reference.path}
 
-    def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+    def _get_args_ignore(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        assert self.create_access, "No need for ignored chroms, access already exists"
+        return {"ignore_chroms": sorted(list(self.ignored))}
+
+    def _get_input_files_access(self, wildcards: Wildcards) -> dict[str, str]:
         assert self.create_access, "Access shouldn't be created, already available"
-        return dict(input) | {
+        input_files = {
             "reference": self.w_config.static_data_config.reference.path,
-            "min-gap-size": self.cfg.access.min_gap_size,
             "exclude": self.cfg.access.exclude,
-            "ignore_chroms": list(self.ignored),
         }
+        if self.ignored:
+            input_files["ignore_chroms"] = self.base_out.format(**wildcards) + "ignored.bed"
+        return input_files
+
+    def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        assert self.create_access, "Access shouldn't be created, already available"
+        return {"min-gap-size": self.cfg.access.min_gap_size}
 
     def _get_input_files_autobin(self, wildcards: Wildcards) -> dict[str, str]:
         assert (
@@ -697,19 +768,18 @@ def _get_input_files_autobin(self, wildcards: Wildcards) -> dict[str, str]:
             input_files["access"] = 
self.base_out.format(**wildcards) + "access.bed" else: input_files["target"] = self.base_out.format(**wildcards) + "access.bed" + else: + input_files["target"] = self.cfg.path_access return input_files def _get_args_autobin(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: assert ( self.compute_avg_target_size ), "Trying to estimate average target size for non-WGS samples" - args = dict(input) | {"bp-per-bin": 50000} - if self.plain_access: - args["method"] = "wgs" - else: - args["method"] = "amplicon" - if "target" not in args: - args["target"] = self.cfg.path_access + args = { + "bp-per-bin": 50000, + "method": "wgs" if self.plain_access else "amplicon", + } return args def _get_input_files_target(self, wildcards: Wildcards) -> dict[str, str]: @@ -719,7 +789,11 @@ def _get_input_files_target(self, wildcards: Wildcards) -> dict[str, str]: if self.create_access: input_files["interval"] = self.base_out.format(**wildcards) + "access.bed" if self.compute_avg_target_size: - input_files["avg-size"] = self.base_out.format(**wildcards) + "autobin.txt" + input_files["avg_size"] = self.base_out.format(**wildcards) + "autobin.txt" + else: + input_files["interval"] = self.path_baits + if self.w_config.static_data_config.get("features", None): + input_files["annotate"] = self.w_config.static_data_config.features.path return input_files def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: @@ -727,30 +801,31 @@ def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, args = { "avg-size": self.cfg.target.avg_size, "split": self.cfg.target.split, - "interval": self.path_baits, } else: assert self.is_wgs, "Panel not implemented yet" - args = dict(input) | {"split": self.cfg.target.split} - if args.get("avg-size", None) is not None: - args["avg-size"] = self._read_autobin_output(args["avg-size"]) + args = {"split": self.cfg.target.split} + if input.get("avg_size", None) is not None: + args["avg-size"] = self._read_autobin_output(input.get("avg_size")) elif self.cfg.target.avg_size is not None: args["avg-size"] = self.cfg.target.avg_size else: args["avg-size"] = 5000 if self.w_config.static_data_config.get("features", None): - args["annotate"] = self.w_config.static_data_config.features.path args["short-names"] = self.cfg.target.short_names return args def _get_input_files_antitarget(self, wildcards: Wildcards) -> dict[str, str]: input_files = {"target": self.base_out.format(**wildcards) + "target.bed"} if self.create_access: - input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + if not self.plain_access: + input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + else: + input_files["access"] return input_files def _get_args_antitarget(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { + args = { "avg-size": self.cfg.antitarget.avg_size, "min-size": self.cfg.antitarget.min_size, } @@ -767,40 +842,43 @@ def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: "intervals": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{region}.bed".format( **wildcards ), + "reference": self.w_config.static_data_config.reference.path, "bam": bam, "bai": bam + ".bai", } def _get_args_coverage(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - return dict(input) | { - "reference": self.w_config.static_data_config.reference.path, + return { "min-mapq": self.cfg.coverage.min_mapq, "count": self.cfg.coverage.count, } def 
_get_input_files_create_panel(self, wildcards: Wildcards) -> dict[str, str]: - tpl = self.base_out_lib + "target.cnn" + tpl = self.base_out_lib + "targetcoverage.cnn" targets = [ tpl.format(mapper=wildcards["mapper"], library_name=x) for x in self.normal_libraries.keys() ] if self.libraryType == LibraryType.WES: - tpl = self.base_out_lib + "antitarget.cnn" + tpl = self.base_out_lib + "antitargetcoverage.cnn" antitargets = [ tpl.format(mapper=wildcards["mapper"], library_name=x) for x in self.normal_libraries.keys() ] else: antitargets = [] - return {"normals": targets + antitargets} + return { + "reference": self.w_config.static_data_config.reference.path, + "normals": targets + antitargets, + } def _get_args_create_panel(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { - "reference": self.w_config.static_data_config.reference.path, + edge = self.cfg.edge if self.cfg.edge is not None else self.is_wes + args = { "cluster": self.cfg.cluster, "no-gc": not self.cfg.gc, "no-rmask": not self.cfg.rmask, - "no-edge": not self.cfg.get("edge", self.is_wes), + "no-edge": not edge, "diploid-parx-genome": self.cfg.diploid_parx_genome, } if self.cfg.cluster: @@ -812,13 +890,13 @@ def _get_args_create_panel(self, wildcards: Wildcards, input: InputFiles) -> dic return args def _get_input_files_sex(self, wildcards: Wildcards) -> dict[str, str]: - tpl = self.base_out_lib + "target.cnn" + tpl = self.base_out_lib + "targetcoverage.cnn" coverages = [ tpl.format(mapper=wildcards["mapper"], library_name=x) for x in self.normal_libraries.keys() ] if self.is_wes: - tpl = self.base_out_lib + "antitarget.cnn" + tpl = self.base_out_lib + "antitargetcoverage.cnn" coverages += [ tpl.format(mapper=wildcards["mapper"], library_name=x) for x in self.normal_libraries.keys() @@ -826,7 +904,25 @@ def _get_input_files_sex(self, wildcards: Wildcards) -> dict[str, str]: return {"coverages": coverages} def _get_args_sex(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - return dict(input) | {"diploid-parx-genome": self.cfg.diploid_parx_genome} + return {"diploid-parx-genome": self.cfg.diploid_parx_genome} + + def _get_input_files_metrics(self, wildcards: Wildcards) -> dict[str, str]: + """Input for metrics report. 
Using coverage rather than ratios, and no segments"""
+        tpl = self.base_out_lib + "targetcoverage.cnn"
+        coverages = [
+            tpl.format(mapper=wildcards["mapper"], library_name=x)
+            for x in self.normal_libraries.keys()
+        ]
+        if self.is_wes:
+            tpl = self.base_out_lib + "antitargetcoverage.cnn"
+            coverages += [
+                tpl.format(mapper=wildcards["mapper"], library_name=x)
+                for x in self.normal_libraries.keys()
+            ]
+        return {"ratios": coverages}
+
+    def _get_args_metrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        return {"drop-low-coverage": self.cfg.drop_low_coverage}
 
     def _read_autobin_output(self, filename: str) -> int:
         nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)"
diff --git a/snappy_pipeline/workflows/panel_of_normals/model.py b/snappy_pipeline/workflows/panel_of_normals/model.py
index 59684d40d..78556154c 100644
--- a/snappy_pipeline/workflows/panel_of_normals/model.py
+++ b/snappy_pipeline/workflows/panel_of_normals/model.py
@@ -40,8 +40,19 @@ class CnvKit(CnvkitGeneric):
     path_access: str | None = None
     """Overrides access when not None"""
 
-    ignore_chroms: list[str] = []
-    """Additional contigs to ignore"""
+    ignore_chroms: Annotated[
+        list[str],
+        Field(
+            examples=[
+                "chrM",
+                "MT",
+                "*_random",
+                "chrUn_*",
+                "GL*",
+            ]
+        ),
+    ] = []
+    """Additional contigs to ignore, specific to the tool"""
 
 
 class GenomeName(enum.StrEnum):
@@ -101,7 +112,6 @@ class PanelOfNormals(SnappyStepModel, validators.ToolsMixin):
                 "NC_007605",
                 "hs37d5",
                 "chrEBV",
-                "*_decoy",
                 "HLA-*",
                 "GL000220.*",
                 "chrEBV",
diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
index 1e5a4d097..8120483ca 100644
--- a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
+++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
@@ -64,7 +64,7 @@
 - The ratios (extension ``.cnr``) contains the ratio of expected coverage between tumor and the reference in each bin,
   or logarithmic scale. This can be used to examine the data or experiment with different segmentation algorithms.
-- The segments (extension ``.segments.cns``) contains the output of the segmentation. A single log2 ratio value is
+- The segments (extension ``.cns``) contains the output of the segmentation. A single log2 ratio value is
   attributed to each segment. The segmentation covers most of the part of the genome accessible to mapping.
 - The calls (extension ``calls.cns``) contains only the non-diploid segments, called after thresholding.
 
@@ -95,17 +95,25 @@
 Overview
 --------
 
-``cnvkit`` was designed to call CNV on whole exome data. It has the concept of _targets_ (the regions enriched by the exome kit),
+``cnvkit`` is a set of tools originally designed to call somatic copy number alterations from exome data.
+Its design is modular, which enables its use for whole genome and amplicon data.
+
+Because it was designed primarily for whole exome data, it has the concept of _targets_ (the regions enriched by the exome kit),
 and the _antitargets_ (those regions accessible for CNV calling, but outside of enrichment).
 The coverage of _targets_ and _antitargets_ are expected to be very different,
 but there is still information to be gained in the _antitarget_ regions,
 albeit at a much lower resolution than for _target_ regions.
 
-``cnvkit`` was later used with some success on whole genome data.
-WGS data was defined as _target_ regions covering the whole genome, with empty _antitarget_ regions.
+For WGS data, the _target_ regions generally cover the whole accessible genome, with empty _antitarget_ regions.
 
 .. tip:: For Agilent kits, the ``cnvkit`` authors recommend to use baits as targets. Baits are in the ``*_Covered.bed`` file.
 
+.. note::
+
+    ``cnvkit`` provides a tool to encapsulate common practice workflows (``batch``), depending on the type of data, and on the availability of optional inputs.
+    The actual workflow to generate this reference is slightly different between exome and whole genome data.
+    The current implementation recapitulates the common practice, while still dispatching computations on multiple cluster nodes.
+
 ---------------------------------
 Regions accessible to CNV calling
 ---------------------------------
@@ -122,6 +130,16 @@
 3. Otherwise, the pipeline creates the accessible regions file from the reference genome sequence, by removing
    only parts masked with ``N``.
 
+.. note::
+    Some short contigs (the mitochondrion, unplaced & unlocalized contigs, decoys, viral sequences, ...) are too short
+    for a reliable estimation of copy number changes.
+    ``snappy`` provides a generic way to ignore those contigs during processing (post-mapping), through the ``ignore_chroms`` configuration option.
+    This parameter is generally set at the _step_ level, typically ignoring decoys, HLA variants and viral sequences.
+    This is suitable for small variant calling (calling variants for genes on unplaced/unlocalized contigs is fine),
+    but not for CNA calling.
+    Therefore, ``ignore_chroms`` can also be set at the _tool_ level in the configuration. In that case, contigs from both levels will be ignored.
+    External panels (``cohort`` or ``file``) may already have been generated on a restricted set of contigs.
+
 -----------------------
 The reference coverage
 -----------------------
@@ -144,6 +162,11 @@
 4. ``flat``: a _flat_ reference is computed, which discards locus-specific effects. It should only be used when there
    are no normals nor suitable panel, or for benchmarking purposes.
 
+.. 
+
 -----------------------
 The reference coverage
 -----------------------
@@ -144,6 +162,11 @@
 4. ``flat``: a _flat_ reference is computed, which discards locus-specific effects. It should only be used
    when there are no normals nor suitable panel, or for benchmarking purposes.
 
+.. warning::
+
+    When selecting ``cohort`` or ``file`` panels, remember that those might have been generated on a subset of contigs,
+    if ``ignore_chroms`` was used during the panel creation.
+
 ------------------
 WGS-specific notes
 ------------------
@@ -234,6 +257,7 @@ class CnvKitStepPart(SomaticCnvCallingStepPart):
 
     #: Class available actions
     actions = (
+        "ignore",
         "access",
        "autobin",
         "target",
@@ -401,6 +425,8 @@ def get_output_files(self, action: str):
         output_files = {}
 
         match action:
+            case "ignore":
+                output_files = {"ignore_chroms": self.base_out + "ignored.bed"}
             case "access":
                 output_files = {"access": self.base_out + "access.bed"}
             case "autobin":
@@ -413,7 +439,9 @@
             case "antitarget":
                 output_files = {"antitarget": self.base_out + "antitarget.bed"}
             case "coverage":
-                output_files = {"coverage": self.base_out_lib + "{region,(target|antitarget)}.cnn"}
+                output_files = {
+                    "coverage": self.base_out_lib + "{region,(target|antitarget)}coverage.cnn"
+                }
             case "reference":
                 if self.paired:
                     output_files = {"reference": self.base_out_lib + "reference.cnn"}
@@ -423,17 +451,17 @@
                 output_files = {"ratios": self.base_out_lib + "cnr"}
             case "segment":
                 output_files = {
-                    "segments": self.base_out_lib + "segments.cns",
+                    "segments": self.base_out_lib + "cns",
                     "dataframe": self.base_out_lib + "rds",
                 }
             case "call":
-                output_files = {"calls": self.base_out_lib + "calls.cns"}
+                output_files = {"calls": self.base_out_lib + "call.cns"}
             case "bintest":
                 output_files = {"tests": self.base_out_lib + "bintest.cns"}
             case "metrics":
                 output_files = {"report": base_report_lib + "metrics.tsv"}
             case "segmetrics":
-                output_files = {"report": base_report_lib + "segmetrics.tsv"}
+                output_files = {"report": base_report_lib + "segmetrics.cns"}
             case "genemetrics":
                 output_files = {"report": base_report_lib + "genemetrics.tsv"}
             case "scatter":
@@ -454,7 +482,7 @@ def get_log_file(self, action):
         base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit."
         base_log_lib = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}."
 
-        if action in ("access", "antitarget"):
+        if action in ("ignore", "access", "antitarget"):
             tpl = base_log + action
         elif action in (
             "autobin",
@@ -512,7 +540,7 @@ def get_result_files(self, library_name: str, mapper: str) -> list[str]:
 
         result_files = []
 
-        for suffix in ("cnr", "segments.cns", "calls.cns", "bintest.cns"):
+        for suffix in ("cnr", "cns", "call.cns", "bintest.cns"):
             result_files.append(base_out_lib + suffix)
 
         actions_to_log = ("fix", "segment", "call", "bintest")
@@ -525,9 +553,9 @@
         ]
 
         # Logs of metrics not linked
-        for report in ("metrics", "segmetrics", "genemetrics"):
+        for report, ext in (("metrics", "tsv"), ("segmetrics", "cns"), ("genemetrics", "tsv")):
             if self.cfg.get(report).get("enabled"):
-                result_files.append(base_report_lib + report + ".tsv")
+                result_files.append(base_report_lib + report + "." 
+ ext) # Logs of plots not links # TODO: Mouse date: only chromosomes 1 to 19 @@ -546,11 +574,27 @@ def get_result_files(self, library_name: str, mapper: str) -> list[str]: result_files += [x + ".md5" for x in result_files] return result_files + # ----- Ignore (generates the bed file of ignored contigs to exclude in access) --------------- + + def _get_input_files_ignore(self, wildcards: Wildcards) -> dict[str, str]: + assert self.create_access, "No need for ignored chroms, access already exists" + return {"reference": self.w_config.static_data_config.reference.path} + + def _get_args_ignore(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + assert self.create_access, "No need for ignored chroms, access already exists" + return {"ignore_chroms": list(self.ignored)} + # ----- Access -------------------------------------------------------------------------------- def _get_input_files_access(self, wildcards: Wildcards) -> dict[str, str]: - assert self.create_access, "Should not build access, already available" - return {} + assert self.create_access, "Access shouldn't be created, already available" + input_files = { + "reference": self.w_config.static_data_config.reference.path, + "exclude": self.cfg.access.exclude, + } + if self.ignored: + input_files["ignore_chroms"] = self.base_out.format(**wildcards) + "ignored.bed" + return input_files def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: """ @@ -563,13 +607,8 @@ def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, This happens when the average target size is set in the config in WGS, or for WES. """ - assert self.create_access, "Should not build access, already available" - return dict(input) | { - "reference": self.w_config.static_data_config.reference.path, - "min-gap-size": self.cfg.access.min_gap_size, - "exclude": self.cfg.access.exclude, - "ignore_chroms": list(self.ignored), - } + assert self.create_access, "Access shouldn't be created, already available" + return {"min-gap-size": self.cfg.access.min_gap_size} # ----- Autobin (never used directly, only to compute target size in WGS settings) ------------ @@ -589,19 +628,19 @@ def _get_input_files_autobin(self, wildcards: Wildcards) -> dict[str, str]: input_files["access"] = self.base_out.format(**wildcards) + "access.bed" else: input_files["target"] = self.base_out.format(**wildcards) + "access.bed" + else: + input_files["target"] = self.cfg.path_access return input_files def _get_args_autobin(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: assert ( self.compute_avg_target_size ), "Trying to estimate average target size for non-WGS samples" - args = dict(input) | {"bp-per-bin": 50000} + args = {"bp-per-bin": 50000} if self.plain_access: args["method"] = "wgs" else: args["method"] = "amplicon" - if "target" not in args: - args["target"] = self.cfg.path_access return args # ----- Target -------------------------------------------------------------------------------- @@ -623,7 +662,11 @@ def _get_input_files_target(self, wildcards: Wildcards) -> dict[str, str]: if self.create_access: input_files["interval"] = self.base_out.format(**wildcards) + "access.bed" if self.compute_avg_target_size: - input_files["avg-size"] = self.base_out_lib.format(**wildcards) + "autobin.txt" + input_files["avg_size"] = self.base_out_lib.format(**wildcards) + "autobin.txt" + else: + input_files["interval"] = self.path_baits + if self.w_config.static_data_config.get("features", None): + input_files["annotate"] = 
self.w_config.static_data_config.features.path return input_files def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: @@ -632,19 +675,17 @@ def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, args = { "avg-size": self.cfg.target.avg_size, "split": self.cfg.target.split, - "interval": self.path_baits, } else: assert self.is_wgs, "Panel not implemented yet" - args = dict(input) | {"split": self.cfg.target.split} - if args.get("avg-size", None) is not None: - args["avg-size"] = self._read_autobin_output(args["avg-size"]) + args = {"split": self.cfg.target.split} + if input.get("avg_size", None) is not None: + args["avg-size"] = self._read_autobin_output(input.get("avg_size")) elif self.cfg.target.avg_size is not None: args["avg-size"] = self.cfg.target.avg_size else: args["avg-size"] = 5000 if self.w_config.static_data_config.get("features", None): - args["annotate"] = self.w_config.static_data_config.features.path args["short-names"] = self.cfg.target.short_names return args @@ -653,16 +694,17 @@ def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, def _get_input_files_antitarget(self, wildcards: Wildcards) -> dict[str, str]: input_files = {"target": self.base_out.format(**wildcards) + "target.bed"} if self.create_access: - input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + if not self.plain_access: + input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + else: + input_files["access"] = self.cfg.path_access return input_files def _get_args_antitarget(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { + args = { "avg-size": self.cfg.antitarget.avg_size, "min-size": self.cfg.antitarget.min_size, } - if "access" not in args: - args["access"] = self.cfg.path_access return args # ----- Coverage ------------------------------------------------------------------------------ @@ -682,6 +724,7 @@ def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: ngs_mapping = self.parent.sub_workflows["ngs_mapping"] base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) input_files = { + "reference": self.w_config.static_data_config.reference.path, "bam": ngs_mapping(base_path + ".bam"), "bai": ngs_mapping(base_path + ".bam.bai"), } @@ -706,32 +749,30 @@ def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: **wildcards ) input_files["intervals"] = panel_of_normals(base_path) + else: + intervals = self.cfg.panel_of_normals.get("path_{region}".format(**wildcards), "") + assert intervals != "", "Missing path to {region}".format(**wildcards) + input_files["intervals"] = intervals return input_files def _get_args_coverage(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { - "reference": self.w_config.static_data_config.reference.path, + return { "min-mapq": self.cfg.coverage.min_mapq, "count": self.cfg.coverage.count, } - if "intervals" not in args: - intervals = self.cfg.panel_of_normals.get("path_{region}".format(**wildcards), "") - assert intervals != "", "Missing path to {region}".format(**wildcards) - args["intervals"] = intervals - return args # ----- Reference (flat or pairwise) ---------------------------------------------------------- def _get_input_files_reference(self, wildcards: Wildcards) -> dict[str, str]: """Builds reference from the paired normal, or flat prior in absence of normal""" assert self.build_ref, 
"Should not build reference" - input_files = {} + input_files = {"reference": self.w_config.static_data_config.reference.path} if self.paired: - input_files["normals"] = [self.base_out_lib.format(**wildcards) + "target.cnn"] + input_files["normals"] = [self.base_out_lib.format(**wildcards) + "targetcoverage.cnn"] if self.is_wes: input_files["normals"].append( - self.base_out_lib.format(**wildcards) + "antitarget.cnn" + self.base_out_lib.format(**wildcards) + "antitargetcoverage.cnn" ) elif self.pon_source == PanelOfNormalsOrigin.FLAT: input_files["target"] = self.base_out.format(**wildcards) + "target.bed" @@ -741,12 +782,12 @@ def _get_input_files_reference(self, wildcards: Wildcards) -> dict[str, str]: def _get_args_reference(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: assert self.build_ref, "Should not build reference" - args = dict(input) | { - "reference": self.w_config.static_data_config.reference.path, + edge = self.cfg.edge if self.cfg.edge is not None else self.is_wes + args = { "cluster": self.cfg.cluster, "no-gc": not self.cfg.gc, "no-rmask": not self.cfg.rmask, - "no-edge": not self.cfg.get("edge", self.is_wes), + "no-edge": not edge, "diploid-parx-genome": self.cfg.diploid_parx_genome, } if self.cfg.cluster: @@ -766,9 +807,11 @@ def _get_args_reference(self, wildcards: Wildcards, input: InputFiles) -> dict[s def _get_input_files_fix(self, wildcards: Wildcards) -> dict[str, str]: # Coverage on targets & optionally on antitargets - input_files = {"target": self.base_out_lib.format(**wildcards) + "target.cnn"} + input_files = {"target": self.base_out_lib.format(**wildcards) + "targetcoverage.cnn"} if self.is_wes: - input_files["antitarget"] = self.base_out_lib.format(**wildcards) + "antitarget.cnn" + input_files["antitarget"] = ( + self.base_out_lib.format(**wildcards) + "antitargetcoverage.cnn" + ) if self.paired: tpl = "{mapper}.cnvkit.{normal_library}".format( mapper=wildcards["mapper"], @@ -783,31 +826,31 @@ def _get_input_files_fix(self, wildcards: Wildcards) -> dict[str, str]: **wildcards ) input_files["reference"] = panel_of_normals(base_path) + else: + input_files["reference"] = self.cfg.panel_of_normals.path_panel_of_normals return input_files def _get_args_fix(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { + edge = self.cfg.edge if self.cfg.edge is not None else self.is_wes + return { "cluster": self.cfg.cluster, "no-gc": not self.cfg.gc, "no-rmask": not self.cfg.rmask, - "no-edge": not self.cfg.get("edge", self.is_wes), + "no-edge": not edge, "diploid-parx-genome": self.cfg.diploid_parx_genome, + "sample-id": wildcards.library_name, } - args["sample-id"] = wildcards.library_name - if "reference" not in args: - args["reference"] = self.cfg.panel_of_normals.path_panel_of_normals - return args # ----- Variant-related convenience functions ------------------------------------------------- - def _variants_from_cohort_input(self) -> str: + def _variants_from_cohort_input(self, wildcards: Wildcards) -> str: variants = self.parent.sub_workflows["somatic_variant_calling_cnvkit"] tpl = f"{{mapper}}.{self.cfg.somatic_variant_calling.tool}.{{library_name}}" base_path = os.path.join("output", tpl, "out", tpl + ".vcf.gz") - return variants(base_path) + return variants(base_path).format(**wildcards) - def _variants_args(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { + def _variants_args(self, wildcards: Wildcards) -> dict[str, str]: + args = { "min-variant-depth": 
self.cfg.somatic_variant_calling.min_variant_depth, "sample-id": wildcards.library_name, "normal-id": self.parent.matched_normal[wildcards.library_name], @@ -822,13 +865,18 @@ def _get_input_files_segment(self, wildcards: Wildcards) -> dict[str, str]: # Coverage input_files = {"ratios": self.base_out_lib.format(**wildcards) + "cnr"} # Segmentation using SNVs from cohort - if self.variants_from_cohort: - input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) + if self.cfg.somatic_variant_calling.enabled: + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input(wildcards) + else: + input_files["variants"] = ( + self.cfg.somatic_variant_calling.path_somatic_variant_calling + ) return input_files def _get_args_segment(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: # Segmentation parameters - args = dict(input) | { + args = { "method": self.cfg.segment.method, "threshold": self.cfg.segment.threshold, "drop-outliers": self.cfg.segment.drop_outliers, @@ -837,19 +885,26 @@ def _get_args_segment(self, wildcards: Wildcards, input: InputFiles) -> dict[str if self.cfg.segment.method == CnvkitSegmentationMethod.CBS: args["smooth-cbs"] = self.cfg.segment.smooth_cbs if self.cfg.somatic_variant_calling.enabled: - args |= self._variants_args(wildcards, input) - if "variants" not in args: - args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling + args |= self._variants_args(wildcards) return args # ----- Call ---------------------------------------------------------------------------------- def _get_input_files_call(self, wildcards: Wildcards) -> dict[str, str]: # Segmentation - input_files = {"segments": self.base_out_lib.format(**wildcards) + "segments.cns"} + if self.cfg.segmetrics.enabled: + tpl = "{mapper}.cnvkit.{library_name}".format(**wildcards) + input_files = {"segments": os.path.join("work", tpl, "report", tpl) + ".segmetrics.cns"} + else: + input_files = {"segments": self.base_out_lib.format(**wildcards) + "cns"} # Segmentation using SNVs from cohort - if self.variants_from_cohort: - input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) + if self.cfg.somatic_variant_calling.enabled: + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input(wildcards) + else: + input_files["variants"] = ( + self.cfg.somatic_variant_calling.path_somatic_variant_calling + ) # Purity from the tool if ( self.cfg.somatic_purity_ploidy_estimate.enabled @@ -863,7 +918,7 @@ def _get_input_files_call(self, wildcards: Wildcards) -> dict[str, str]: def _get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: # Call parameters - args = dict(input) | { + args = { "method": self.cfg.call.method, "thresholds": self.cfg.call.thresholds, "drop-low-coverage": self.cfg.drop_low_coverage, @@ -875,10 +930,10 @@ def _get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, s else: if self.cfg.call.center is not None: args["center"] = self.cfg.call.center + if self.cfg.call.filter is None: + args["filter"] = ["ci"] if self.cfg.somatic_variant_calling.enabled: - args |= self._variants_args(wildcards, input) - if "variants" not in args: - args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling + args |= self._variants_args(wildcards) # Sample sex if known, otherwise guessed by the tool sample_sex = self._get_sample_sex(wildcards.library_name) if sample_sex is not None: @@ -887,8 +942,8 @@ def 
_get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, s args["male-reference"] = True # If requested, purity from samplesheet or from default if self.cfg.somatic_purity_ploidy_estimate.enabled: - if args.get("purity_file", None) is not None: - (purity, ploidy) = self._read_purity_ploidy_output(args["purity_file"]) + if input.get("purity_file", None) is not None: + (purity, ploidy) = self._read_purity_ploidy_output(input.get("purity_file")) elif self.cfg.somatic_purity_ploidy_estimate.source == PurityOrigin.SAMPLESHEET: purity = self.tumors[wildcards.library_name].purity ploidy = self.tumors[wildcards.library_name].ploidy @@ -904,11 +959,11 @@ def _get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, s def _get_input_files_bintest(self, wildcards: Wildcards) -> dict[str, str]: return { "ratios": self.base_out_lib.format(**wildcards) + "cnr", - "segments": self.base_out_lib.format(**wildcards) + "segments.cns", + "segments": self.base_out_lib.format(**wildcards) + "cns", } def _get_args_bintest(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - return dict(input) | { + return { "alpha": self.cfg.bintest.alpha, "target": self.cfg.bintest.target, } @@ -918,14 +973,19 @@ def _get_args_bintest(self, wildcards: Wildcards, input: InputFiles) -> dict[str def _get_input_files_scatter(self, wildcards: Wildcards) -> dict[str, str]: input_files = { "ratios": self.base_out_lib.format(**wildcards) + "cnr", - "segments": self.base_out_lib.format(**wildcards) + "segments.cns", + "segments": self.base_out_lib.format(**wildcards) + "cns", } - if self.variants_from_cohort: - input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) + if self.cfg.somatic_variant_calling.enabled: + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input(wildcards) + else: + input_files["variants"] = ( + self.cfg.somatic_variant_calling.path_somatic_variant_calling + ) return input_files def _get_args_scatter(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { + args = { "antitarget-marker": self.cfg.scatter.antitarget_marker, "by-bin": self.cfg.scatter.by_bin, "segment-color": self.cfg.scatter.segment_color, @@ -940,9 +1000,7 @@ def _get_args_scatter(self, wildcards: Wildcards, input: InputFiles) -> dict[str if wildcards["contig_name"] != "all": args["chromosome"] = wildcards["contig_name"] if self.cfg.somatic_variant_calling.enabled: - args |= self._variants_args(wildcards, input) - if "variants" not in args: - args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling + args |= self._variants_args(wildcards) args["title"] = f"{wildcards['library_name']} - {wildcards['contig_name']}" return args @@ -950,21 +1008,21 @@ def _get_args_scatter(self, wildcards: Wildcards, input: InputFiles) -> dict[str def _get_input_files_metrics(self, wildcards: Wildcards) -> dict[str, str]: return { - "ratios": self.base_out_lib.format(**wildcards) + "cnr", - "segments": self.base_out_lib.format(**wildcards) + "segments.cns", + "ratios": [self.base_out_lib.format(**wildcards) + "cnr"], + "segments": [self.base_out_lib.format(**wildcards) + "cns"], } def _get_args_metrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - return dict(input) | {"drop-low-coverage": self.cfg.drop_low_coverage} + return {"drop-low-coverage": self.cfg.drop_low_coverage} def _get_input_files_segmetrics(self, wildcards: Wildcards) -> dict[str, str]: return { "ratios": 
self.base_out_lib.format(**wildcards) + "cnr",
-            "segments": self.base_out_lib.format(**wildcards) + "segments.cns",
+            "segments": self.base_out_lib.format(**wildcards) + "cns",
         }
 
     def _get_args_segmetrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
-        return dict(input) | {
+        return {
             "drop-low-coverage": self.cfg.drop_low_coverage,
             "alpha": self.cfg.segmetrics.alpha,
             "bootstrap": self.cfg.segmetrics.bootstrap,
@@ -975,11 +1033,11 @@ def _get_args_segmetrics(self, wildcards: Wildcards, input: InputFiles) -> dict[
     def _get_input_files_genemetrics(self, wildcards: Wildcards) -> dict[str, str]:
         return {
             "ratios": self.base_out_lib.format(**wildcards) + "cnr",
-            "segments": self.base_out_lib.format(**wildcards) + "segments.cns",
+            "segments": self.base_out_lib.format(**wildcards) + "cns",
         }
 
     def _get_args_genemetrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
-        args = dict(input) | {
+        args = {
             "drop-low-coverage": self.cfg.drop_low_coverage,
             "male-reference": self.cfg.male_reference,
             "threshold": self.cfg.genemetrics.threshold,
@@ -1001,7 +1059,7 @@ def _get_args_genemetrics(self, wildcards: Wildcards, input: InputFiles) -> dict
     def _read_autobin_output(self, filename: str) -> int:
         nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)"
         pattern = re.compile("^Target:[ \t]+" + nb + "[ \t]+" + nb + "$")
-        with open(filename) as f:
+        with open(filename, "rt") as f:
             for line in f:
                 m = pattern.match(line)
                 if m:
@@ -1012,7 +1070,7 @@ def _read_purity_ploidy_output(self, filename: str) -> tuple[float, float]:
         # TODO: Tool-dependent parsing of purity/ploidy file
         nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)"
         pattern = re.compile("^Purity/ploidy:[ \t]+" + nb + "[ \t]+" + nb + "$")
-        with open(filename) as f:
+        with open(filename, "rt") as f:
             for line in f:
                 m = pattern.match(line)
                 if m:
diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules
index cfaf75206..a6a343e24 100644
--- a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules
+++ b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules
@@ -1,4 +1,25 @@
+rule somatic_cnv_calling_cnvkit_ignore:
+    input:
+        unpack(wf.get_input_files("cnvkit", "ignore")),
+    output:
+        **wf.get_output_files("cnvkit", "ignore"),
+    params:
+        **{"args": wf.get_args("cnvkit", "ignore")},
+    log:
+        **wf.get_log_file("cnvkit", "ignore"),
+    threads: wf.get_resource("cnvkit", "ignore", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "ignore", "time"),
+        memory=wf.get_resource("cnvkit", "ignore", "memory"),
+        partition=wf.get_resource("cnvkit", "ignore", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "ignore", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/ignore")
+
+
 rule somatic_cnv_calling_cnvkit_access:
+    input:
+        unpack(wf.get_input_files("cnvkit", "access")),
     output:
         **wf.get_output_files("cnvkit", "access"),
     params:
diff --git a/snappy_wrappers/tools/chromosome_lengths.py b/snappy_wrappers/tools/chromosome_lengths.py
new file mode 100644
index 000000000..fd15b0665
--- /dev/null
+++ b/snappy_wrappers/tools/chromosome_lengths.py
@@ -0,0 +1,850 @@
+"""Human & mouse chromosome lengths for different primary assemblies
+
+Might be useful to check genome validity, chromosome naming conventions, ...
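+
+Hedged usage sketch (illustrative only; the one-liner below is not an API of this
+module, just a plain lookup in the table)::
+
+    >>> from snappy_wrappers.tools.chromosome_lengths import CHROMOSOME_LENGTHS
+    >>> next(c["Length"] for c in CHROMOSOME_LENGTHS["GRCh38"] if c["UCSC"] == "chr1")
+    248956422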
+""" + +CHROMOSOME_LENGTHS = { + "GRCh37": [ + { + "UCSC": "chr1", + "ENSEMBL": "1", + "REFSEQ": "NC_000001.10", + "GENBANK": "CM000663 .1", + "Length": 249250621, + }, + { + "UCSC": "chr2", + "ENSEMBL": "2", + "REFSEQ": "NC_000002.11", + "GENBANK": "CM000664 .1", + "Length": 243199373, + }, + { + "UCSC": "chr3", + "ENSEMBL": "3", + "REFSEQ": "NC_000003.11", + "GENBANK": "CM000665 .1", + "Length": 198022430, + }, + { + "UCSC": "chr4", + "ENSEMBL": "4", + "REFSEQ": "NC_000004.11", + "GENBANK": "CM000666 .1", + "Length": 191154276, + }, + { + "UCSC": "chr5", + "ENSEMBL": "5", + "REFSEQ": "NC_000005.9", + "GENBANK": "CM000667 .1", + "Length": 180915260, + }, + { + "UCSC": "chr6", + "ENSEMBL": "6", + "REFSEQ": "NC_000006.11", + "GENBANK": "CM000668 .1", + "Length": 171115067, + }, + { + "UCSC": "chr7", + "ENSEMBL": "7", + "REFSEQ": "NC_000007.13", + "GENBANK": "CM000669 .1", + "Length": 159138663, + }, + { + "UCSC": "chr8", + "ENSEMBL": "8", + "REFSEQ": "NC_000008.10", + "GENBANK": "CM000670 .1", + "Length": 146364022, + }, + { + "UCSC": "chr9", + "ENSEMBL": "9", + "REFSEQ": "NC_000009.11", + "GENBANK": "CM000671 .1", + "Length": 141213431, + }, + { + "UCSC": "chr10", + "ENSEMBL": "10", + "REFSEQ": "NC_000010.10", + "GENBANK": "CM000672 .1", + "Length": 135534747, + }, + { + "UCSC": "chr11", + "ENSEMBL": "11", + "REFSEQ": "NC_000011.9", + "GENBANK": "CM000673 .1", + "Length": 135006516, + }, + { + "UCSC": "chr12", + "ENSEMBL": "12", + "REFSEQ": "NC_000012.11", + "GENBANK": "CM000674 .1", + "Length": 133851895, + }, + { + "UCSC": "chr13", + "ENSEMBL": "13", + "REFSEQ": "NC_000013.10", + "GENBANK": "CM000675 .1", + "Length": 115169878, + }, + { + "UCSC": "chr14", + "ENSEMBL": "14", + "REFSEQ": "NC_000014.8", + "GENBANK": "CM000676 .1", + "Length": 107349540, + }, + { + "UCSC": "chr15", + "ENSEMBL": "15", + "REFSEQ": "NC_000015.9", + "GENBANK": "CM000677 .1", + "Length": 102531392, + }, + { + "UCSC": "chr16", + "ENSEMBL": "16", + "REFSEQ": "NC_000016.9", + "GENBANK": "CM000678 .1", + "Length": 90354753, + }, + { + "UCSC": "chr17", + "ENSEMBL": "17", + "REFSEQ": "NC_000017.10", + "GENBANK": "CM000679 .1", + "Length": 81195210, + }, + { + "UCSC": "chr18", + "ENSEMBL": "18", + "REFSEQ": "NC_000018.9", + "GENBANK": "CM000680 .1", + "Length": 78077248, + }, + { + "UCSC": "chr19", + "ENSEMBL": "19", + "REFSEQ": "NC_000019.9", + "GENBANK": "CM000681 .1", + "Length": 59128983, + }, + { + "UCSC": "chr20", + "ENSEMBL": "20", + "REFSEQ": "NC_000020.10", + "GENBANK": "CM000682 .1", + "Length": 63025520, + }, + { + "UCSC": "chr21", + "ENSEMBL": "21", + "REFSEQ": "NC_000021.8", + "GENBANK": "CM000683 .1", + "Length": 48129895, + }, + { + "UCSC": "chr22", + "ENSEMBL": "22", + "REFSEQ": "NC_000022.10", + "GENBANK": "CM000684 .1", + "Length": 51304566, + }, + { + "UCSC": "chrX", + "ENSEMBL": "X", + "REFSEQ": "NC_000023.10", + "GENBANK": "CM000685 .1", + "Length": 155270560, + }, + { + "UCSC": "chrY", + "ENSEMBL": "Y", + "REFSEQ": "NC_000024.9", + "GENBANK": "CM000686 .1", + "Length": 59373566, + }, + { + "UCSC": "chrM", + "ENSEMBL": "MT", + "REFSEQ": "NC_012920.1", + "GENBANK": "J01415.2", + "Length": 16569, + }, + ], + "GRCh38": [ + { + "UCSC": "chr1", + "ENSEMBL": "1", + "REFSEQ": "NC_000001.11", + "GENBANK": "CM000663 .2", + "Length": 248956422, + }, + { + "UCSC": "chr2", + "ENSEMBL": "2", + "REFSEQ": "NC_000002.12", + "GENBANK": "CM000664 .2", + "Length": 242193529, + }, + { + "UCSC": "chr3", + "ENSEMBL": "3", + "REFSEQ": "NC_000003.12", + "GENBANK": "CM000665 .2", + "Length": 198295559, + }, + { + "UCSC": 
"chr4", + "ENSEMBL": "4", + "REFSEQ": "NC_000004.12", + "GENBANK": "CM000666 .2", + "Length": 190214555, + }, + { + "UCSC": "chr5", + "ENSEMBL": "5", + "REFSEQ": "NC_000005.10", + "GENBANK": "CM000667 .2", + "Length": 181538259, + }, + { + "UCSC": "chr6", + "ENSEMBL": "6", + "REFSEQ": "NC_000006.12", + "GENBANK": "CM000668 .2", + "Length": 170805979, + }, + { + "UCSC": "chr7", + "ENSEMBL": "7", + "REFSEQ": "NC_000007.14", + "GENBANK": "CM000669 .2", + "Length": 159345973, + }, + { + "UCSC": "chr8", + "ENSEMBL": "8", + "REFSEQ": "NC_000008.11", + "GENBANK": "CM000670 .2", + "Length": 145138636, + }, + { + "UCSC": "chr9", + "ENSEMBL": "9", + "REFSEQ": "NC_000009.12", + "GENBANK": "CM000671 .2", + "Length": 138394717, + }, + { + "UCSC": "chr10", + "ENSEMBL": "10", + "REFSEQ": "NC_000010.11", + "GENBANK": "CM000672 .2", + "Length": 133797422, + }, + { + "UCSC": "chr11", + "ENSEMBL": "11", + "REFSEQ": "NC_000011.10", + "GENBANK": "CM000673 .2", + "Length": 135086622, + }, + { + "UCSC": "chr12", + "ENSEMBL": "12", + "REFSEQ": "NC_000012.12", + "GENBANK": "CM000674 .2", + "Length": 133275309, + }, + { + "UCSC": "chr13", + "ENSEMBL": "13", + "REFSEQ": "NC_000013.11", + "GENBANK": "CM000675 .2", + "Length": 114364328, + }, + { + "UCSC": "chr14", + "ENSEMBL": "14", + "REFSEQ": "NC_000014.9", + "GENBANK": "CM000676 .2", + "Length": 107043718, + }, + { + "UCSC": "chr15", + "ENSEMBL": "15", + "REFSEQ": "NC_000015.10", + "GENBANK": "CM000677 .2", + "Length": 101991189, + }, + { + "UCSC": "chr16", + "ENSEMBL": "16", + "REFSEQ": "NC_000016.10", + "GENBANK": "CM000678 .2", + "Length": 90338345, + }, + { + "UCSC": "chr17", + "ENSEMBL": "17", + "REFSEQ": "NC_000017.11", + "GENBANK": "CM000679 .2", + "Length": 83257441, + }, + { + "UCSC": "chr18", + "ENSEMBL": "18", + "REFSEQ": "NC_000018.10", + "GENBANK": "CM000680 .2", + "Length": 80373285, + }, + { + "UCSC": "chr19", + "ENSEMBL": "19", + "REFSEQ": "NC_000019.10", + "GENBANK": "CM000681 .2", + "Length": 58617616, + }, + { + "UCSC": "chr20", + "ENSEMBL": "20", + "REFSEQ": "NC_000020.11", + "GENBANK": "CM000682 .2", + "Length": 64444167, + }, + { + "UCSC": "chr21", + "ENSEMBL": "21", + "REFSEQ": "NC_000021.9", + "GENBANK": "CM000683 .2", + "Length": 46709983, + }, + { + "UCSC": "chr22", + "ENSEMBL": "22", + "REFSEQ": "NC_000022.11", + "GENBANK": "CM000684 .2", + "Length": 50818468, + }, + { + "UCSC": "chrX", + "ENSEMBL": "X", + "REFSEQ": "NC_000023.11", + "GENBANK": "CM000685 .2", + "Length": 156040895, + }, + { + "UCSC": "chrY", + "ENSEMBL": "Y", + "REFSEQ": "NC_000024.10", + "GENBANK": "CM000686 .2", + "Length": 57227415, + }, + { + "UCSC": "chrM", + "ENSEMBL": "MT", + "REFSEQ": "NC_012920.1", + "GENBANK": "J01415.2", + "Length": 16569, + }, + ], + "GRCm38": [ + { + "UCSC": "chr1", + "ENSEMBL": "1", + "REFSEQ": "NC_000067.6", + "GENBANK": "CM000994 .2", + "Length": 195471971, + }, + { + "UCSC": "chr2", + "ENSEMBL": "2", + "REFSEQ": "NC_000068.7", + "GENBANK": "CM000995 .2", + "Length": 182113224, + }, + { + "UCSC": "chr3", + "ENSEMBL": "3", + "REFSEQ": "NC_000069.6", + "GENBANK": "CM000996 .2", + "Length": 160039680, + }, + { + "UCSC": "chr4", + "ENSEMBL": "4", + "REFSEQ": "NC_000070.6", + "GENBANK": "CM000997 .2", + "Length": 156508116, + }, + { + "UCSC": "chr5", + "ENSEMBL": "5", + "REFSEQ": "NC_000071.6", + "GENBANK": "CM000998 .2", + "Length": 151834684, + }, + { + "UCSC": "chr6", + "ENSEMBL": "6", + "REFSEQ": "NC_000072.6", + "GENBANK": "CM000999 .2", + "Length": 149736546, + }, + { + "UCSC": "chr7", + "ENSEMBL": "7", + "REFSEQ": "NC_000073.6", + 
"GENBANK": "CM001000 .2", + "Length": 145441459, + }, + { + "UCSC": "chr8", + "ENSEMBL": "8", + "REFSEQ": "NC_000074.6", + "GENBANK": "CM001001 .2", + "Length": 129401213, + }, + { + "UCSC": "chr9", + "ENSEMBL": "9", + "REFSEQ": "NC_000075.6", + "GENBANK": "CM001002 .2", + "Length": 124595110, + }, + { + "UCSC": "chr10", + "ENSEMBL": "10", + "REFSEQ": "NC_000076.6", + "GENBANK": "CM001003 .2", + "Length": 130694993, + }, + { + "UCSC": "chr11", + "ENSEMBL": "11", + "REFSEQ": "NC_000077.6", + "GENBANK": "CM001004 .2", + "Length": 122082543, + }, + { + "UCSC": "chr12", + "ENSEMBL": "12", + "REFSEQ": "NC_000078.6", + "GENBANK": "CM001005 .2", + "Length": 120129022, + }, + { + "UCSC": "chr13", + "ENSEMBL": "13", + "REFSEQ": "NC_000079.6", + "GENBANK": "CM001006 .2", + "Length": 120421639, + }, + { + "UCSC": "chr14", + "ENSEMBL": "14", + "REFSEQ": "NC_000080.6", + "GENBANK": "CM001007 .2", + "Length": 124902244, + }, + { + "UCSC": "chr15", + "ENSEMBL": "15", + "REFSEQ": "NC_000081.6", + "GENBANK": "CM001008 .2", + "Length": 104043685, + }, + { + "UCSC": "chr16", + "ENSEMBL": "16", + "REFSEQ": "NC_000082.6", + "GENBANK": "CM001009 .2", + "Length": 98207768, + }, + { + "UCSC": "chr17", + "ENSEMBL": "17", + "REFSEQ": "NC_000083.6", + "GENBANK": "CM001010 .2", + "Length": 94987271, + }, + { + "UCSC": "chr18", + "ENSEMBL": "18", + "REFSEQ": "NC_000084.6", + "GENBANK": "CM001011 .2", + "Length": 90702639, + }, + { + "UCSC": "chr19", + "ENSEMBL": "19", + "REFSEQ": "NC_000085.6", + "GENBANK": "CM001012 .2", + "Length": 61431566, + }, + { + "UCSC": "chrX", + "ENSEMBL": "X", + "REFSEQ": "NC_000086.7", + "GENBANK": "CM001013 .2", + "Length": 171031299, + }, + { + "UCSC": "chrY", + "ENSEMBL": "Y", + "REFSEQ": "NC_000087.7", + "GENBANK": "CM001014 .2", + "Length": 91744698, + }, + { + "UCSC": "chrM", + "ENSEMBL": "MT", + "REFSEQ": "NC_005089.1", + "GENBANK": "AY172335.1", + "Length": 16299, + }, + ], + "GRCm39": [ + { + "UCSC": "chr1", + "ENSEMBL": "1", + "REFSEQ": "NC_000067.7", + "GENBANK": "CM000994 .3", + "Length": 195154279, + }, + { + "UCSC": "chr2", + "ENSEMBL": "2", + "REFSEQ": "NC_000068.8", + "GENBANK": "CM000995 .3", + "Length": 181755017, + }, + { + "UCSC": "chr3", + "ENSEMBL": "3", + "REFSEQ": "NC_000069.7", + "GENBANK": "CM000996 .3", + "Length": 159745316, + }, + { + "UCSC": "chr4", + "ENSEMBL": "4", + "REFSEQ": "NC_000070.7", + "GENBANK": "CM000997 .3", + "Length": 156860686, + }, + { + "UCSC": "chr5", + "ENSEMBL": "5", + "REFSEQ": "NC_000071.7", + "GENBANK": "CM000998 .3", + "Length": 151758149, + }, + { + "UCSC": "chr6", + "ENSEMBL": "6", + "REFSEQ": "NC_000072.7", + "GENBANK": "CM000999 .3", + "Length": 149588044, + }, + { + "UCSC": "chr7", + "ENSEMBL": "7", + "REFSEQ": "NC_000073.7", + "GENBANK": "CM001000 .3", + "Length": 144995196, + }, + { + "UCSC": "chr8", + "ENSEMBL": "8", + "REFSEQ": "NC_000074.7", + "GENBANK": "CM001001 .3", + "Length": 130127694, + }, + { + "UCSC": "chr9", + "ENSEMBL": "9", + "REFSEQ": "NC_000075.7", + "GENBANK": "CM001002 .3", + "Length": 124359700, + }, + { + "UCSC": "chr10", + "ENSEMBL": "10", + "REFSEQ": "NC_000076.7", + "GENBANK": "CM001003 .3", + "Length": 130530862, + }, + { + "UCSC": "chr11", + "ENSEMBL": "11", + "REFSEQ": "NC_000077.7", + "GENBANK": "CM001004 .3", + "Length": 121973369, + }, + { + "UCSC": "chr12", + "ENSEMBL": "12", + "REFSEQ": "NC_000078.7", + "GENBANK": "CM001005 .3", + "Length": 120092757, + }, + { + "UCSC": "chr13", + "ENSEMBL": "13", + "REFSEQ": "NC_000079.7", + "GENBANK": "CM001006 .3", + "Length": 120883175, + }, + { + "UCSC": 
"chr14", + "ENSEMBL": "14", + "REFSEQ": "NC_000080.7", + "GENBANK": "CM001007 .3", + "Length": 125139656, + }, + { + "UCSC": "chr15", + "ENSEMBL": "15", + "REFSEQ": "NC_000081.7", + "GENBANK": "CM001008 .3", + "Length": 104073951, + }, + { + "UCSC": "chr16", + "ENSEMBL": "16", + "REFSEQ": "NC_000082.7", + "GENBANK": "CM001009 .3", + "Length": 98008968, + }, + { + "UCSC": "chr17", + "ENSEMBL": "17", + "REFSEQ": "NC_000083.7", + "GENBANK": "CM001010 .3", + "Length": 95294699, + }, + { + "UCSC": "chr18", + "ENSEMBL": "18", + "REFSEQ": "NC_000084.7", + "GENBANK": "CM001011 .3", + "Length": 90720763, + }, + { + "UCSC": "chr19", + "ENSEMBL": "19", + "REFSEQ": "NC_000085.7", + "GENBANK": "CM001012 .3", + "Length": 61420004, + }, + { + "UCSC": "chrX", + "ENSEMBL": "X", + "REFSEQ": "NC_000086.8", + "GENBANK": "CM001013 .3", + "Length": 169476592, + }, + { + "UCSC": "chrY", + "ENSEMBL": "Y", + "REFSEQ": "NC_000087.8", + "GENBANK": "CM001014 .3", + "Length": 91455967, + }, + { + "UCSC": "chrM", + "ENSEMBL": "MT", + "REFSEQ": "NC_005089.1", + "GENBANK": "AY172335.1", + "Length": 16299, + }, + ], + "T2T-CHM13v2": [ + { + "UCSC": "chr1", + "ENSEMBL": "1", + "REFSEQ": "NC_060925.1", + "GENBANK": "CP068277.2", + "Length": 248387328, + }, + { + "UCSC": "chr2", + "ENSEMBL": "2", + "REFSEQ": "NC_060926.1", + "GENBANK": "CP068276.2", + "Length": 242696752, + }, + { + "UCSC": "chr3", + "ENSEMBL": "3", + "REFSEQ": "NC_060927.1", + "GENBANK": "CP068275.2", + "Length": 201105948, + }, + { + "UCSC": "chr4", + "ENSEMBL": "4", + "REFSEQ": "NC_060928.1", + "GENBANK": "CP068274.2", + "Length": 193574945, + }, + { + "UCSC": "chr5", + "ENSEMBL": "5", + "REFSEQ": "NC_060929.1", + "GENBANK": "CP068273.2", + "Length": 182045439, + }, + { + "UCSC": "chr6", + "ENSEMBL": "6", + "REFSEQ": "NC_060930.1", + "GENBANK": "CP068272.2", + "Length": 172126628, + }, + { + "UCSC": "chr7", + "ENSEMBL": "7", + "REFSEQ": "NC_060931.1", + "GENBANK": "CP068271.2", + "Length": 160567428, + }, + { + "UCSC": "chr8", + "ENSEMBL": "8", + "REFSEQ": "NC_060932.1", + "GENBANK": "CP068270.2", + "Length": 146259331, + }, + { + "UCSC": "chr9", + "ENSEMBL": "9", + "REFSEQ": "NC_060933.1", + "GENBANK": "CP068269.2", + "Length": 150617247, + }, + { + "UCSC": "chr10", + "ENSEMBL": "10", + "REFSEQ": "NC_060934.1", + "GENBANK": "CP068268.2", + "Length": 134758134, + }, + { + "UCSC": "chr11", + "ENSEMBL": "11", + "REFSEQ": "NC_060935.1", + "GENBANK": "CP068267.2", + "Length": 135127769, + }, + { + "UCSC": "chr12", + "ENSEMBL": "12", + "REFSEQ": "NC_060936.1", + "GENBANK": "CP068266.2", + "Length": 133324548, + }, + { + "UCSC": "chr13", + "ENSEMBL": "13", + "REFSEQ": "NC_060937.1", + "GENBANK": "CP068265.2", + "Length": 113566686, + }, + { + "UCSC": "chr14", + "ENSEMBL": "14", + "REFSEQ": "NC_060938.1", + "GENBANK": "CP068264.2", + "Length": 101161492, + }, + { + "UCSC": "chr15", + "ENSEMBL": "15", + "REFSEQ": "NC_060939.1", + "GENBANK": "CP068263.2", + "Length": 99753195, + }, + { + "UCSC": "chr16", + "ENSEMBL": "16", + "REFSEQ": "NC_060940.1", + "GENBANK": "CP068262.2", + "Length": 96330374, + }, + { + "UCSC": "chr17", + "ENSEMBL": "17", + "REFSEQ": "NC_060941.1", + "GENBANK": "CP068261.2", + "Length": 84276897, + }, + { + "UCSC": "chr18", + "ENSEMBL": "18", + "REFSEQ": "NC_060942.1", + "GENBANK": "CP068260.2", + "Length": 80542538, + }, + { + "UCSC": "chr19", + "ENSEMBL": "19", + "REFSEQ": "NC_060943.1", + "GENBANK": "CP068259.2", + "Length": 61707364, + }, + { + "UCSC": "chr20", + "ENSEMBL": "20", + "REFSEQ": "NC_060944.1", + "GENBANK": "CP068258.2", + 
"Length": 66210255, + }, + { + "UCSC": "chr21", + "ENSEMBL": "21", + "REFSEQ": "NC_060945.1", + "GENBANK": "CP068257.2", + "Length": 45090682, + }, + { + "UCSC": "chr22", + "ENSEMBL": "22", + "REFSEQ": "NC_060946.1", + "GENBANK": "CP068256.2", + "Length": 51324926, + }, + { + "UCSC": "chrX", + "ENSEMBL": "X", + "REFSEQ": "NC_060947.1", + "GENBANK": "CP068255.2", + "Length": 154259566, + }, + { + "UCSC": "chrY", + "ENSEMBL": "Y", + "REFSEQ": "NC_060948.1", + "GENBANK": "CP086569.2", + "Length": 62460029, + }, + { + "UCSC": "chrM", + "ENSEMBL": "MT", + "REFSEQ": "NC_012920.1", + "GENBANK": "J01415.2", + "Length": 16569, + }, + ], +} diff --git a/snappy_wrappers/tools/genome_windows.py b/snappy_wrappers/tools/genome_windows.py index 2236d6c42..691bf904f 100644 --- a/snappy_wrappers/tools/genome_windows.py +++ b/snappy_wrappers/tools/genome_windows.py @@ -16,7 +16,7 @@ import sys from pathlib import Path - +from typing import Iterator # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its @@ -88,23 +88,17 @@ def yield_regions(fai_file, window_size, subtract_end=0, ignore_chroms=None, pad begin = end -def ignore_chroms(path_ref: str, ignored: set[str] = [], return_ignored: bool = False): - path_ref = Path(path_ref).resolve() - if Path(str(path_ref) + ".fai").exists(): - contigs = _parse_index(Path(str(path_ref) + ".fai"), PATTERN_FAI) - elif Path(str(path_ref) + ".genome").exists(): - contigs = _parse_index(Path(str(path_ref) + ".genome"), PATTERN_GENOME) - elif path_ref.with_suffix("dict").exists(): - contigs = _parse_index(path_ref.with_suffix("dict"), PATTERN_DICT, True) - else: - contigs = _read_fasta(path_ref) - for contig_name, contig_length in contigs: - m = matches_any(contig_name, ignored) - if (m and return_ignored) or (not m and not return_ignored): - yield contig_name, contig_length +def yield_contigs_and_lengths( + filename: Path, pattern: re.Pattern, allow_mismatch: bool = False +) -> Iterator[tuple[str, int]]: + """Yields contig names & lengths from regex pattern matching of sequence dictionary records + :param filename: path to the sequence dictionary file (``*.fai``, ``*.genome`` or ``*.dict``) + :param pattern: regular expression pattern (compiled) to extract contig name & length from sequence dictionary record + :param allow_mismatch: when true, records that don't match the pattern are allowed, otherwise they raise an exception -def _parse_index(filename: Path, pattern: re.Pattern, allow_mismatch: bool = False): + :returns: An iterator giving sequence of names and lengths for all contigs + """ with open(filename, "rt") as f: for line in f: line = line.strip() @@ -119,7 +113,13 @@ def _parse_index(filename: Path, pattern: re.Pattern, allow_mismatch: bool = Fal raise ValueError(f"Unexpected record '{line}' in reference file '{filename}'") -def _read_fasta(filename: Path): +def yield_contigs_and_lengths_from_sequence(filename: Path) -> Iterator[tuple[str, int]]: + """Yields contig names & lengths from parsing the reference sequence + + :param filename: path to the reference sequence in ``fasta`` format + + :returns: An iterator giving sequence of names and lengths for all contigs + """ contig_name = None contig_length = None with open(filename, "rt") as f: @@ -140,6 +140,47 @@ def _read_fasta(filename: Path): yield contig_name, contig_length +def yield_contigs_and_lengths_from_ref(path_ref: str) -> Iterator[tuple[str, int]]: + """Yields all contig names & lengths in the reference 
sequence
+
+    :param path_ref: path to the reference sequence
+
+    :returns: An iterator giving sequence of names and lengths for all contigs
+
+    The contig names & lengths are obtained from the sequence dictionary files when possible.
+    The order is ``*.fai``, ``*.genome``, ``*.dict`` (replacing the ``.fasta`` or ``.fa`` extension).
+    When none of these files is available, then the sequence itself is used.
+
+    TODO: Add compressed files ``.gz`` & ``.bgz``.
+    """
+    path_ref = Path(path_ref).resolve()
+    if Path(str(path_ref) + ".fai").exists():
+        return yield_contigs_and_lengths(Path(str(path_ref) + ".fai"), PATTERN_FAI)
+    elif Path(str(path_ref) + ".genome").exists():
+        return yield_contigs_and_lengths(Path(str(path_ref) + ".genome"), PATTERN_GENOME)
+    elif path_ref.with_suffix(".dict").exists():
+        return yield_contigs_and_lengths(path_ref.with_suffix(".dict"), PATTERN_DICT, True)
+    else:
+        return yield_contigs_and_lengths_from_sequence(path_ref)
+
+
+def ignore_chroms(
+    path_ref: str, ignored: set[str] = [], return_ignored: bool = False
+) -> Iterator[tuple[str, int]]:
+    """Yields contig names & lengths matching or excluding a set of patterns.
+
+    :param path_ref: path to the reference sequence
+    :param ignored: set of patterns identifying the contigs to be ignored
+    :param return_ignored: selects which set of contigs to return: those whose names don't match any pattern, or those whose names match at least one pattern.
+
+    :returns: An iterator giving sequence of names and lengths for all contigs to use or to ignore (depending on ``return_ignored``)
+    """
+    for contig_name, contig_length in yield_contigs_and_lengths_from_ref(path_ref):
+        m = matches_any(contig_name, ignored)
+        if (m and return_ignored) or (not m and not return_ignored):
+            yield contig_name, contig_length
+
+
 def run(args):
     """Main entry point after parsing command line arguments"""
     yielded = 0
diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
index 4afbbaf99..33a743db2 100644
--- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
@@ -10,36 +10,28 @@
 base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
 sys.path.insert(0, base_dir)
 
-from snappy_wrappers.tools.genome_windows import ignore_chroms
 from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
 
 __author__ = "Eric Blanc"
 __email__ = "eric.blanc@bih-charite.de"
 
 args = snakemake.params.get("args", {})
-
-prefix = ""
+exclude = args.get("exclude", [])
 
 # Add the "ignore_chrom" contents to the excluded regions
-if len(args.get("ignore_chroms", [])) > 0:
-    ignored_contigs = ignore_chroms(args["reference"], args["ignore_chroms"], return_ignored=True)
-    lines = ["cat << __EOF > $TMPDIR/ignore_chroms.bed"]
-    for (contig_name, contig_length) in ignored_contigs:
-        lines.append(f"{contig_name}\t0\t{contig_length}")
-    lines.append("__EOF")
-    prefix = "\n".join(lines) + "\n"
-    args["exclude"].append("$TMPDIR/ignore_chroms.bed")
+if snakemake.input.get("ignore_chroms", None) is not None:
+    exclude.append(snakemake.input.get("ignore_chroms"))
 
 cmd = r"""
 cnvkit.py access \
     -o {snakemake.output.access} \
     {min_gap_size} {exclude} \
-    {args[reference]}
+    {snakemake.input.reference}
 """.format(
     snakemake=snakemake,
     args=args,
     min_gap_size=f"--min-gap-size {args['min-gap-size']}" if args.get("min-gap-size", None) is not None else "",
+    exclude=" 
".join([f"--exclude {x}" for x in exclude]), ) -CnvkitWrapper(snakemake, prefix + cmd).run() +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index 5fb01d78b..d70c9a297 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -21,11 +21,12 @@ cnvkit.py antitarget \ -o {snakemake.output.antitarget} \ --avg-size {args[avg-size]} {min_size} \ - --access {args[access]} \ - {args[target]} + {access} \ + {snakemake.input.target} """.format( snakemake=snakemake, args=args, + access=f"--access {snakemake.input.access}" if snakemake.input.get("access", None) is not None else "", min_size=f"--min-size {args['min-size']}" if args.get("min-size") is not None else "", ) diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py index 5020ac227..5d9def018 100644 --- a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py @@ -22,15 +22,15 @@ {out_target} {out_antitarget} \ {access} {target} \ --bp-per-bin {args[bp-per-bin]} \ - {args[bams]} \ + {snakemake.input.bams} \ > {snakemake.output.result} """.format( snakemake=snakemake, args=args, out_target=f"--target-output-bed {snakemake.output.target}" if snakemake.output.get("target", "") != "" else "", out_antitarget=f"--antitarget-output-bed {snakemake.output.antitarget}" if snakemake.output.get("antitarget", "") != "" else "", - access=f"--access {args['access']}" if args.get("access", None) is not None else "", - target=f"--targets {args['target']}" if args.get("target", None) is not None else "", + access=f"--access {snakemake.input.access}" if snakemake.input.get("access", None) is not None else "", + target=f"--targets {snakemake.input.target}" if snakemake.input.get("target", None) is not None else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py b/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py index 0ea46cf30..cfca4ecbe 100644 --- a/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py @@ -20,9 +20,9 @@ cmd = r""" cnvkit.py bintest \ -o {snakemake.output.tests} \ - --segment {args[segments]} \ + --segment {snakemake.input.segments} \ --alpha {args[alpha]} {target} \ - {args[ratios]} + {snakemake.input.ratios} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py index 7cf317aa6..757e1e909 100644 --- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py @@ -2,7 +2,6 @@ """Wrapper vor cnvkit.py call""" import os -import re import sys # The following is required for being able to import snappy_wrappers modules @@ -15,12 +14,9 @@ args = snakemake.params.get("args", {}) -PATTERN = re.compile("^(Purity|Ploidy): +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) 
*$")
-
-
-if "variants" in args:
+if snakemake.input.get("variants", None) is not None:
     variants = r"""
-    ---vcf {args[variants]} \
+    --vcf {snakemake.input.variants} \
     --sample-id {args[sample-id]} --normal-id {args[normal-id]} \
     --min-variant-depth {args[min-variant-depth]} {zygocity_freq}
 """.format(
@@ -39,7 +35,7 @@
     {center} {center_at} {drop_low_coverage} {sample_sex} {male_reference} {diploid_parx_genome} \
     {purity} {ploidy} \
     {variants} \
-    {args[segments]}
+    {snakemake.input.segments}
 """.format(
     snakemake=snakemake,
     args=args,
@@ -47,7 +43,7 @@
     purity=f"--purity {args['purity']}" if args.get("purity", None) is not None else "",
     ploidy=f"--ploidy {args['ploidy']}" if args.get("ploidy", None) is not None else "",
     thresholds="--thresholds={}".format(",".join(map(str, args["thresholds"]))) if len(args.get("thresholds", [])) > 0 else "",
-    filter=f"--filter {args['filter']}" if args.get("filter", None) is not None else "",
+    filter="--filter {}".format(" ".join(args["filter"])) if len(args.get("filter", [])) > 0 else "",
     center=f"--center {args['center']}" if args.get("center", None) is not None else "",
     center_at=f"--center-at {args['center-at']}" if args.get("center-at", None) is not None else "",
     drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "",
diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py
index 303127160..569da948f 100644
--- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py
@@ -20,9 +20,9 @@
 cnvkit.py coverage --processes {snakemake.resources._cores} \
     -o {snakemake.output.coverage} \
-    --fasta {args[reference]} \
+    --fasta {snakemake.input.reference} \
     --min-mapq {args[min-mapq]} {count} \
-    {args[bam]} {args[intervals]}
+    {snakemake.input.bam} {snakemake.input.intervals}
 """.format(
     snakemake=snakemake,
     args=args,
diff --git a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py
index a5b56c8e4..a79f2c978 100644
--- a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py
@@ -18,20 +18,23 @@
 args = snakemake.params.get("args", {})
 
 # Fix requires empty antitarget file in WGS & Panel modes
-create_dummy_antitarget = ""
-if args.get("antitarget", "") == "":
-    args["antitarget"] = "$TMPDIR/antitarget.bed"
-    create_dummy_antitarget = f"touch {args['antitarget']} ; "
+if snakemake.input.get("antitarget", None) is None:
+    antitarget = "$TMPDIR/antitarget.bed"
+    create_dummy_antitarget = f"touch {antitarget} ; "
+else:
+    antitarget = snakemake.input.antitarget
+    create_dummy_antitarget = ""
 
 cmd = r"""
 cnvkit.py fix \
     -o {snakemake.output.ratios} \
     {cluster} --sample-id {args[sample-id]} \
     {no_gc} {no_edge} {no_rmask} \
-    {args[target]} {args[antitarget]} {args[reference]}
+    {snakemake.input.target} {antitarget} {snakemake.input.reference}
 """.format(
     snakemake=snakemake,
     args=args,
+    antitarget=antitarget,
     cluster="--cluster" if args.get("cluster", False) else "",
     no_gc="--no-gc" if args.get("no-gc", False) else "",
     no_edge="--no-edge" if args.get("no-edge", False) else "",
diff --git a/snappy_wrappers/wrappers/cnvkit/ignore/environment.yaml b/snappy_wrappers/wrappers/cnvkit/ignore/environment.yaml
new file mode 120000
index 000000000..2e107ac86
--- /dev/null
+++ b/snappy_wrappers/wrappers/cnvkit/ignore/environment.yaml
@@ -0,0 +1 @@
+../environment.yaml
\ No newline at end of file
diff --git a/snappy_wrappers/wrappers/cnvkit/ignore/wrapper.py b/snappy_wrappers/wrappers/cnvkit/ignore/wrapper.py
new file mode 100644
index 000000000..10743b88b
--- /dev/null
+++ b/snappy_wrappers/wrappers/cnvkit/ignore/wrapper.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""Wrapper writing the BED file of ignored contigs (to be excluded by cnvkit.py access)"""
+
+import os
+import sys
+
+# The following is required for being able to import snappy_wrappers modules
+# inside wrappers. These run in an "inner" snakemake process which uses its
+# own conda environment which cannot see the snappy_pipeline installation.
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+sys.path.insert(0, base_dir)
+
+from snappy_wrappers.tools.genome_windows import ignore_chroms
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
+
+__author__ = "Eric Blanc"
+__email__ = "eric.blanc@bih-charite.de"
+
+args = snakemake.params.get("args", {})
+
+ignored_contigs = ignore_chroms(snakemake.input.reference, args["ignore_chroms"], return_ignored=True)
+lines = []
+for (contig_name, contig_length) in ignored_contigs:
+    lines.append(f"{contig_name}\t0\t{contig_length}")
+lines = "\n".join(lines)
+
+cmd = f"""
+cat << __EOF > {snakemake.output.ignore_chroms}
+{lines}
+__EOF
+"""
+
+CnvkitWrapper(snakemake, cmd).run()
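
# Editorial sketch, not part of the patch: with ignore patterns such as
# ["chrM", "GL*"], the wrapper above writes one whole-contig BED line per
# ignored contig, e.g. (length taken from the GRCh37 table in
# chromosome_lengths.py):
#
#     chrM    0   16569
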
diff --git a/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py
index c4b475f1b..bd3f14201 100644
--- a/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 """Wrapper for cnvkit.py scatter"""
 
+import csv
 import os
-import re
 import sys
 
 # The following is required for being able to import snappy_wrappers modules
@@ -15,9 +15,33 @@
 
 args = snakemake.params.get("args", {})
 
-if "variants" in args:
+# Fix chromosome name prefix
+if args.get("chromosome", None) is not None:
+    chromosome = args["chromosome"]
+    if chromosome.startswith("chr"):
+        ucsc = chromosome
+        ensembl = chromosome[3:]
+        if ensembl == "M":
+            ensembl = "MT"
+    else:
+        ucsc = f"chr{chromosome}"
+        ensembl = chromosome
+        if ucsc == "chrMT":
+            ucsc = "chrM"
+
+    with open(snakemake.input.segments, "rt") as f:
+        reader = csv.DictReader(f, delimiter="\t")
+        for record in reader:
+            if ucsc == record["chromosome"]:
+                args["chromosome"] = ucsc
+                break
+            if ensembl == record["chromosome"]:
+                args["chromosome"] = ensembl
+                break
+
+if snakemake.input.get("variants", None) is not None:
     variants = r"""
-    ---vcf {args[variants]} \
+    --vcf {snakemake.input.variants} \
     --sample-id {args[sample-id]} --normal-id {args[normal-id]} \
     --min-variant-depth {args[min-variant-depth]} {zygocity_freq}
 """.format(
@@ -31,21 +55,21 @@
 cmd = r"""
 cnvkit.py scatter \
     -o {snakemake.output.plot} \
-    --segment {args[segments]} \
+    --segment {snakemake.input.segments} \
     {chromosome} {gene} {range_list} \
     --width {args[width]} \
     --antitarget-marker {args[antitarget-marker]} --segment-color {args[segment-color]} \
     {by_bin} {trend} --title "{args[title]}" \
     {y_min} {y_max} {fig_size} \
     {variants} \
-    {args[ratios]}
+    {snakemake.input.ratios}
 """.format(
     snakemake=snakemake,
     args=args,
     variants=variants,
     chromosome=f"--chromosome {args['chromosome']}" if args.get("chromosome", None) is not None else "",
     gene=f"--gene {args['gene']}" if args.get("gene", None) is not None else "",
-    range_list=f"--range-list {args['range-list']}" if args.get("range-list", None) is not None else "",
+    range_list=f"--range-list 
{snakemake.input.range_list}" if snakemake.input.get("range_list", None) is not None else "", by_bin="--by-bin" if args.get("by-bin", False) else "", trend="--trend" if args.get("trend", False) else "", y_min=f"--y-min {args['y-min']}" if args.get("y-min", None) is not None else "", diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index 6a33406da..2dc029871 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -17,13 +17,10 @@ args = snakemake.params.get("args", {}) -target = f"--target {args['target']}" if "target" in args else "" -antitarget = f"--antitarget {args['antitarget']}" if "antitarget" in args else "" - cmd = r""" cnvkit.py reference \ -o {snakemake.output.reference} \ - --fasta {args[reference]} \ + --fasta {snakemake.input.reference} \ {cluster} {min_cluster_size} \ {sample_sex} {male_reference} {diploid_parx_genome} \ {no_gc} {no_edge} {no_rmask} \ @@ -31,9 +28,9 @@ """.format( snakemake=snakemake, args=args, - target=target, - antitarget=antitarget, - normals=" ".join(args["normals"]) if len(args.get("normals", [])) > 0 else "", + target=f"--target {snakemake.input.target}" if snakemake.input.get("target", None) is not None else "", + antitarget=f"--antitarget {snakemake.input.antitarget}" if snakemake.input.get("antitarget", None) is not None else "", + normals=" ".join(snakemake.input.normals) if len(snakemake.input.normals) > 0 else "", cluster="--cluster" if args.get("cluster", False) else "", male_reference="--male-reference" if args.get("male-reference", False) else "", no_gc="--no-gc" if args.get("no-gc", False) else "", diff --git a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py index f9c370b36..4d5826063 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py @@ -17,11 +17,11 @@ cmd = r""" cnvkit.py genemetrics \ -o {snakemake.output.report} \ - --segment {args[segments]} \ + --segment {snakemake.input.segments} \ --threshold {args[threshold]} --min-probes {args[min-probes]} \ {drop_low_coverage} {male_reference} {sample_sex} {diploid_parx_genome} \ {stats} \ - {args[ratios]} + {snakemake.input.ratios} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py index d4a2fdc92..2fbb72c9b 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py @@ -19,13 +19,13 @@ cnvkit.py metrics \ -o {snakemake.output.report} \ {drop_low_coverage} \ - {args[ratios]} \ - --segment {segments} + {snakemake.input.ratios} \ + {segments} """.format( snakemake=snakemake, args=args, drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", - segments=" ".join(args["segments"]), + segments=f"--segments {snakemake.input.segments}" if snakemake.input.get("segments", None) is not None else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py index 76c6831e0..99330115e 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py @@ -17,11 +17,11 @@ cmd = r""" 
 cnvkit.py segmetrics \
     -o {snakemake.output.report} \
-    --segment {args[segments]} \
+    --segment {snakemake.input.segments} \
     --alpha {args[alpha]} --bootstrap {args[bootstrap]} {smooth_bootstrap} \
     {drop_low_coverage} \
     {stats} \
-    {args[ratios]}
+    {snakemake.input.ratios}
 """.format(
     snakemake=snakemake,
     args=args,
diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
index 240cdfa28..614e4b458 100644
--- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
@@ -14,11 +14,11 @@
 
 args = snakemake.params.get("args", {})
 
-if "variants" in args:
+if snakemake.input.get("variants", None) is not None:
     variants = r"""
-        ---vcf {args[variants]} \
+        --vcf {snakemake.input.variants} \
         --sample-id {args[sample-id]} --normal-id {args[normal-id]} \
-        {args[min-variant-depth]} {zygocity_freq}
+        --min-variant-depth {args[min-variant-depth]} {zygocity_freq}
     """.format(
         snakemake=snakemake,
         args=args,
@@ -33,7 +33,7 @@
     --method {args[method]} --threshold {args[threshold]} {smooth_cbs} \
     {drop_low_coverage} --drop-outliers {args[drop-outliers]} \
     {variants} \
-    {args[ratios]}
+    {snakemake.input.ratios}
 """.format(
     snakemake=snakemake,
     args=args,
diff --git a/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py b/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py
index 1869d8040..3fb4d36c0 100644
--- a/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py
@@ -27,7 +27,7 @@
     snakemake=snakemake,
     args=args,
     diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if args.get('diploid-parx-genome', None) is not None else "",
-    coverages=" ".join(args["coverages"]),
+    coverages=" ".join(snakemake.input.coverages),
 )
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py
index 37e1bd9c2..c05aed668 100644
--- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py
@@ -22,13 +22,13 @@
 cnvkit.py target \
     -o {snakemake.output.target} \
     {avg_size} {split} {annotate} {short_names} \
-    {args[interval]}
+    {snakemake.input.interval}
 """.format(
     snakemake=snakemake,
     args=args,
     avg_size=f"--avg-size {args['avg-size']}" if args.get("avg-size", None) is not None else "",
     split=f"--split" if args.get("split", False) else "",
-    annotate=f"--annotate {args['annotate']}" if args.get("annotate", None) is not None else "",
+    annotate=f"--annotate {snakemake.input.annotate}" if snakemake.input.get("annotate", None) is not None else "",
     short_names="--short-names" if args.get("short-names", False) else "",
 )
diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py
index 5ce31759d..ab57d8cee 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py
@@ -345,21 +345,43 @@ def test_purecn_step_part_get_resource_usage(panel_of_normals_workflow):
 # Tests for CnvkitStepPart ------------------------------------------------------------------------
 
 
-def test_cnvkit_step_part_get_args_access(panel_of_normals_workflow):
-    """Tests CnvkitStepPart._get_args_access()"""
+def test_cnvkit_step_part_get_input_files_ignore(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_ignore()"""
     wildcards = Wildcards(fromdict={"mapper": "bwa"})
-    actual = panel_of_normals_workflow.get_args("cnvkit", "access")(
-        wildcards,
-        panel_of_normals_workflow.get_input_files("cnvkit", "access")(wildcards),
-    )
-    if actual.get("ignore_chroms", None) is not None:
-        actual["ignore_chroms"].sort()
-    expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": [], "ignore_chroms": ["GL*", "MT"]}
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "ignore")(wildcards)
+    expected = {"reference": "/path/to/ref.fa"}
     assert actual == expected
 
 
-def test_cnvkit_step_part_get_args_autobin(panel_of_normals_workflow):
+def test_cnvkit_step_part_get_args_ignore(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_args_ignore()"""
+    actual = panel_of_normals_workflow.get_args("cnvkit", "ignore")(None, None)
+    actual["ignore_chroms"].sort()
+    expected = {"ignore_chroms": ["GL*", "MT"]}
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_input_files_access(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_access()"""
+    wildcards = Wildcards(fromdict={"mapper": "bwa"})
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "access")(wildcards)
+    expected = {
+        "reference": "/path/to/ref.fa",
+        "exclude": [],
+        "ignore_chroms": "work/bwa.cnvkit/out/bwa.cnvkit.ignored.bed",
+    }
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_args_access(panel_of_normals_workflow):
     """Tests CnvkitStepPart._get_args_access()"""
+    actual = panel_of_normals_workflow.get_args("cnvkit", "access")(None, None)
+    expected = {"min-gap-size": None}
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_autobin()"""
     wildcards = Wildcards(fromdict={"mapper": "bwa"})
     expected = {
         "bams": [
             "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam",
             "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam",
         ],
         "access": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed",
-        "method": "wgs",
-        "bp-per-bin": 50000,
     }
-    actual = panel_of_normals_workflow.get_args("cnvkit", "autobin")(
-        wildcards,
-        panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards),
-    )
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards)
     assert actual == expected
 
 
-def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow):
-    """Tests CnvkitStepPart._get_args_target()"""
+def test_cnvkit_step_part_get_args_autobin(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_args_autobin()"""
+    expected = {"method": "wgs", "bp-per-bin": 50000}
+    actual = panel_of_normals_workflow.get_args("cnvkit", "autobin")(None, None)
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_target()"""
     wildcards = Wildcards(fromdict={"mapper": "bwa"})
     expected = {
         "interval": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed",
-        "avg-size": 2000,
-        "split": True,
+        "avg_size": "work/bwa.cnvkit/out/bwa.cnvkit.autobin.txt",
         "annotate": "/path/to/annotations.gtf",
-        "short-names": True,
-
-    }
+    }
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards)
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_args_target()"""
+    wildcards = Wildcards(fromdict={"mapper": "bwa"})
+    expected = {"avg-size": 2000, "split": True, "short-names": True}
     actual = 
panel_of_normals_workflow.get_args("cnvkit", "target")( wildcards, - panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards), + panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) ) assert actual == expected -def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_coverage()""" +def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_coverage()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -409,25 +438,35 @@ def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", "reference": "/path/to/ref.fa", - "min-mapq": 0, - "count": False, } - actual = panel_of_normals_workflow.get_args("cnvkit", "coverage")( - wildcards, - panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards), - ) + actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_args_reference(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_create_panel()""" +def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_coverage()""" + expected = {"min-mapq": 0, "count": False} + actual = panel_of_normals_workflow.get_args("cnvkit", "coverage")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_reference(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_create_panel()""" wildcards = Wildcards(fromdict={"mapper": "bwa"}) expected = { "normals": [ - "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.cnn", - "work/bwa.cnvkit.P002-N1-DNA1-WGS1/out/bwa.cnvkit.P002-N1-DNA1-WGS1.target.cnn", + "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit.P002-N1-DNA1-WGS1/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", ], "reference": "/path/to/ref.fa", + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_args_reference(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_create_panel()""" + expected = { "cluster": False, "no-gc": False, "no-edge": True, @@ -435,23 +474,42 @@ def test_cnvkit_step_part_get_args_reference(panel_of_normals_workflow): "male-reference": False, "diploid-parx-genome": "GRCh38", } - actual = panel_of_normals_workflow.get_args("cnvkit", "create_panel")( - wildcards, - panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards), - ) + actual = panel_of_normals_workflow.get_args("cnvkit", "create_panel")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_sex(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_sex()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + expected = { + "coverages": [ + "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit.P002-N1-DNA1-WGS1/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", + ], + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "sex")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_args_sex(panel_of_normals_workflow): + """Tests 
CnvkitStepPart._get_args_sex()""" + expected = {"diploid-parx-genome": "GRCh38"} + actual = panel_of_normals_workflow.get_args("cnvkit", "sex")(None, None) assert actual == expected def test_cnvkit_step_parts_get_output_files(panel_of_normals_workflow): """Tests CnvkitStepPart.get_output_files() for all actions""" actions = { + "ignore": {"ignore_chroms": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.ignored.bed"}, "access": {"access": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.access.bed"}, "autobin": {"result": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.autobin.txt"}, "target": {"target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed"}, "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed"}, - "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}.cnn"}, + "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}coverage.cnn"}, "create_panel": {"reference": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn"}, - "sex": {"sex": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.sex.tsv"}, + "sex": {"sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv"}, + "metrics": {"metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv"}, } for action, result in actions.items(): expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} @@ -462,7 +520,7 @@ def test_cnvkit_step_parts_get_output_files(panel_of_normals_workflow): def test_cnvkit_step_parts_get_log_file(panel_of_normals_workflow): """Tests CnvkitStepPart.get_log_file() for all actions""" exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) - actions = ("autobin", "target", "create_panel", "sex") + actions = ("ignore", "access", "autobin", "target", "create_panel", "sex", "metrics") base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit" for action in actions: result = {k: base_log + f".{action}.{v}" for k, v in exts} @@ -471,16 +529,6 @@ def test_cnvkit_step_parts_get_log_file(panel_of_normals_workflow): assert actual == expected -def test_cnvkit_step_parts_get_log_file_access(panel_of_normals_workflow): - """Tests CnvkitStepPart.get_log_file() for access""" - exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) - base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.access" - result = {k: base_log + f".{v}" for k, v in exts} - expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} - actual = panel_of_normals_workflow.get_log_file("cnvkit", "access") - assert actual == expected - - def test_cnvkit_step_parts_get_log_file_coverage(panel_of_normals_workflow): """Tests CnvkitStepPart.get_log_file() for coverage""" exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) @@ -525,12 +573,20 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): expected += [ tpl.format(mapper=mapper, substep=substep, ext=ext, chksum=chksum) for chksum in ("", ".md5") - for (substep, ext) in (("panel_of_normals", "cnn"), ("sex", "tsv"), ("target", "bed")) + for (substep, ext) in (("panel_of_normals", "cnn"), ("target", "bed")) + for mapper in ("bwa",) + ] + # Add report files + tpl = "output/{mapper}.cnvkit/report/{mapper}.cnvkit.{substep}.{ext}{chksum}" + expected += [ + tpl.format(mapper=mapper, substep=substep, ext=ext, chksum=chksum) + for chksum in ("", ".md5") + for (substep, 
ext) in (("metrics", "tsv"), ("sex", "tsv")) for mapper in ("bwa",) ] # add log files tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" - for substep in ("create_panel", "sex", "target"): + for substep in ("create_panel", "metrics", "sex", "target"): for mapper in ("bwa",): base_out = tpl.format(mapper=mapper, substep=substep) expected += get_expected_log_files_dict(base_out=base_out).values() diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py index e9e7daa42..b245f4f35 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py @@ -137,25 +137,65 @@ def somatic_cnv_calling_workflow( # Tests for CnvkitStepPart ------------------------------------------------------------------------ +def test_cnvkit_step_part_get_input_files_ignore(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_ignore()""" + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "ignore")(None) + expected = {"reference": "/path/to/ref.fa"} + assert actual == expected + + +def test_cnvkit_step_part_get_args_ignore(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_ignore()""" + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "ignore")(None, None) + actual["ignore_chroms"].sort() + expected = {"ignore_chroms": ["GL*", "MT"]} + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_access(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_access()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "access")(wildcards) + expected = { + "reference": "/path/to/ref.fa", + "exclude": [], + "ignore_chroms": "work/bwa.cnvkit/out/bwa.cnvkit.ignored.bed", + } + assert actual == expected + + def test_cnvkit_step_part_get_args_access(somatic_cnv_calling_workflow): """Tests CnvkitStepPart._get_args_access()""" + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "access")(None, None) + expected = {"min-gap-size": None} + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_autobin(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_access()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", } ) - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "access")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "access")(wildcards), - ) - if actual.get("ignore_chroms", None) is not None: - actual["ignore_chroms"].sort() - expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": [], "ignore_chroms": ["GL*", "MT"]} + expected = { + "bams": ["NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam"], + "access": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "autobin")(wildcards) assert actual == expected def test_cnvkit_step_part_get_args_autobin(somatic_cnv_calling_workflow): """Tests CnvkitStepPart._get_args_access()""" + expected = {"method": "wgs", "bp-per-bin": 50000} + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "autobin")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_target(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_target()""" wildcards = Wildcards( fromdict={ 
"mapper": "bwa", @@ -163,15 +203,11 @@ def test_cnvkit_step_part_get_args_autobin(somatic_cnv_calling_workflow): } ) expected = { - "bams": ["NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam"], - "access": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", - "method": "wgs", - "bp-per-bin": 50000, + "interval": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", + "avg_size": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.autobin.txt", + "annotate": "/path/to/annotations.gtf", } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "autobin")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "autobin")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "target")(wildcards) assert actual == expected @@ -183,47 +219,42 @@ def test_cnvkit_step_part_get_args_target(somatic_cnv_calling_workflow): "library_name": "P001-N1-DNA1-WGS1", } ) - expected = { - "interval": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", - "avg-size": 2000, - "split": True, - "annotate": "/path/to/annotations.gtf", - "short-names": True, - - } + expected = {"avg-size": 2000, "split": True, "short-names": True} actual = somatic_cnv_calling_workflow.get_args("cnvkit", "target")( wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "target")(wildcards), + somatic_cnv_calling_workflow.get_input_files("cnvkit", "target")(wildcards) ) assert actual == expected -def test_cnvkit_step_part_get_args_coverage(somatic_cnv_calling_workflow): - """Tests CnvkitStepPart._get_args_coverage()""" +def test_cnvkit_step_part_get_input_files_coverage(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_coverage()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", - "library_name": "P001-N1-DNA1-WGS1", + "library_name": "P001-T1-DNA1-WGS1", "region": "target", } ) expected = { "intervals": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.bed", - "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", "reference": "/path/to/ref.fa", - "min-mapq": 0, - "count": False, } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "coverage")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "coverage")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "coverage")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_args_reference(somatic_cnv_calling_workflow): - """Tests CnvkitStepPart._get_args_reference()""" +def test_cnvkit_step_part_get_args_coverage(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_coverage()""" + expected = {"min-mapq": 0, "count": False} + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "coverage")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_reference(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_create_panel()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -232,8 +263,24 @@ def test_cnvkit_step_part_get_args_reference(somatic_cnv_calling_workflow): } ) expected = { - "normals": ["work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.cnn"], + "normals": [ + 
"work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + ], "reference": "/path/to/ref.fa", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "reference")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_args_reference(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_create_panel()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { "cluster": False, "no-gc": False, "no-edge": True, @@ -241,10 +288,23 @@ def test_cnvkit_step_part_get_args_reference(somatic_cnv_calling_workflow): "male-reference": False, "diploid-parx-genome": "GRCh38", } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "reference")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "reference")(wildcards), + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "reference")(wildcards, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_fix(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_fix()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } ) + expected = { + "target": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.targetcoverage.cnn", + "reference": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.reference.cnn", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "fix")(wildcards) assert actual == expected @@ -257,8 +317,6 @@ def test_cnvkit_step_part_get_args_fix(somatic_cnv_calling_workflow): } ) expected = { - "target": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.target.cnn", - "reference": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.reference.cnn", "cluster": False, "no-gc": False, "no-edge": True, @@ -266,10 +324,23 @@ def test_cnvkit_step_part_get_args_fix(somatic_cnv_calling_workflow): "diploid-parx-genome": "GRCh38", "sample-id": "P001-T1-DNA1-WGS1", } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "fix")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "fix")(wildcards), + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "fix")(wildcards, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_segment(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_segment()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } ) + expected = { + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "segment")(wildcards) assert actual == expected @@ -282,21 +353,33 @@ def test_cnvkit_step_part_get_args_segment(somatic_cnv_calling_workflow): } ) expected = { - "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", "method": "cbs", "threshold": 0.0001, "smooth-cbs": False, "drop-low-coverage": False, "drop-outliers": 10, - "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", "sample-id": "P001-T1-DNA1-WGS1", "normal-id": "P001-N1-DNA1-WGS1", "min-variant-depth": 20, } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segment")( - wildcards, - 
somatic_cnv_calling_workflow.get_input_files("cnvkit", "segment")(wildcards), + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segment")(wildcards, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_call(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_call()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/report/bwa.cnvkit.P001-T1-DNA1-WGS1.segmetrics.cns", + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + "purity_file": "SOMATIC_PURITY_PLOIDY_ESTIMATE/output/bwa.ascat.P001-T1-DNA1-WGS1/out/bwa.ascat.P001-T1-DNA1-WGS1.txt", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "call")(wildcards) assert actual == expected @@ -309,19 +392,17 @@ def test_cnvkit_step_part_get_args_call(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", "method": "threshold", "thresholds": [-1.1, -0.25, 0.2, 0.7], "drop-low-coverage": False, "male-reference": False, "diploid-parx-genome": "GRCh38", - "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", "sample-id": "P001-T1-DNA1-WGS1", "normal-id": "P001-N1-DNA1-WGS1", "min-variant-depth": 20, - "purity_file": "SOMATIC_PURITY_PLOIDY_ESTIMATE/output/bwa.ascat.P001-T1-DNA1-WGS1/out/bwa.ascat.P001-T1-DNA1-WGS1.txt", "purity": 0.35, "ploidy": 2.2, + "filter": ["ci"], } actual = somatic_cnv_calling_workflow.get_args("cnvkit", "call")( wildcards, @@ -330,8 +411,8 @@ def test_cnvkit_step_part_get_args_call(somatic_cnv_calling_workflow): assert actual == expected -def test_cnvkit_step_part_get_args_bintest(somatic_cnv_calling_workflow): - """Tests CnvkitStepPart._get_args_bintest()""" +def test_cnvkit_step_part_get_input_files_bintest(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_bintest()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -339,21 +420,48 @@ def test_cnvkit_step_part_get_args_bintest(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns", "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "bintest")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_args_bintest(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_bintest()""" + expected = { "alpha": 0.005, "target": False, } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "bintest")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "bintest")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "bintest")(None, None) assert actual == expected + + +def test_cnvkit_step_part_get_input_files_metrics(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_metrics()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "segments": ["work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns"], + "ratios": ["work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"], + } + 
actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "metrics")(wildcards) assert actual == expected def test_cnvkit_step_part_get_args_metrics(somatic_cnv_calling_workflow): """Tests CnvkitStepPart._get_args_metrics()""" + expected = {"drop-low-coverage": False} + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "metrics")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_segmetrics(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_segmetrics()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -361,19 +469,28 @@ def test_cnvkit_step_part_get_args_metrics(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns", "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", - "drop-low-coverage": False, } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "metrics")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "metrics")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "segmetrics")(wildcards) assert actual == expected def test_cnvkit_step_part_get_args_segmetrics(somatic_cnv_calling_workflow): """Tests CnvkitStepPart._get_args_segmetrics()""" + expected = { + "drop-low-coverage": False, + "alpha": 0.05, + "bootstrap": 100, + "smooth-bootstrap": True, + "stats": ["mean", "median", "mode", "t-test", "stdev", "sem", "mad", "mse", "iqr", "bivar", "ci", "pi"] + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segmetrics")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_genemetric(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_genemetrics()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -381,18 +498,10 @@ def test_cnvkit_step_part_get_args_segmetrics(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns", "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", - "drop-low-coverage": False, - "alpha": 0.05, - "bootstrap": 100, - "smooth-bootstrap": False, - "stats": ["mean", "median", "mode", "t-test", "stdev", "sem", "mad", "mse", "iqr", "bivar", "ci", "pi"] } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segmetrics")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "segmetrics")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "genemetrics")(wildcards) assert actual == expected @@ -405,8 +514,6 @@ def test_cnvkit_step_part_get_args_genemetric(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", - "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", "threshold": 0.2, "min-probes": 3, "drop-low-coverage": False, @@ -416,10 +523,25 @@ def test_cnvkit_step_part_get_args_genemetric(somatic_cnv_calling_workflow): "bootstrap": 100, "stats": ["mean", "median", "mode", "ttest", "stdev", "sem", "mad", "mse", "iqr", "bivar", "ci", "pi"] } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "genemetrics")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", 
"genemetrics")(wildcards), + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "genemetrics")(wildcards, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_scatter(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_scatter()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + "contig_name": "1", + } ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "scatter")(wildcards) assert actual == expected @@ -433,8 +555,6 @@ def test_cnvkit_step_part_get_args_scatter(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", - "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", "chromosome": "1", "width": 1000000, "antitarget-marker": "o", @@ -442,38 +562,35 @@ def test_cnvkit_step_part_get_args_scatter(somatic_cnv_calling_workflow): "trend": False, "segment-color": "darkorange", "title": "P001-T1-DNA1-WGS1 - 1", - "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", "sample-id": "P001-T1-DNA1-WGS1", "normal-id": "P001-N1-DNA1-WGS1", "min-variant-depth": 20, - "fig-size": (6.4, 4.8), + "fig-size": (12.256, 16.192), } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "scatter")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "scatter")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "scatter")(wildcards, None) assert actual == expected def test_cnvkit_step_parts_get_output_files(somatic_cnv_calling_workflow): """Tests CnvkitStepPart.get_output_files() for all actions""" actions = { + "ignore": {"ignore_chroms": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.ignored.bed"}, "access": {"access": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.access.bed"}, "autobin": {"result": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.autobin.txt"}, "target": {"target": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.target.bed"}, "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed"}, - "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}.cnn"}, + "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}coverage.cnn"}, "reference": {"reference": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.reference.cnn"}, "fix": {"ratios": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr"}, "segment": { - "segments": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segments.cns", + "segments": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns", "dataframe": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.rds", }, - "call": {"calls": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.calls.cns"}, + "call": {"calls": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.call.cns"}, "bintest": 
{"tests": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.bintest.cns"}, "metrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.metrics.tsv"}, "genemetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.genemetrics.tsv"}, - "segmetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.segmetrics.tsv"}, + "segmetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.segmetrics.cns"}, "scatter": {"plot": "work/{mapper}.cnvkit.{library_name}/plot/{mapper}.cnvkit.{library_name}.scatter.{contig_name}.jpeg"}, } for action, result in actions.items(): @@ -494,14 +611,16 @@ def test_cnvkit_step_parts_get_log_file(somatic_cnv_calling_workflow): assert actual == expected -def test_cnvkit_step_parts_get_log_file_access(somatic_cnv_calling_workflow): - """Tests CnvkitStepPart.get_log_file() for access""" +def test_cnvkit_step_parts_get_log_file_no_lib(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_log_file() for all actions not dependent on library""" exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) - base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.access" - result = {k: base_log + f".{v}" for k, v in exts} - expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} - actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", "access") - assert actual == expected + actions = ("ignore", "access") + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit." + for action in actions: + result = {k: base_log + f"{action}.{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", action) + assert actual == expected def test_cnvkit_step_parts_get_log_file_coverage(somatic_cnv_calling_workflow): @@ -542,7 +661,7 @@ def test_somatic_cnv_calling_workflow(somatic_cnv_calling_workflow): tpl = "output/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" expected += [ tpl.format(mapper=mapper, library_name=library_name, ext=ext) - for ext in ("cnr", "segments.cns", "calls.cns", "bintest.cns") + for ext in ("cnr", "cns", "call.cns", "bintest.cns") for library_name in tumor_libraries for mapper in ("bwa",) ] @@ -561,8 +680,7 @@ def test_somatic_cnv_calling_workflow(somatic_cnv_calling_workflow): tpl = "output/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.{step}.{ext}" expected += [ tpl.format(mapper=mapper, library_name=library_name, step=step, ext=ext) - for ext in ("tsv",) - for step in ("metrics", "genemetrics", "segmetrics") + for step, ext in (("metrics", "tsv"), ("genemetrics", "tsv"), ("segmetrics", "cns")) for library_name in tumor_libraries for mapper in ("bwa",) ] From 650bf1d30cdb580f18a2017525d8dc7674716efc Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Fri, 6 Dec 2024 12:34:50 +0100 Subject: [PATCH 20/46] docs: Added somatic cnv calling step --- docs/index.rst | 4 +-- docs/somatic_cnv.rst | 49 ++++++++++++++++++++++++++----- docs/step/somatic_cnv_calling.rst | 7 +++++ 3 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 docs/step/somatic_cnv_calling.rst diff --git a/docs/index.rst b/docs/index.rst index 19f7cc3cd..c4493e7a0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -68,13 +68,13 @@ Project Info step/igv_session_generation step/ngs_data_qc step/ngs_mapping + 
step/panel_of_normals
    step/somatic_gene_fusion_calling
    step/somatic_purity_ploidy_estimate
-   step/somatic_targeted_seq_cnv_calling
+   step/somatic_cnv_calling
    step/somatic_variant_annotation
    step/somatic_variant_calling
    step/somatic_variant_filtration
-   step/somatic_wgs_cnv_calling
    step/somatic_wgs_sv_calling
    step/sv_calling_targeted
    step/targeted_seq_mei_calling
diff --git a/docs/somatic_cnv.rst b/docs/somatic_cnv.rst
index bc2e1e005..68e817f04 100644
--- a/docs/somatic_cnv.rst
+++ b/docs/somatic_cnv.rst
@@ -4,17 +4,47 @@
 Somatic CNV calling
 -------------------
 
-Somatic variant calling is implemented differently for exome and whole genome data.
-
-The whole genome data "branch" is currently under review, as GRCh38 support in ``Control-FREEC`` (the main workhorse for WGS CNV calling) is not complete.
-CNV calling in WGS data can also be done using ``cnvkit``, but its pipeline implementation is also incomplete.
-
-The following documentation is restricted to the tools currently implemented to process exome data: ``cnvkit``, ``purecn`` & ``sequenza``.
+Somatic CNV calling was implemented differently for exome and whole genome data.
+We are aiming to merge the two data types, but this isn't complete yet, and the new ``somatic_cnv_calling`` step coexists
+with the former ``somatic_wgs_cnv_calling`` & ``somatic_targeted_seq_cnv_calling`` steps.
+
+Among the tools for CNV calling, some are restricted to WES (``PureCN`` & ``Sequenza``), others to WGS (``Control-FREEC``) and
+others support both (``CNVkit``).
+
+We started by implementing the ``cnvkit`` tool in the new step; support for the other tools will follow.
+A complete implementation is difficult, for the following reasons:
+
+- Many of these tools (in particular ``CNVkit`` & ``PureCN``) are modular, which gives them flexibility, but
+  at the expense of additional complexity in the pipeline.
+- Some tools require a *reference*, or *panel of normals*, which can be either obtained from public data, or created from the cohort itself.
+  In the latter case, the pipeline must trigger panel of normals creation when needed.
+- Germline & somatic variants can be used by some tools to improve segmentation & calling.
+  Again, the pipeline must trigger creation of those.
+- When available, purity & ploidy values improve CNV calling. While some tools can estimate these values internally,
+  others require running an additional step.
+
+So in the future we might find ourselves in a position where, to generate CNV calls with ``CNVkit``, the chain of data
+requests goes as follows (a toy sketch of the resolution follows the list):
+
+1. ``cnvkit`` requires a panel of normals, purity/ploidy from tool ``purecn`` in ``somatic_purity_ploidy_estimate`` step, &
+   germline/somatic variants from tool ``mutect2`` in ``somatic_variant_calling`` step.
+2. The ``panel_of_normals`` step is triggered for tool ``cnvkit``, and in parallel, the ``somatic_variant_calling`` step
+   is triggered for tool ``mutect2``.
+3. For ``purecn``, the ``somatic_purity_ploidy_estimate`` step is simply a copy of the results obtained in
+   the ``somatic_cnv_calling`` step, as ``PureCN`` estimates purity & ploidy during CNV calling.
+   So the ``somatic_cnv_calling`` step must be run for ``purecn``.
+4. But ``purecn`` itself requires a panel of normals, & germline/somatic variants produced in the ``somatic_variant_calling`` step,
+   by the ``mutect2`` tool.
+5. The ``panel_of_normals`` step is triggered for tool ``purecn``, while the ``somatic_variant_calling`` step, already triggered
+   at ``cnvkit``'s request, may or may not need to be computed again.
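+
+A minimal sketch of this dependency-first resolution is given below. It is purely illustrative:
+the ``DEPENDENCIES`` table and the ``resolve`` helper are hypothetical and not part of the pipeline
+API; only the step/tool names are taken from the list above::
+
+    # Hypothetical illustration of the request chain described above.
+    DEPENDENCIES = {
+        "somatic_cnv_calling/cnvkit": [
+            "panel_of_normals/cnvkit",
+            "somatic_variant_calling/mutect2",
+            "somatic_purity_ploidy_estimate/purecn",
+        ],
+        "somatic_purity_ploidy_estimate/purecn": ["somatic_cnv_calling/purecn"],
+        "somatic_cnv_calling/purecn": [
+            "panel_of_normals/purecn",
+            "somatic_variant_calling/mutect2",
+        ],
+    }
+
+    def resolve(target, seen=None):
+        """Return the steps required by ``target``, dependencies first."""
+        seen = set() if seen is None else seen
+        order = []
+        for dep in DEPENDENCIES.get(target, []):
+            if dep not in seen:
+                seen.add(dep)
+                order.extend(resolve(dep, seen))
+        order.append(target)
+        return order
+
+    # resolve("somatic_cnv_calling/cnvkit") lists both panels of normals, the
+    # mutect2 variants and the purecn purity/ploidy estimate before cnvkit itself.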
+
+So the final dependency graph is quite complex, and assumes that the ``mutect2`` variant generation parameters
+necessary for ``CNVkit`` are compatible with those required by ``PureCN``.
 
 Output & performance
 ====================
 
-The 3 methods generally broadly agree on the log ratio of coverage between tumor & normal samples.
+For WES data, the 3 methods currently implemented (``CNVkit``, ``PureCN`` & ``Sequenza``) broadly agree
+on the log ratio of coverage between tumor & normal samples.
 However, the segmentation and the number of copies assigned to a segment can be quite different between the algorithms.
 
@@ -26,6 +56,10 @@ In absence of a better solution, all CNV tools implemented in somatic pipeline o
 The copy number call may or may not be present, and the chromosome number is replaced by its name.
 The segmentation output is in file ``output/../out/.._dnacopy.seg``.
 
+The new step doesn't follow this convention, but keeps the output files generated by ``cnvkit`` unchanged.
+When a final decision is made regarding the best format(s) to describe CNVs, we will implement an ``export`` sub-step
+that converts private formats to public one(s).
+
 Genome support
 --------------
@@ -93,4 +127,3 @@ From the ``panel_of_normals`` directory, ``purecn`` requires 3 types of files:
 - the ``panel_of_normals`` itself, and the ``mapping_bias`` objects are taken from ``.purecn/out``. This is because they might change with different mapping tools.
 - the ``intervals`` taken from ``purecn/out``, as the definition of intervals depend only on the genome & the exome kit, but not on the mapping tool.
 - the ``container`` taken from ``work/containers/out``, to ensure that the ``PureCN`` version used to compute copy number variants is identical to that used to compute the panel of normals.
-
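+
+As an illustration, these three inputs could be collected as follows. This is a sketch only:
+the helper, the ``panel_dir`` layout and the exact file names are hypothetical placeholders,
+while the three directories are those listed above::
+
+    import os
+
+    def purecn_panel_inputs(panel_dir: str, mapper: str) -> dict[str, str]:
+        """Collect the three PureCN panel-of-normals inputs described above."""
+        return {
+            # panel & mapping bias depend on the mapper, hence <mapper>.purecn/out
+            "panel_of_normals": os.path.join(panel_dir, f"output/{mapper}.purecn/out/{mapper}.purecn.rds"),
+            "mapping_bias": os.path.join(panel_dir, f"output/{mapper}.purecn/out/{mapper}.purecn.mapping_bias.rds"),
+            # the intervals depend only on the genome & exome kit, not on the mapper
+            "intervals": os.path.join(panel_dir, "output/purecn/out/intervals.txt"),
+            # the container pins the PureCN version used to build the panel
+            "container": os.path.join(panel_dir, "work/containers/out/purecn.simg"),
+        }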
diff --git a/docs/step/somatic_cnv_calling.rst b/docs/step/somatic_cnv_calling.rst
new file mode 100644
index 000000000..71e502a35
--- /dev/null
+++ b/docs/step/somatic_cnv_calling.rst
@@ -0,0 +1,7 @@
+.. _step_somatic_cnv_calling:
+
+============================
+Somatic Copy Number Variants
+============================
+
+.. automodule:: snappy_pipeline.workflows.somatic_cnv_calling
From 3172ecada0e4ec73680a8196387a52f4b6df5fb4 Mon Sep 17 00:00:00 2001
From: Eric Blanc
Date: Fri, 6 Dec 2024 12:37:05 +0100
Subject: [PATCH 21/46] fix: Use cnvkit batch default for WGS

---
 snappy_pipeline/models/cnvkit.py                          | 5 +++--
 snappy_pipeline/workflows/somatic_cnv_calling/__init__.py | 5 ++++-
 snappy_wrappers/wrappers/cnvkit/segment/wrapper.py        | 5 +++--
 .../workflows/test_workflows_somatic_cnv_calling.py       | 4 +---
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py
index 2063d52f7..6831e5ad4 100644
--- a/snappy_pipeline/models/cnvkit.py
+++ b/snappy_pipeline/models/cnvkit.py
@@ -152,11 +152,12 @@ class Fix(SnappyModel):
 class Segment(SnappyModel):
     method: SegmentationMethod = SegmentationMethod.CBS
     """Segmentation method, or 'none' for chromosome arm-level averages as segments"""
-    threshold: float
+    threshold: float | None = None
     """
     Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation.
    For HMM methods, this is the smoothing window size.
+    When missing, it is automatically set to 1e-6 in WGS mode.
     """
     drop_outliers: int = 10
     """Drop outlier bins more than this many multiples of the 95th quantile away from the average within a rolling window.
    Set to 0 for no outlier filtering."""
@@ -355,7 +356,7 @@ class CnvkitToReference(SnappyModel):
 
 class Cnvkit(CnvkitToReference):
     fix: Fix = Fix()
-    segment: Segment
+    segment: Segment = Segment()
     call: Call = Call()
     bintest: Bintest = Bintest()
diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
index 8120483ca..b48b224c3 100644
--- a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
+++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
@@ -878,10 +878,13 @@ def _get_args_segment(self, wildcards: Wildcards, input: InputFiles) -> dict[str
         # Segmentation parameters
         args = {
             "method": self.cfg.segment.method,
-            "threshold": self.cfg.segment.threshold,
             "drop-outliers": self.cfg.segment.drop_outliers,
             "drop-low-coverage": self.cfg.drop_low_coverage,
         }
+        if self.cfg.segment.threshold is not None:
+            args["threshold"] = self.cfg.segment.threshold
+        elif self.is_wgs:
+            args["threshold"] = 1e-6
         if self.cfg.segment.method == CnvkitSegmentationMethod.CBS:
             args["smooth-cbs"] = self.cfg.segment.smooth_cbs
         if self.cfg.somatic_variant_calling.enabled:
diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
index 614e4b458..784d66eb5 100644
--- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
@@ -22,7 +22,7 @@
     """.format(
         snakemake=snakemake,
         args=args,
-        zygocity_freq=f"--zygocity_freq {args['zygocity-freq']}" if "zygocity-freq" in args else ""
+        zygocity_freq=f"--zygosity-freq {args['zygocity-freq']}" if args.get("zygocity-freq", None) is not None else ""
     )
 else:
     variants = ""
@@ -30,7 +30,7 @@
 cmd = r"""
 cnvkit.py segment --processes {snakemake.resources._cores} \
     -o {snakemake.output.segments} --dataframe {snakemake.output.dataframe} \
-    --method {args[method]} --threshold {args[threshold]} {smooth_cbs} \
+    --method {args[method]} {threshold} {smooth_cbs} \
     {drop_low_coverage} --drop-outliers {args[drop-outliers]} \
     {variants} \
     {snakemake.input.ratios}
@@ -38,6 +38,7 @@
     snakemake=snakemake,
     args=args,
     variants=variants,
+    threshold=f"--threshold {args['threshold']}" if args.get("threshold", None) is not None else "",
     smooth_cbs="--smooth-cbs" if args.get("smooth-cbs", False) else "",
     drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "",
 )
diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py
index b245f4f35..31382ee88 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py
@@ -68,8 +68,6 @@ def minimal_config():
             enabled: True
             source: cohort
             tool: ascat
-          segment:
-            threshold: 0.0001
           scatter:
             enabled: true
 
@@ -354,7 +352,7 @@ def test_cnvkit_step_part_get_args_segment(somatic_cnv_calling_workflow):
     )
     expected = {
         "method": "cbs",
-        "threshold": 0.0001,
+        "threshold": 1e-6,
         "smooth-cbs": False,
         "drop-low-coverage": False,
         "drop-outliers": 10,
From 2648792108a7cef96672a944b4c6288506223f44 Mon Sep 17 00:00:00 2001
From: Eric Blanc
Date: Fri, 6 Dec 2024 12:38:31 +0100
Subject: [PATCH 22/46] refactor: Remove cnvkit from somatic_targeted_seq_cnv_calling step

---
 .../__init__.py                               | 260 ----------
 .../somatic_targeted_seq_cnv_calling/model.py |   5 +-
 ...kflows_somatic_targeted_seq_cnv_calling.py | 465 ------------------
 3 files changed, 1 insertion(+), 729
deletions(-) diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py index 6440fe5f2..bec3d9bd3 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py @@ -57,7 +57,6 @@ Available Somatic Targeted CNV Caller ===================================== -- ``cnvkit`` - ``sequenza`` - ``purecn``. Note that ``purecn`` requires a panel of normals and a second set of variants called by ``mutect2``, that includes germline ones. - ``copywriter`` (deprecated, the `R` package was removed with Bioconductor release 3.18) @@ -556,263 +555,6 @@ def get_log_file(self, action): return self._get_log_file_from_prefix(prefix) -class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): - """Perform somatic targeted CNV calling using cnvkit""" - - #: Step name - name = "cnvkit" - - #: Class available actions - actions = ( - "coverage", - "fix", - "segment", - "call", - "postprocess", - "export", - "plot", - "report", - ) - - # Overwrite defaults - default_resource_usage = ResourceUsage(threads=1, time="03:59:59", memory="7680M") # 4h - - #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). - resource_usage = { - "plot": ResourceUsage( - threads=1, - time="08:00:00", # 1 day - memory=f"{30 * 1024}M", - ), - "coverage": ResourceUsage( - threads=8, - time="08:00:00", # 8 hours - memory=f"{16 * 1024}M", - ), - } - - def __init__(self, parent): - super().__init__(parent) - - def get_input_files(self, action): - """Return input paths input function, dependent on rule""" - # Validate action - self._validate_action(action) - method_mapping = { - "coverage": self._get_input_files_coverage, - "call": self._get_input_files_call, - "fix": self._get_input_files_fix, - "segment": self._get_input_files_segment, - "postprocess": self._get_input_files_postprocess, - "export": self._get_input_files_export, - "plot": self._get_input_files_plot, - "report": self._get_input_files_report, - } - return method_mapping[action] - - def _get_input_files_coverage(self, wildcards): - # BAM/BAI file - ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) - input_files = { - "bam": ngs_mapping(base_path + ".bam"), - "bai": ngs_mapping(base_path + ".bam.bai"), - } - return input_files - - @staticmethod - def _get_input_files_fix(wildcards): - tpl_base = "{mapper}.cnvkit.{library_name}" - tpl = "work/" + tpl_base + "/out/" + tpl_base + ".{target}coverage.cnn" - input_files = { - "target": tpl.format(target="target", **wildcards), - "antitarget": tpl.format(target="antitarget", **wildcards), - } - return input_files - - @staticmethod - def _get_input_files_segment(wildcards): - cnr_pattern = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" - input_files = {"cnr": cnr_pattern.format(**wildcards)} - return input_files - - @staticmethod - def _get_input_files_call(wildcards): - segment_pattern = ( - "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" - ) - input_files = {"segment": segment_pattern.format(**wildcards)} - return input_files - - @staticmethod - def _get_input_files_postprocess(wildcards): - segment_pattern = ( - "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" - ) - call_pattern = ( - 
"work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.call.cns" - ) - input_files = { - "segment": segment_pattern.format(**wildcards), - "call": call_pattern.format(**wildcards), - } - return input_files - - @staticmethod - def _get_input_files_export(wildcards): - cns_pattern = ( - "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.call.cns" - ) - input_files = {"cns": cns_pattern.format(**wildcards)} - return input_files - - @staticmethod - def _get_input_files_plot(wildcards): - tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" - input_files = { - "cnr": tpl.format(ext="cnr", **wildcards), - "cns": tpl.format(ext="call.cns", **wildcards), - } - return input_files - - def _get_input_files_report(self, wildcards): - tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" - input_files = { - "target": tpl.format(ext="targetcoverage.cnn", **wildcards), - "antitarget": tpl.format(ext="antitargetcoverage.cnn", **wildcards), - "cnr": tpl.format(ext="cnr", **wildcards), - "cns": tpl.format(ext="call.cns", **wildcards), - } - return input_files - - def get_output_files(self, action): - """Return output files for the given action""" - if action == "coverage": - return self._get_output_files_coverage() - elif action == "fix": - return self._get_output_files_fix() - elif action == "segment": - return self._get_output_files_segment() - elif action == "call": - return self._get_output_files_call() - elif action == "postprocess": - return self._get_output_files_postprocess() - elif action == "export": - return self._get_output_files_export() - elif action == "plot": - return self._get_output_files_plot() - elif action == "report": - return self._get_output_files_report() - else: - self._validate_action(action) - - @staticmethod - def _get_output_files_coverage(): - name_pattern = "{mapper}.cnvkit.{library_name}" - output_files = {} - for target in ("target", "antitarget"): - output_files[target] = os.path.join( - "work", name_pattern, "out", name_pattern + ".{}coverage.cnn".format(target) - ) - output_files[target + "_md5"] = output_files[target] + ".md5" - return output_files - - @staticmethod - def _get_output_files_fix(): - name_pattern = "{mapper}.cnvkit.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cnr") - return {"ratios": tpl, "ratios_md5": tpl + ".md5"} - - @staticmethod - def _get_output_files_segment(): - name_pattern = "{mapper}.cnvkit.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".segment.cns") - return {"segments": tpl, "segments_md5": tpl + ".md5"} - - @staticmethod - def _get_output_files_call(): - name_pattern = "{mapper}.cnvkit.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".call.cns") - return {"calls": tpl, "calls_md5": tpl + ".md5"} - - @staticmethod - def _get_output_files_postprocess(): - name_pattern = "{mapper}.cnvkit.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + "_dnacopy.seg") - return { - "final": tpl, - "final_md5": tpl + ".md5", - } - - @dictify - def _get_output_files_plot(self): - plots = (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) - chrom_plots = (("heatmap", "pdf"), ("scatter", "png")) - chroms = list(chain(range(1, 23), ["X", "Y"])) - output_files = {} - # Yield file name pairs for global plots - tpl = ( - "work/{{mapper}}.cnvkit.{{library_name}}/report/" - "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}" - 
) - for plot, ext in plots: - output_files[plot] = tpl.format(plot=plot, ext=ext) - output_files[plot + "_md5"] = output_files[plot] + ".md5" - # Yield file name pairs for the chromosome-wise plots - tpl_chrom = ( - "work/{{mapper}}.cnvkit.{{library_name}}/report/" - "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}" - ) - for plot, ext in chrom_plots: - for chrom in chroms: - key = "{plot}_chr{chrom}".format(plot=plot, chrom=chrom) - output_files[key] = tpl_chrom.format(plot=plot, ext=ext, chrom=chrom) - output_files[key + "_md5"] = output_files[key] + ".md5" - return output_files - - @staticmethod - def _get_output_files_export(): - exports = ( - ("bed", "bed.gz"), - ("bed_tbi", "bed.gz.tbi"), - ("seg", "seg"), - ("vcf", "vcf.gz"), - ("vcf_tbi", "vcf.gz.tbi"), - ) - output_files = {} - tpl = ( - "work/{{mapper}}.cnvkit.{{library_name}}/out/" - "{{mapper}}.cnvkit.{{library_name}}.{ext}" - ) - for export, ext in exports: - output_files[export] = tpl.format(export=export, ext=ext) - output_files[export + "_md5"] = output_files[export] + ".md5" - return output_files - - @dictify - def _get_output_files_report(self): - reports = ("breaks", "genemetrics", "segmetrics", "sex", "metrics") - output_files = {} - tpl = ( - "work/{{mapper}}.cnvkit.{{library_name}}/report/" - "{{mapper}}.cnvkit.{{library_name}}.{report}.txt" - ) - for report in reports: - output_files[report] = tpl.format(report=report) - output_files[report + "_md5"] = output_files[report] + ".md5" - return output_files - - def get_log_file(self, action): - """Return path to log file for the given action""" - # Validate action - self._validate_action(action) - prefix = ( - "work/{{mapper}}.cnvkit.{{library_name}}/log/" - "{{mapper}}.cnvkit.{action}.{{library_name}}" - ).format(action=action) - return self._get_log_file_from_prefix(prefix) - - class CopywriterStepPart(SomaticTargetedSeqCnvCallingStepPart): """Perform somatic targeted CNV calling using CopywriteR""" @@ -972,7 +714,6 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ( CnvettiOffTargetStepPart, CnvettiOnTargetStepPart, - CnvKitStepPart, CopywriterStepPart, SequenzaStepPart, PureCNStepPart, @@ -992,7 +733,6 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) def get_result_files(self): """Return list of result files for the somatic targeted sequencing CNV calling step""" tool_actions = { - "cnvkit": ["fix", "postprocess", "report", "export"], "sequenza": ("coverage", "run"), "purecn": ("run",), "copywriter": ("call",), diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py index 1fbacf9d7..d9633ef9b 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py @@ -4,11 +4,9 @@ from pydantic import Field from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators -from snappy_pipeline.models.cnvkit import Cnvkit class Tool(enum.StrEnum): - cnvkit = "cnvkit" sequenza = "sequenza" copywriter = "copywriter" cnvetti_on_target = "cnvetti_on_target" @@ -230,10 +228,9 @@ class CnvettiOffTarget(SnappyModel): class SomaticTargetedSeqCnvCalling(SnappyStepModel, validators.ToolsMixin): - tools: Annotated[list[Tool], EnumField(Tool, [Tool.cnvkit], min_length=1)] + tools: Annotated[list[Tool], EnumField(Tool, [Tool.purecn], min_length=1)] path_ngs_mapping: str = "../ngs_mapping" - cnvkit: Cnvkit | 
None = None sequenza: Sequenza | None = None copywriter: CopyWriter | None = None purecn: PureCn | None = None diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py index 815acdf73..f6d5b8bf7 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py @@ -41,14 +41,9 @@ def minimal_config(): somatic_targeted_seq_cnv_calling: tools: - cnvetti_on_target - # - cnvkit - copywriter - sequenza - purecn - # cnvkit: - # path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed - # path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed - # path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn purecn: path_container: /path/to/purecn/container path_intervals: /path/to/interval/list @@ -290,418 +285,6 @@ def test_cnvetti_on_target_step_part_get_resource_usage(somatic_targeted_seq_cnv assert actual == expected, msg_error -# Tests for CnvKitStepPart (coverage) ------------------------------------------------------------- - - -def test_cnvkit_coverage_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards( - fromdict={"mapper": "bwa", "target": "target", "library_name": "P001-T1-DNA1-WGS1"} - ) - expected = { - "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", - "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", - } - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "coverage")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_coverage_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" - expected = { - "target": base_name_out + ".targetcoverage.cnn", - "target_md5": base_name_out + ".targetcoverage.cnn.md5", - "antitarget": base_name_out + ".antitargetcoverage.cnn", - "antitarget_md5": base_name_out + ".antitargetcoverage.cnn.md5", - } - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "coverage") - - assert actual == expected - - -def test_cnvkit_coverage_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - base_file_name = ( - "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.coverage.{library_name}" - ) - expected = get_expected_log_files_dict(base_out=base_file_name) - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "coverage") - assert actual == expected - - -def test_cnvkit_coverage_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'coverage'""" - # Define expected - expected_dict = {"threads": 8, "time": "08:00:00", "memory": "16384M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
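
For orientation, the coverage action exercised by the removed tests above reduces to two plain ``cnvkit.py coverage`` invocations, one per bin type. A minimal sketch using the fixture paths from these tests; the two bed file names are placeholders, not paths produced by the workflow:

import subprocess

bam = "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam"
out = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1"
for bed, suffix in (("target.bed", "targetcoverage.cnn"), ("antitarget.bed", "antitargetcoverage.cnn")):
    # cnvkit.py coverage <bam> <bins.bed> -o <coverage.cnn>
    subprocess.run(["cnvkit.py", "coverage", bam, bed, "-o", f"{out}.{suffix}"], check=True)
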
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "coverage", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (fix) ------------------------------------------------------------------ - - -def test_cnvkit_fix_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - coverage_base_out = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1" - expected = { - "antitarget": coverage_base_out + ".antitargetcoverage.cnn", - "target": coverage_base_out + ".targetcoverage.cnn", - } - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "fix")(wildcards) - assert actual == expected - - -def test_cnvkit_fix_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" - expected = {"ratios": base_name_out, "ratios_md5": base_name_out + ".md5"} - assert somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "fix") == expected - - -def test_cnvkit_fix_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.fix.{library_name}" - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "fix") - assert actual == expected - - -def test_cnvkit_fix_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
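
The ``get_expected_log_files_dict`` helper used throughout these tests comes from the shared test utilities and is not shown in this series. A plausible sketch of what it returns, mirroring the conda_list/conda_info/log extension map that ``PanelOfNormalsStepPart._get_log_file`` uses later in this series:

def get_expected_log_files_dict(base_out: str) -> dict[str, str]:
    """Expected log files for a given prefix, each with its md5 companion."""
    expected = {}
    for key, ext in (("conda_list", "conda_list.txt"), ("conda_info", "conda_info.txt"), ("log", "log")):
        expected[key] = f"{base_out}.{ext}"
        expected[key + "_md5"] = expected[key] + ".md5"
    return expected
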
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource("cnvkit", "fix", resource)() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (segment) -------------------------------------------------------------- - - -def test_cnvkit_segment_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - expected = {"cnr": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"} - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "segment")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_segment_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - base_name_out = ( - "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" - ) - expected = {"segments": base_name_out, "segments_md5": base_name_out + ".md5"} - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "segment") - assert actual == expected - - -def test_cnvkit_segment_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.segment.{library_name}" - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "segment") - assert actual == expected - - -def test_cnvkit_segment_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
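
The per-resource assertion loop repeated in these tests could equally be written as a parametrized test; a sketch assuming the same pytest fixture, shown here for the segment action and its expected values:

import pytest

@pytest.mark.parametrize(
    "resource,expected",
    [("threads", 1), ("time", "03:59:59"), ("memory", "7680M"), ("partition", "medium")],
)
def test_cnvkit_segment_resources(somatic_targeted_seq_cnv_calling_workflow, resource, expected):
    actual = somatic_targeted_seq_cnv_calling_workflow.get_resource("cnvkit", "segment", resource)()
    assert actual == expected
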
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "segment", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (call) ----------------------------------------------------------------- - - -def test_cnvkit_call_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - segment_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segment.cns" - expected = {"segment": segment_file} - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "call")(wildcards) - assert actual == expected - - -def test_cnvkit_call_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - base_name_out = ( - "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.call.cns" - ) - expected = {"calls": base_name_out, "calls_md5": base_name_out + ".md5"} - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "call") - assert actual == expected - - -def test_cnvkit_call_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.call.{library_name}" - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "call") - assert actual == expected - - -def test_cnvkit_call_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
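
The call action tested above wraps ``cnvkit.py call``. Note that the thresholds are passed with an ``=`` so that the leading minus of the first value is not parsed as a new flag. A hedged sketch, using cnvkit's documented default thresholds:

import subprocess

cns = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segment.cns"
out = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.call.cns"
# "-t=" keeps the leading minus of the first threshold from being read as a flag.
subprocess.run(
    ["cnvkit.py", "call", cns, "-m", "threshold", "-t=-1.1,-0.25,0.2,0.7", "-o", out],
    check=True,
)
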
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "call", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (postprocess) ---------------------------------------------------------- - - -def test_cnvkit_postprocess_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - segment_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segment.cns" - call_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.call.cns" - expected = {"segment": segment_file, "call": call_file} - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "postprocess")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_postprocess_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" - expected = { - "final": base_name_out + "_dnacopy.seg", - "final_md5": base_name_out + "_dnacopy.seg.md5", - } - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "postprocess") - assert actual == expected - - -def test_cnvkit_postprocess_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.postprocess.{library_name}" - ) - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "postprocess") - assert actual == expected - - -def test_cnvkit_postprocess_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'postprocess'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
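
The ``_dnacopy.seg`` file expected by the postprocess tests above is a DNAcopy-style SEG table, presumably produced with ``cnvkit.py export seg`` or an equivalent conversion. A sketch under that assumption:

import subprocess

cns = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.call.cns"
seg = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1_dnacopy.seg"
# cnvkit.py export seg <segments.cns> -o <table.seg>
subprocess.run(["cnvkit.py", "export", "seg", cns, "-o", seg], check=True)
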
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "postprocess", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (plot) ----------------------------------------------------------------- - - -def test_cnvkit_plot_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr" - cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.call.cns" - expected = { - "cnr": cnr_file, - "cns": cns_file, - } - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "plot")(wildcards) - assert actual == expected - - -def test_cnvkit_plot_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - expected = {} - tpl = ( - "work/{{mapper}}.cnvkit.{{library_name}}/report/" - "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}" - ) - for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")): - expected[plot] = tpl.format(plot=plot, ext=ext) - expected[plot + "_md5"] = expected[plot] + ".md5" - tpl = ( - "work/{{mapper}}.cnvkit.{{library_name}}/report/" - "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}" - ) - for plot, ext in (("heatmap", "pdf"), ("scatter", "png")): - for chrom in chain(range(1, 23), ("X", "Y")): - key = "{plot}_chr{chrom}".format(plot=plot, chrom=str(chrom)) - expected[key] = tpl.format(plot=plot, ext=ext, chrom=str(chrom)) - expected[key + "_md5"] = expected[key] + ".md5" - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "plot") - assert actual == expected - - -def test_cnvkit_plot_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - expected = get_expected_log_files_dict( - base_out="work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.plot.{library_name}" - ) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "plot") - assert actual == expected - - -def test_cnvkit_plot_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" - # Define expected - expected_dict = {"threads": 1, "time": "08:00:00", "memory": "30720M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
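
The chromosome-wise plot outputs exercised above map onto one ``cnvkit.py scatter`` call per chromosome. A sketch; the chromosome labels must match the naming used in the reference (``1`` vs ``chr1``), and the output paths are abbreviated here:

import subprocess
from itertools import chain

cnr = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"
cns = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.call.cns"
for chrom in chain(range(1, 23), ("X", "Y")):
    # cnvkit.py scatter <ratios.cnr> -s <segments.cns> -c <chrom> -o <plot.png>
    subprocess.run(
        ["cnvkit.py", "scatter", cnr, "-s", cns, "-c", str(chrom), "-o", f"scatter.chr{chrom}.png"],
        check=True,
    )
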
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "plot", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (export) --------------------------------------------------------------- - - -def test_cnvkit_export_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - expected = { - "cns": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.call.cns" - } - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "export")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_export_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - expected = {} - base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" - for key, ext in ( - ("bed", "bed.gz"), - ("bed_tbi", "bed.gz.tbi"), - ("seg", "seg"), - ("vcf", "vcf.gz"), - ("vcf_tbi", "vcf.gz.tbi"), - ): - expected[key] = base_name_out + "." + ext - expected[key + "_md5"] = expected[key] + ".md5" - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "export") - assert actual == expected - - -def test_cnvkit_export_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.export.{library_name}" - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "export") - assert actual == expected - - -def test_cnvkit_export_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "export", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (report) --------------------------------------------------------------- - - -def test_cnvkit_report_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr" - cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.call.cns" - target_file = ( - "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.targetcoverage.cnn" - ) - antitarget_file = ( - "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.antitargetcoverage.cnn" - ) - expected = { - "cnr": cnr_file, - "cns": cns_file, - "target": target_file, - "antitarget": antitarget_file, - } - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "report")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_report_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - expected = {} - base_name_out = "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}" - for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics"): - expected[report] = base_name_out + "." 
+ report + ".txt" - expected[report + "_md5"] = expected[report] + ".md5" - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "report") - assert actual == expected - - -def test_cnvkit_report_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.report.{library_name}" - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "report") - assert actual == expected - - -def test_cnvkit_report_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "report", resource - )() - assert actual == expected, msg_error - - # Tests for CopywriterStepPart ------------------------------------------------------------------- @@ -1046,7 +629,6 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call expected = [ "cnvetti_off_target", "cnvetti_on_target", - "cnvkit", "copywriter", "link_out", "purecn", @@ -1106,53 +688,6 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call "targets_segmented.txt.md5", ) ] - # cnvkit - # tpl = f"output/{name_pattern}/out/{name_pattern}".format(method="cnvkit") + "{ext}{md5}" - # expected += [ - # tpl.format(i=i, t=t, ext=ext, md5=md5) - # for i, t in ((1, 1), (2, 1), (2, 2)) - # for ext in ( - # ".cnr", - # "_dnacopy.seg", - # ".bed.gz", - # ".bed.gz.tbi", - # ".seg", - # ".vcf.gz", - # ".vcf.gz.tbi", - # ) - # for md5 in ("", ".md5") - # ] - # tpl = ( - # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - # + ".{plot}.{ext}{md5}" - # ) - # expected += [ - # tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) - # for i, t in ((1, 1), (2, 1), (2, 2)) - # for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) - # for md5 in ("", ".md5") - # ] - # tpl = ( - # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - # + ".{plot}.chr{chrom}.{ext}{md5}" - # ) - # expected += [ - # tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) - # for i, t in ((1, 1), (2, 1), (2, 2)) - # for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) - # for chrom in chain(range(1, 23), ("X", "Y")) - # for md5 in ("", ".md5") - # ] - # tpl = ( - # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - # + ".{report}.txt{md5}" - # ) - # expected += [ - # tpl.format(i=i, t=t, report=report, md5=md5) - # for i, t in ((1, 1), (2, 1), (2, 2)) - # for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") - # for md5 in ("", ".md5") - # ] # copywriter tpl = f"output/{name_pattern}/out/{name_pattern}".format(method="copywriter") + "_{ext}{md5}" expected += [ From 42802e6f298cfc88604821964da27305d449819f Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 11:54:27 +0100 Subject: [PATCH 23/46] refactor: use abstract cnvkit wrapper --- .../wrappers/cnvkit/call/wrapper.py | 120 ++++++++---------- .../wrappers/cnvkit/fix/wrapper.py | 76 +++-------- .../wrappers/cnvkit/segment/wrapper.py | 86 ++++--------- 3 
files changed, 95 insertions(+), 187 deletions(-) diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py index 987a502ef..c77d8863b 100644 --- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py @@ -1,72 +1,60 @@ # -*- coding: utf-8 -*- """Wrapper vor cnvkit.py call""" -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step]["cnvkit"] - -center = config["center"] -if center: - if center in set("mean", "median", "mode", "biweight"): - center = " --center " + center - else: - center = " --center-at" + center - -gender = " --gender {}".format(config["gender"]) if config["gender"] else "" -male = " --male-reference" if config["male_reference"] else "" - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. -conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} - -set -x - -# ----------------------------------------------------------------------------- - +import re + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +class CnvkitWrapperCall(CnvkitWrapper): + PURITY_PATTERN = re.compile("^Purity: +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) *$") + PLOIDY_PATTERN = re.compile("^Ploidy: +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) 
*$") + + def preamble(self): + if "purity" in self.snakemake.input: + with open(self.snakemake.input.purity, "rt") as f: + for line in f: + m = CnvkitWrapperCall.PURITY_PATTERN.match(line.strip()) + if m: + self.purity = float(m.groups()[1]) + else: + m = CnvkitWrapperCall.PLOIDY_PATTERN.match(line.strip()) + if m: + self.ploidy = float(m.groups()[1]) + else: + self.purity = self.snakemake.params.purity if "purity" in self.snakemake.params else None + self.ploidy = self.snakemake.params.ploidy if "ploidy" in self.snakemake.params else None + + self.cmd = self.cmd.format(purity=self.purity, ploidy=self.ploidy) + +if "variants" in snakemake.input: + variants = r""" + ---vcf {snakemake.input.variants} \ + {snakemake.params.sample_id} {snakemake.params.normal_id} \ + {snakemake.params.min_variant_depth} {snakemake.params.zygocity_freq} + """.format( + snakemake=snakemake, + ) +else: + variants = "" + +cmd = r""" cnvkit.py call \ - --output {snakemake.output.calls} \ - --method {config[calling_method]} \ - --thresholds={config[call_thresholds]} \ - $(if [[ "{config[filter]}" ]]; then \ - echo --filter {config[filter]} - fi) \ - {center} {gender} {male} \ - --ploidy {config[ploidy]} \ - $(if [[ {config[purity]} -gt 0 ]]; then \ - echo --purity {config[purity]} - fi) \ - {snakemake.input} - -d=$(dirname "{snakemake.output.calls}") -pushd $d -fn=$(basename "{snakemake.output.calls}") -md5sum $fn > $fn.md5 -popd -""" + -o {snakemake.output.calls} \ + --method {snakemake.params.method} --thresholds={snakemake.params.thresholds} \ + --filter {snakemake.params.filter} \ + {center} \ + {drop_low_coverage} \ + {sample_sex} {male_reference} \ + {variants} \ + {{purity}} {{ploidy}} \ + {snakemake.input.segments} +""".format( + snakemake=snakemake, + center=f"--center-at {snakemake.params.center_at}" if "center_at" in snakemake.params else f"--center {snakemake.params.center}", + drop_low_coverage="--drop-low-coverage" if snakemake.params.drop_low_coverage else "", + sample_sex=f"--sample-sex {snakemake.params.sample_sex}" if "sample_sex" in snakemake.params else "", + male_reference="--male-reference" if snakemake.params.male_reference else "", + variants=variants, ) -# Compute MD5 sums of logs. 
-shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) +CnvkitWrapperCall(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py index 537ff5f77..97387dbcb 100644 --- a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py @@ -1,68 +1,24 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py fix""" -from snakemake.shell import shell +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step]["cnvkit"] - -if "ref" in snakemake.input.keys(): - ref = snakemake.input.target -elif "path_panel_of_normals" in config.keys(): - ref = config["path_panel_of_normals"] -else: - raise Exception("Unsupported naming") - -gender = " --gender {}".format(config["gender"]) if config["gender"] else "" -male = " --male-reference" if config["male_reference"] else "" -no_gc = " --no-gc" if not config["gc_correction"] else "" -no_edge = " --no-edge" if not config["edge_correction"] else "" -no_rmask = " --no-rmask" if not config["rmask_correction"] else "" - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. -conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} - -set -x - -# ----------------------------------------------------------------------------- +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" +cmd = r""" cnvkit.py fix \ - --output {snakemake.output.ratios} \ - {gender} {male} {no_gc} {no_edge} {no_rmask} \ - {snakemake.input.target} \ - {snakemake.input.antitarget} \ - {ref} - -d=$(dirname "{snakemake.output.ratios}") -pushd $d -fn=$(basename "{snakemake.output.ratios}") -md5sum $fn > $fn.md5 -popd -""" + -o {snakemake.output.coverage} \ + {cluster} {snakemake.params.sample_id} \ + {no_gc} {no_edge} {no_rmask} \ + {snakemake.input.target} {antitarget} {snakemake.input.reference} +""".format( + snakemake=snakemake, + cluster="--cluster" if snakemake.params.cluster else "", + no_gc="--no-gc" if snakemake.params.no_gc else "", + no_edge="--no-edge" if snakemake.params.no_edge else "", + no_rmask="--no-rmask" if snakemake.params.no_rmask else "", + antitarget=f"{snakemake.input.antitarget}" if "antitarget" in snakemake.input else "", ) -# Compute MD5 sums of logs. 
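
For reference, the ``cnvkit.py fix`` call assembled above takes its inputs positionally: target coverage, antitarget coverage, then the reference. A sketch with a hypothetical panel-of-normals path:

import subprocess

prefix = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1"
# Positional order matters: target coverage, antitarget coverage, reference.
subprocess.run(
    ["cnvkit.py", "fix",
     f"{prefix}.targetcoverage.cnn",
     f"{prefix}.antitargetcoverage.cnn",
     "panel_of_normals.cnn",  # hypothetical reference path
     "-o", f"{prefix}.cnr"],
    check=True,
)
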
-shell(
-    r"""
-md5sum {snakemake.log.log} >{snakemake.log.log_md5}
-"""
-)
+CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
index 5a02054e2..648e14a2a 100644
--- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
@@ -1,67 +1,31 @@
 # -*- coding: utf-8 -*-
 """Wrapper vor cnvkit.py segment"""
 
-from snakemake.shell import shell
-
-__author__ = "Manuel Holtgrewe"
-__email__ = "manuel.holtgrewe@bih-charite.de"
-
-step = snakemake.config["pipeline_step"]["name"]
-config = snakemake.config["step_config"][step]["cnvkit"]
-
-method = config["segmentation_method"]
-if method == "cbs" and config["smooth_cbs"]:
-    method += " --smooth-cbs"
-
-if float(config["segmentation_threshold"]) > 0:
-    threshold = " --threshold " + str(config["segmentation_threshold"])
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
+
+if "variants" in snakemake.input:
+    variants = r"""
+    --vcf {snakemake.input.variants} \
+    {snakemake.params.sample_id} {snakemake.params.normal_id} \
+    {snakemake.params.min_variant_depth} {snakemake.params.zygocity_freq}
+    """.format(
+        snakemake=snakemake,
+    )
 else:
-    threshold = ""
-
-shell(
-    r"""
-# Also pipe everything to log file
-if [[ -n "{snakemake.log.log}" ]]; then
-    if [[ "$(set +e; tty; set -e)" != "" ]]; then
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        exec &> >(tee -a "{snakemake.log.log}" >&2)
-    else
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        echo "No tty, logging disabled" >"{snakemake.log.log}"
-    fi
-fi
-
-# Write out information about conda installation.
-conda list >{snakemake.log.conda_list}
-conda info >{snakemake.log.conda_info}
-md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
-md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
-
-set -x
-
-# -----------------------------------------------------------------------------
-
-cnvkit.py segment \
-    --output {snakemake.output.segments} \
-    --method {method} \
-    $(if [[ "{config[drop_low_coverage]}" = "True" ]]; then \
-        echo --drop-low-coverage
-    fi) \
-    {threshold} \
-    --drop-outliers {config[drop_outliers]} \
-    {snakemake.input}
-
-d=$(dirname "{snakemake.output.segments}")
-pushd $d
-fn=$(basename "{snakemake.output.segments}")
-md5sum $fn > $fn.md5
-popd
-"""
+    variants = ""
+
+cmd = r"""
+cnvkit.py segment --processes {snakemake.params.processes} \
+    -o {snakemake.output.segments} --dataframe {snakemake.output.dataframe} \
+    --method {snakemake.params.method} --threshold {snakemake.params.threshold} {smooth_cbs} \
+    {drop_low_coverage} --drop-outliers {snakemake.params.drop_outliers} \
+    {variants} \
+    {snakemake.input.coverage}
+""".format(
+    snakemake=snakemake,
+    smooth_cbs="--smooth-cbs" if snakemake.params.smooth_cbs else "",
+    drop_low_coverage="--drop-low-coverage" if snakemake.params.drop_low_coverage else "",
+    variants=variants,
 )
-# Compute MD5 sums of logs.
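
All of the rewritten wrappers follow the same pattern: build ``cmd`` at module level, then delegate execution to ``CnvkitWrapper`` (introduced in patch 27 below), whose ``run()`` renders header, command and footer into the ``log.sh`` script. A minimal subclassing sketch; ``snakemake`` is assumed to be injected by Snakemake's wrapper machinery at runtime:

from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper

class CnvkitWrapperExample(CnvkitWrapper):
    """Sketch of the contract: preamble() finalizes self.command before
    run() writes header + command + footer to {snakemake.log.sh}."""

    def preamble(self):
        # Fill deferred fields before the script is rendered.
        self.command = self.command.format(extra_args="--drop-low-coverage")

CnvkitWrapperExample(snakemake, "cnvkit.py segment {extra_args} ...").run()  # noqa: F821
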
-shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) +CnvkitWrapper(snakemake, cmd).run() From 4194ed935b4fadaae0a04bac68c88145465a4e0d Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 11:56:17 +0100 Subject: [PATCH 24/46] refactor: mbcs vs somatic combined tool naming --- .../workflows/ngs_mapping/model.py | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py index d390b0528..1f64a02be 100644 --- a/snappy_pipeline/workflows/ngs_mapping/model.py +++ b/snappy_pipeline/workflows/ngs_mapping/model.py @@ -35,6 +35,7 @@ class MetaTool(Enum): ) }, ) +"""DNA mappers or (mbcs) meta-tool""" class Tools(SnappyModel): @@ -149,17 +150,6 @@ class BarcodeTool(Enum): AGENT = "agent" -class Somatic(SnappyModel): - mapping_tool: DnaMapper - """Either bwa of bwa_mem2. The indices & other parameters are taken from mapper config""" - - barcode_tool: BarcodeTool = BarcodeTool.AGENT - """Only agent currently implemented""" - - use_barcodes: bool = False - recalibrate: bool = True - - class Bqsr(SnappyModel): common_variants: str """Common germline variants (see /fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK)""" @@ -277,9 +267,13 @@ class Minimap2(SnappyModel): class Mbcs(SnappyModel): mapping_tool: DnaMapper - barcode_tool: BarcodeTool - use_barcodes: bool - recalibrate: bool + """Either bwa of bwa_mem2. The indices & other parameters are taken from mapper config""" + + barcode_tool: BarcodeTool = BarcodeTool.AGENT + """Only agent currently implemented""" + + use_barcodes: bool = False + recalibrate: bool = True class NgsMapping(SnappyStepModel): @@ -321,8 +315,6 @@ class NgsMapping(SnappyStepModel): minimap2: Minimap2 | None = None - mbcs: Mbcs | None = None - @model_validator(mode="after") def ensure_tools_are_configured(self): for data_type in ("dna", "rna", "dna_long"): From e6413305c5389a5cbe9bbd8e6e52e1345dd3e7dc Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 11:58:11 +0100 Subject: [PATCH 25/46] featu: Adding the somatic cnv calling step --- snappy_pipeline/apps/snappy_snake.py | 2 ++ snappy_pipeline/workflow_model.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py index f5f8bf3c2..9e99f28f0 100644 --- a/snappy_pipeline/apps/snappy_snake.py +++ b/snappy_pipeline/apps/snappy_snake.py @@ -31,6 +31,7 @@ ngs_mapping, panel_of_normals, repeat_expansion, + somatic_cnv_calling, somatic_cnv_checking, somatic_gene_fusion_calling, somatic_hla_loh_calling, @@ -82,6 +83,7 @@ "ngs_data_qc": ngs_data_qc, "panel_of_normals": panel_of_normals, "repeat_analysis": repeat_expansion, + "somatic_cnv_calling": somatic_cnv_calling, "somatic_cnv_checking": somatic_cnv_checking, "somatic_gene_fusion_calling": somatic_gene_fusion_calling, "somatic_hla_loh_calling": somatic_hla_loh_calling, diff --git a/snappy_pipeline/workflow_model.py b/snappy_pipeline/workflow_model.py index aeda7682e..498bb9f93 100644 --- a/snappy_pipeline/workflow_model.py +++ b/snappy_pipeline/workflow_model.py @@ -21,6 +21,7 @@ from snappy_pipeline.workflows.ngs_mapping.model import NgsMapping from snappy_pipeline.workflows.panel_of_normals.model import PanelOfNormals from snappy_pipeline.workflows.repeat_expansion.model import RepeatExpansion +from snappy_pipeline.workflows.somatic_cnv_calling.model import SomaticCnvCalling from 
snappy_pipeline.workflows.somatic_cnv_checking.model import SomaticCnvChecking from snappy_pipeline.workflows.somatic_gene_fusion_calling.model import SomaticGeneFusionCalling from snappy_pipeline.workflows.somatic_hla_loh_calling.model import SomaticHlaLohCalling @@ -109,6 +110,7 @@ class StepConfig(TypedDict, total=False): ngs_mapping: NgsMapping panel_of_normals: PanelOfNormals repeat_expansion: RepeatExpansion + somatic_cnv_calling: SomaticCnvCalling somatic_cnv_checking: SomaticCnvChecking somatic_gene_fusion_calling: SomaticGeneFusionCalling somatic_hla_loh_calling: SomaticHlaLohCalling From 88c8ae3137b79319ad19da0db091ceb0a86dddf9 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 12:00:24 +0100 Subject: [PATCH 26/46] refactor: parallel wrapper configuration model --- snappy_pipeline/models/__init__.py | 40 ++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/snappy_pipeline/models/__init__.py b/snappy_pipeline/models/__init__.py index 282a05ab9..438145235 100644 --- a/snappy_pipeline/models/__init__.py +++ b/snappy_pipeline/models/__init__.py @@ -79,6 +79,46 @@ def keys(self): return self.model_fields.keys() +class Parallel(SnappyModel): + num_cores: int = 2 + """number of cores to use locally""" + + window_length: int = 3500000 + """split input into windows of this size, each triggers a job""" + + num_jobs: int = 500 + """number of windows to process in parallel""" + + use_profile: bool = True + """use Snakemake profile for parallel processing""" + + restart_times: int = 5 + """number of times to re-launch jobs in case of failure""" + + max_jobs_per_second: int = 2 + """throttling of job creation""" + + max_status_checks_per_second: int = 10 + """throttling of status checks""" + + debug_trunc_tokens: int = 0 + """truncation to first N tokens (0 for none)""" + + keep_tmpdir: KeepTmpdir = KeepTmpdir.never + """keep temporary directory, {always, never, onerror}""" + + job_mult_memory: float = 1 + """memory multiplier""" + + job_mult_time: float = 1 + """running time multiplier""" + + merge_mult_memory: float = 1 + """memory multiplier for merging""" + + merge_mult_time: float = 1 + + # This exists to distinguish workflow step_config models from other snappy specific models # It also provides a default_config_yaml_string method that includes the step_config section # by default. 
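
A minimal usage sketch for the new ``Parallel`` block, assuming the pydantic-based ``SnappyModel`` semantics used throughout ``snappy_pipeline``: defaults apply unless overridden by the step configuration.

from snappy_pipeline.models import Parallel  # module patched above

cfg = Parallel(num_jobs=100, job_mult_memory=2.0)
assert cfg.window_length == 3_500_000  # default kept
assert cfg.num_jobs == 100             # overridden
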
From b0e0154e761720a5c5dc139593cfbf7536e005ce Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 12:02:51 +0100 Subject: [PATCH 27/46] refactor: use abstract cnvkit wrapper --- snappy_wrappers/wrappers/cnvkit/__init__.py | 0 .../wrappers/cnvkit/cnvkit_wrapper.py | 85 +++++++++++++++++++ .../wrappers/cnvkit/environment.yaml | 6 +- 3 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 snappy_wrappers/wrappers/cnvkit/__init__.py create mode 100644 snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py diff --git a/snappy_wrappers/wrappers/cnvkit/__init__.py b/snappy_wrappers/wrappers/cnvkit/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py b/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py new file mode 100644 index 000000000..a6a8accdf --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py @@ -0,0 +1,85 @@ +"""Abstract wrapper for cnvkit.py""" + +import textwrap + +from snakemake.shell import shell + +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" + + +class CnvkitWrapper: + header = r""" + # Also pipe everything to log file + if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec &> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi + fi + + compute_md5() {{ + fn=$1 + f=$(basename $fn) + d=$(dirname $fn) + pushd $d 1> /dev/null 2>&1 + md5sum $f > $f.md5 + popd 1> /dev/null 2>&1 + }} + + # Write out information about conda installation. + conda list >{snakemake.log.conda_list} + conda info >{snakemake.log.conda_info} + compute_md5 {snakemake.log.conda_list} + compute_md5 {snakemake.log.conda_info} + + # Create temp directory + TMPDIR=$(mktemp -d) + + set -x + + # --------------------------------- Start command ----------------------------------------- + """ + + footer = r""" + # --------------------------------- End command ------------------------------------------- + + for fn in {snakemake.output} + do + compute_md5 $fn + done + compute_md5 {snakemake.log.sh} + """ + + md5_log = r""" + f=$(basename {log}) + d=$(dirname {log}) + pushd $d 1> /dev/null 2>&1 + md5sum $f > $f.md5 + popd 1> /dev/null 2>&1 + """ + + def __init__(self, snakemake, command) -> None: + self.snakemake = snakemake + self.command = command + + def preamble(self): + pass + + def run(self) -> None: + self.preamble() + + with open(self.snakemake.log.sh, "wt") as f: + print( + textwrap.dedent( + "\n".join((CnvkitWrapper.header, self.command, CnvkitWrapper.footer)) + ), + file=f, + ) + + shell(self.snakemake.log.sh) + + shell(CnvkitWrapper.md5_log.format(log=str(self.snakemake.log.log))) diff --git a/snappy_wrappers/wrappers/cnvkit/environment.yaml b/snappy_wrappers/wrappers/cnvkit/environment.yaml index c76be6cb8..5c478d874 100644 --- a/snappy_wrappers/wrappers/cnvkit/environment.yaml +++ b/snappy_wrappers/wrappers/cnvkit/environment.yaml @@ -3,6 +3,6 @@ channels: - bioconda - nodefaults dependencies: - - python==3.10.14 - - cnvkit==0.9.10 - - htslib==1.19.1 + - python=3.12 + - cnvkit==0.9.8 + - htslib=1.21 From bb1961b4c70c09b1f350a5c36ba173f909efac3d Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 12:04:52 +0100 Subject: [PATCH 28/46] refactor: cnvkit panel of normals with improved logic (WIP) --- .../workflows/panel_of_normals/Snakefile | 
48 +-
 .../workflows/panel_of_normals/__init__.py    | 461 ++++++++++++------
 .../workflows/panel_of_normals/model.py       |  30 +-
 .../wrappers/cnvkit/access/wrapper.py         |  62 +--
 .../wrappers/cnvkit/antitarget/wrapper.py     |  78 +--
 .../wrappers/cnvkit/coverage/wrapper.py       |  99 +---
 .../wrappers/cnvkit/reference/wrapper.py      | 112 +----
 .../wrappers/cnvkit/target/wrapper.py         | 128 ++---
 .../test_workflows_panel_of_normals.py        | 253 ++++++----
 9 files changed, 637 insertions(+), 634 deletions(-)

diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile
index 88e132382..5acd85d39 100644
--- a/snappy_pipeline/workflows/panel_of_normals/Snakefile
+++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile
@@ -113,6 +113,8 @@ rule panel_of_normals_access_run:
         partition=wf.get_resource("access", "run", "partition"),
     log:
         **wf.get_log_file("access", "run"),
+    params:
+        **wf.get_params("access", "run"),
     wrapper:
         wf.wrapper_path("cnvkit/access")


# Write out the normals-only results for the normals --------------------------


+rule panel_of_normals_cnvkit_access:
+    input:
+        unpack(wf.get_input_files("cnvkit", "access")),
+    output:
+        **wf.get_output_files("cnvkit", "access"),
+    threads: wf.get_resource("cnvkit", "access", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "access", "time"),
+        memory=wf.get_resource("cnvkit", "access", "memory"),
+        partition=wf.get_resource("cnvkit", "access", "partition"),
+    log:
+        **wf.get_log_file("cnvkit", "access"),
+    params:
+        **{"args": wf.get_params("cnvkit", "access")},
+    wrapper:
+        wf.wrapper_path("cnvkit/access")
+
+
+rule panel_of_normals_cnvkit_autobin:
+    input:
+        unpack(wf.get_input_files("cnvkit", "autobin")),
+    output:
+        **wf.get_output_files("cnvkit", "autobin"),
+    threads: wf.get_resource("cnvkit", "autobin", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "autobin", "time"),
+        memory=wf.get_resource("cnvkit", "autobin", "memory"),
+        partition=wf.get_resource("cnvkit", "autobin", "partition"),
+    log:
+        **wf.get_log_file("cnvkit", "autobin"),
+    params:
+        **{"args": wf.get_params("cnvkit", "autobin")},
+    wrapper:
+        wf.wrapper_path("cnvkit/autobin")
+
+
 rule panel_of_normals_cnvkit_target:
     input:
         unpack(wf.get_input_files("cnvkit", "target")),
@@ -133,7 +171,7 @@ rule panel_of_normals_cnvkit_target:
     log:
         **wf.get_log_file("cnvkit", "target"),
     params:
-        args=wf.substep_dispatch("cnvkit", "get_args", "target"),
+        **{"args": wf.get_params("cnvkit", "target")},
     wrapper:
         wf.wrapper_path("cnvkit/target")

@@ -151,7 +189,7 @@ rule panel_of_normals_cnvkit_antitarget:
     log:
         **wf.get_log_file("cnvkit", "antitarget"),
     params:
-        args=wf.substep_dispatch("cnvkit", "get_args", "antitarget"),
+        **{"args": wf.get_params("cnvkit", "antitarget")},
     wrapper:
         wf.wrapper_path("cnvkit/antitarget")

@@ -169,7 +207,7 @@ rule panel_of_normals_cnvkit_coverage:
     log:
         **wf.get_log_file("cnvkit", "coverage"),
     params:
-        args=wf.substep_dispatch("cnvkit", "get_args", "coverage"),
+        **{"args": wf.get_params("cnvkit", "coverage")},
    wrapper:
         wf.wrapper_path("cnvkit/coverage")

@@ -190,7 +228,7 @@ rule panel_of_normals_cnvkit_create_panel:
     log:
         **wf.get_log_file("cnvkit", "create_panel"),
     params:
-        args=wf.substep_dispatch("cnvkit", "get_args", "create_panel"),
+        **{"args": wf.get_params("cnvkit", "create_panel")},
     wrapper:
         wf.wrapper_path("cnvkit/reference")

@@ -208,7 +246,7 @@ rule panel_of_normals_cnvkit_report:
     log:
         **wf.get_log_file("cnvkit", "report"),
     params:
-        args=wf.substep_dispatch("cnvkit", "get_args", "report"),
+        **{"args": wf.get_params("cnvkit", "report")},
     wrapper:
         wf.wrapper_path("cnvkit/report")

diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py
index d99755214..77d76abc9 100644
--- a/snappy_pipeline/workflows/panel_of_normals/__init__.py
+++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py
@@ -68,12 +68,10 @@
 .. note::

     In a nutshell, for exome data, the accessibility file is only used to create antitarget regions.
-    For genome data, it is used by the ``autobin`` tool to compute the average target size used during target regions creation.
-    If it is present, the target size is computed in amplicon mode, and when it is absent,
-    an accessibility file is created with default settings, which value is used by ``autobin`` is whole genome mode.
+    These regions are essentially the accessible regions minus the target regions (with edge effect correction).

-To generate the access file from a bed file containing regions to exclude from further coverage computations,
-the user must proceed in two steps:
+Access files can be generated from the genome reference ``fasta`` file, and optionally ``bed`` file(s) containing regions to exclude from further computations.
+In that case, the user must proceed in two steps:

 First, she needs to run the ``access`` tool to create the desired access file
@@ -86,6 +84,8 @@

 This will create ``output/cnvkit.access/out/cnvkit.access.bed`` from the genomic sequence & excluded regions.

+When there are no exclusion regions, the access file is created automatically from the reference genome alone, with masked regions removed.
+
 ------------------------
 Panel of normal creation
 ------------------------
@@ -119,6 +119,28 @@

 The cnvkit authors recommend to check these reports to ensure that all data is suitable for panel of normal creation.

+----------------------
+Notes on the algorithm
+----------------------
+
+The choice of steps depends on the library type: whole exome sequencing is different from whole genome sequencing (panel not implemented yet).
+
+The reference is assembled from the coverages computed for all normal samples.
+The coverage is always computed on target regions, and separately on antitarget regions only for exome data, not for whole genome or panel data.
+
+For exome and panel data, target regions are obtained from the baits bed file, adding gene information & edge effects correction in the case of exome data.
+For WGS data, the target regions are the full accessible regions in the genome. The user can define those accessible regions (using ``access``).
+When she leaves this option empty, the accessible regions are defined automatically from the reference genome.
+
+To create the target regions from the baits (or from the accessible regions), the target average bin size must be set.
+There is a reasonable default value for exome data, but an additional ``autobin`` step is required for whole genome data.
+In ``batch`` mode, this value is computed from the coverage over the full genome.
+
+.. note::
+
+    The ``cnvkit batch`` command also allows the creation of a flat reference when there are no normal samples.
+    This is not implemented in the ``panel_of_normals`` step, for obvious reasons.
+ ================ Notes ``purecn`` ================ @@ -147,6 +169,8 @@ """ +from enum import StrEnum + from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions from snappy_pipeline.utils import dictify, listify @@ -169,6 +193,13 @@ DEFAULT_CONFIG = PanelOfNormalsConfigModel.default_config_yaml_string() +#: Known library types +class LibraryType(StrEnum): + WES = "WES" + WGS = "WGS" + Panel = "Panel" + + class PanelOfNormalsStepPart(BaseStepPart): """Base class for panel of normals step parts @@ -182,29 +213,65 @@ class PanelOfNormalsStepPart(BaseStepPart): def __init__(self, parent): super().__init__(parent) # Build shortcut from cancer bio sample name to matched cancer sample - self.normal_libraries = list(self._get_normal_libraries()) + known_libraries = self._get_normal_libraries() + self.normal_libraries = list(known_libraries.keys()) if self.name and (cfg := self.config.get(self.name)): if path := cfg.get("path_normals_list"): self.normal_libraries = [] with open(path, "rt") as f: for line in f: + if line.startswith("#"): + continue self.normal_libraries.append(line.strip()) + self.libraryType, self.libraryKit = self._validate_normal_libraries(known_libraries) def _get_normal_libraries(self): + normal_libraries = {} for sheet in self.parent.shortcut_sheets: for donor in sheet.donors: - for _, bio_sample in donor.bio_samples.items(): + for bio_sample in donor.bio_samples.values(): if bio_sample.is_tumor: continue - for _, test_sample in bio_sample.test_samples.items(): + for test_sample in bio_sample.test_samples.values(): extraction_type = test_sample.extra_infos.get("extractionType", "DNA") if extraction_type.lower() == "dna": - for _, ngs_library in test_sample.ngs_libraries.items(): - yield ngs_library.name + for library in test_sample.ngs_libraries.values(): + normal_libraries[library.name] = self._get_extra_info(library) + return normal_libraries + + def _validate_normal_libraries(self, known_libraries): + libraryType = None + libraryKit = None + for library in self.normal_libraries: + assert ( + library in known_libraries + ), f"Unknown normal library {library} requested to build pon" + assert ( + libraryType is None or libraryType == known_libraries[library]["libraryType"] + ), "Panel of normal cannot be built from multiple library types" + libraryType = known_libraries[library]["libraryType"] + if libraryType == LibraryType.WES: + assert ( + libraryKit is None or libraryKit == known_libraries[library]["libraryKit"] + ), "Panel of normal cannot be built from multiple library kits" + libraryKit = known_libraries[library]["libraryKit"] + return (libraryType, libraryKit) + + @staticmethod + def _get_extra_info(library): + extra_info = {} + assert "libraryType" in library.extra_infos, f"Undefined type of library {library.name}" + extra_info["libraryType"] = library.extra_infos.get("libraryType", "Illumina") + if extra_info["libraryType"] == LibraryType.WES: + assert ( + "libraryKit" in library.extra_infos + ), f"Undefined exome kit for library {library.name}" + extra_info["libraryKit"] = library.extra_infos.get("libraryKit", "__default__") + return extra_info @staticmethod @dictify - def _get_log_file(tpl): + def _get_log_file(tpl, has_sh=False): """Return all log files files""" ext_dict = { "conda_list": "conda_list.txt", @@ -214,6 +281,9 @@ def _get_log_file(tpl): "log": "log", "log_md5": "log.md5", } + if has_sh: + ext_dict["sh"] = "sh" + ext_dict["sh_md5"] = ext_dict["sh"] + ".md5" for key, ext in ext_dict.items(): yield key, tpl + "." 
+ ext @@ -288,6 +358,8 @@ def _get_input_files_create(self, wildcards): def get_output_files(self, action): self._validate_action(action) + if self.name not in self.config.tools: + return {} if action == "install": return {"container": "work/containers/out/purecn.simg"} @@ -320,6 +392,9 @@ def get_output_files(self, action): } def get_log_file(self, action): + if self.name not in self.config.tools: + return {} + tpls = { "install": "work/containers/log/purecn", "prepare": "work/purecn/log/{}_{}".format( @@ -422,13 +497,15 @@ def get_log_file(cls, action): class CnvkitStepPart(PanelOfNormalsStepPart): - """Somatic variant calling with MuTect 2""" + """Build reference covergage for cnvkit""" #: Step name name = "cnvkit" #: Class available actions actions = ( + "access", + "autobin", "target", "antitarget", "coverage", @@ -467,7 +544,6 @@ class CnvkitStepPart(PanelOfNormalsStepPart): def __init__(self, parent): super().__init__(parent) - self.is_wgs = self.config.cnvkit.path_target_regions == "" def check_config(self): if self.name not in self.config.tools: @@ -477,48 +553,173 @@ def check_config(self): "Path to reference FASTA not configured but required for %s" % (self.name,), ) - def get_args(self, action): - self._validate_action(action) - if self.is_wgs: - method = "wgs" - else: - method = "hybrid" - return {"method": method, "flat": (len(self.normal_libraries) == 0)} - def get_input_files(self, action): """Return input files for cnvkit panel of normals creation""" # Validate action self._validate_action(action) mapping = { + "access": self._get_input_files_access, + "autobin": self._get_input_files_autobin, "target": self._get_input_files_target, "antitarget": self._get_input_files_antitarget, "coverage": self._get_input_files_coverage, "create_panel": self._get_input_files_create_panel, "report": self._get_input_files_report, - "access": None, } return mapping[action] - def _get_input_files_target(self, wildcards): - """Helper wrapper function to estimate target average size in wgs mode""" - if not self.is_wgs: - return {} + def get_params(self, action): + """Return panel of normal files""" + if action == "access": + return self._get_params_access + elif action == "autobin": + return self._get_params_autobin + elif action == "target": + return self._get_params_target + elif action == "antitarget": + return self._get_params_antitarget + elif action == "coverage": + return self._get_params_coverage + elif action == "create_panel": + return self._get_params_create_panel + elif action == "report": + return self._get_params_report + else: + self._validate_action(action) + + def get_output_files(self, action): + """Return panel of normal files""" + if action == "access": + return self._get_output_files_access() + elif action == "autobin": + return self._get_output_files_autobin() + elif action == "target": + return self._get_output_files_target() + elif action == "antitarget": + return self._get_output_files_antitarget() + elif action == "coverage": + return self._get_output_files_coverage() + elif action == "create_panel": + return self._get_output_files_create_panel() + elif action == "report": + return self._get_output_files_report() + else: + self._validate_action(action) + + @classmethod + def get_log_file(cls, action): + """Return panel of normal files""" + tpls = { + "access": "work/{mapper}.cnvkit/log/cnvkit.access", + "autobin": "work/{mapper}.cnvkit/log/cnvkit.autobin", + "target": "work/{mapper}.cnvkit/log/cnvkit.target", + "antitarget": 
"work/{mapper}.cnvkit/log/cnvkit.antitarget", + "coverage": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage", + "create_panel": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals", + "report": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report", + } + assert action in cls.actions + return cls._get_log_file(tpls[action], has_sh=True) + + def _get_input_files_access(self, wildcards): + return {} + + def _get_params_access(self, wildcards): + return {"reference": self.w_config.static_data_config.reference.path} + + def _get_output_files_access(self): + return {"access": "work/{mapper}.cnvkit/out/cnvkit.access.bed"} + + def _get_input_files_autobin(self, wildcards): + assert ( + self.libraryType == LibraryType.WGS + ), "Trying to estimate average target size for non-WGS samples" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" bams = [ ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) for x in self.normal_libraries ] - bais = [x + ".bai" for x in bams] - input_files = {"bams": bams, "bais": bais} + input_files = {"bams": bams} + if self.config.cnvkit.get("access", "") == "": + input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) + return input_files + + def _get_params_autobin(self, wildcards): + assert ( + self.libraryType == LibraryType.WGS + ), "Trying to estimate average target size for non-WGS samples" + params = {} + if self.name in self.config.tools and self.config.cnvkit: + if self.config.cnvkit.get("access", "") == "": + params["method"] = "wgs" + else: + params["method"] = "amplicon" + params["target"] = self.config.cnvkit.get("access") + return params + + def _get_output_files_autobin(self): + return { + "result": "work/{mapper}.cnvkit/out/cnvkit.autobin.txt", + "target": "$TMPDIR/{mapper}.targets.bed", + "antitarget": "$TMPDIR/{mapper}.antitarget.bed", + } + + def _get_input_files_target(self, wildcards): + """Helper wrapper function to estimate target average size in wgs mode""" + input_files = {} + if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": + input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) + if self.config.cnvkit.get("target_avg_size", None) is None: + input_files["avg_size"] = "work/{mapper}.cnvkit/out/cnvkit.autobin.txt".format( + **wildcards + ) return input_files + def _get_params_target(self, wildcards): + params = {} + if self.name in self.config.tools: + if self.libraryType == LibraryType.WES: + params["target"] = self.config.cnvkit.path_target_regions + if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": + params["target"] = self.config.cnvkit.get("access") + if "features" in self.w_config.static_data_config: + params["annotate"] = self.w_config.static_data_config.features.path + if self.config.cnvkit.get("split", False): + params["split"] = True + if self.config.cnvkit.get("target_avg_size", None): + params["avg_size"] = self.config.cnvkit.get("target_avg_size") + return params + + def _get_output_files_target(self): + return { + "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed", + "target_md5": "work/{mapper}.cnvkit/out/cnvkit.target.bed.md5", + } + def _get_input_files_antitarget(self, wildcards): """Helper wrapper function for computing antitarget locations""" - if self.is_wgs: + if self.libraryType == LibraryType.WGS: return {} return { - "target": 
"work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards), + "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed".format(**wildcards), + } + + def _get_params_antitarget(self, widlcards): + params = {} + if self.name in self.config.tools: + params = { + "avg_size": self.config.cnvkit.antitarget_avg_size, + "min_size": self.config.cnvkit.min_size, + } + if self.config.cnvkit.get("access", "") == "": + params["access"] = self.config.cnvkit.get("access") + return params + + def _get_output_files_antitarget(self): + return { + "antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed", + "antitarget_md5": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed.md5", } def _get_input_files_coverage(self, wildcards): @@ -527,42 +728,73 @@ def _get_input_files_coverage(self, wildcards): tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" bam = ngs_mapping(tpl.format(**wildcards)) return { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards), - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed".format( - **wildcards - ), + "intervals": "work/{mapper}.cnvkit/out/cnvkit.{interval}.bed".format(**wildcards), "bam": bam, "bai": bam + ".bai", } + def _get_params_coverage(self, wildcards): + params = {} + if self.name in self.config.tools: + params = { + "reference": self.w_config.static_data_config.reference.path, + "min_mapq": self.config.cnvkit.min_mapq, + } + if self.config.cnvkit.get("count", False): + params["count"] = True + return params + + def _get_output_files_coverage(self): + return { + "coverage": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn", + "coverage_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn.md5", + } + def _get_input_files_create_panel(self, wildcards): - """Helper wrapper function for computing panel of normals""" tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn" targets = [ tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries ] - tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" - antitargets = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries - ] - tpl = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage.{ext}" - logs = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x, ext=ext) - for x in self.normal_libraries - for ext in ("log", "conda_list.txt", "conda_info.txt") - ] + if self.libraryType == LibraryType.WES: + tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" + antitargets = [ + tpl.format(mapper=wildcards["mapper"], normal_library=x) + for x in self.normal_libraries + ] + else: + antitargets = [] + return {"references": targets + antitargets} + + def _get_params_create_panel(self, wildcards): + params = {} + if self.name in self.config.tools: + params = { + "reference": self.w_config.static_data_config.reference.path, + } + if self.config.cnvkit.get("cluster", False): + params["cluster"] = True + params["min_cluster_size"] = self.config.cnvkit.min_cluster_size + if "sample_sex" in self.config.cnvkit: + params["sample_sex"] = self.config.cnvkit.gender + if self.config.cnvkit.get("male_reference", False): + params["male_reference"] = True + if self.config.cnvkit.get("diploid_parx_genome", None): + params["diploid_parx_genome"] = self.config.cnvkit.get("diploid_parx_genome") + if not 
self.config.cnvkit.get("gc_correction", True): + params["no_gc"] = True + if not self.config.cnvkit.get("rmask_correction", True): + params["no_rmask"] = True + if self.config.cnvkit.get("edge_correction", None) is None: + if self.libraryType != LibraryType.WES: + params["no_edge"] = True + elif not self.config.cnvkit.get("edge_correction"): + params["no_edge"] = True + return params + + def _get_output_files_create_panel(self): return { - "target": ( - targets - if targets - else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards) - ), - "antitarget": ( - antitargets - if antitargets - else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed".format(**wildcards) - ), - "logs": logs if targets or antitargets else [], + "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", + "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", } def _get_input_files_report(self, wildcards): @@ -580,51 +812,6 @@ def _get_input_files_report(self, wildcards): "antitarget": antitargets, } - def get_output_files(self, action): - """Return panel of normal files""" - if action == "target": - return self._get_output_files_target() - elif action == "antitarget": - return self._get_output_files_antitarget() - elif action == "coverage": - return self._get_output_files_coverage() - elif action == "create_panel": - return self._get_output_files_create_panel() - elif action == "report": - return self._get_output_files_report() - elif action == "access": - return self._get_output_files_access() - else: - self._validate_action(action) - - def _get_output_files_target(self): - return { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed", - "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5", - } - - def _get_output_files_antitarget(self): - return { - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed", - "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5", - } - - def _get_output_files_coverage(self): - return { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", - "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", - "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", - } - - def _get_output_files_create_panel(self): - return { - "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", - "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", - "log": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz", - "log_md5": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz.md5", - } - def _get_output_files_report(self): return { "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", @@ -633,26 +820,6 @@ def _get_output_files_report(self): "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", } - def _get_output_files_access(self): - return { - "access": "work/cnvkit.access/out/cnvkit.access.bed", - "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", - } - - @classmethod - def get_log_file(cls, action): - """Return panel of normal files""" - tpls = { - "target": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.target", - "antitarget": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget", - "coverage": 
"work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage", - "create_panel": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals", - "report": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report", - "access": "work/cnvkit.access/log/cnvkit.access", - } - assert action in cls.actions - return cls._get_log_file(tpls[action]) - class AccessStepPart(PanelOfNormalsStepPart): """Utility to create access file for cnvkit""" @@ -677,14 +844,25 @@ def get_input_files(self, action): def get_output_files(self, action): # Validate action self._validate_action(action) - tpl = "work/cnvkit.access/out/cnvkit.access.bed" + tpl = "work/access/out/access.bed" return {"access": tpl, "access_md5": tpl + ".md5"} + def get_params(self, action): + # Validate action + self._validate_action(action) + if self.name in self.config.tools: + return { + "reference": self.w_config.static_data_config.reference.path, + "min_gap_size": self.config.access.min_gap_size, + "exclude": self.config.access.exclude, + } + return {} + @classmethod def get_log_file(cls, action): """Return log files""" assert action in cls.actions - return cls._get_log_file("work/cnvkit.access/log/cnvkit.access") + return cls._get_log_file("work/access/log/access", has_sh=True) class PanelOfNormalsWorkflow(BaseStep): @@ -757,39 +935,38 @@ def get_result_files(self): if "cnvkit" in set(self.config.tools) & set(TOOLS): tpls = [ - ("output/{mapper}.cnvkit/out/{mapper}.cnvkit.target.{ext}", ("bed", "bed.md5")), - ("output/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.{ext}", ("bed", "bed.md5")), + ("output/{mapper}.cnvkit/out/cnvkit.target.{ext}", ("bed", "bed.md5")), + ("output/{mapper}.cnvkit/out/cnvkit.antitarget.{ext}", ("bed", "bed.md5")), ( "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}", ("cnn", "cnn.md5"), ), - ( - "output/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.{ext}", - ("tsv", "tsv.md5"), - ), - ( - "output/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.{ext}", - ("tsv", "tsv.md5"), - ), + # ( + # "output/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.{ext}", + # ("tsv", "tsv.md5"), + # ), + # ( + # "output/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.{ext}", + # ("tsv", "tsv.md5"), + # ), ] for tpl, ext_list in tpls: result_files.extend(self._expand_result_files(tpl, ext_list)) tpls = [ - "output/{mapper}.cnvkit/log/{mapper}.cnvkit.target.{ext}", - "output/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget.{ext}", + "output/{mapper}.cnvkit/log/cnvkit.target.{ext}", + "output/{mapper}.cnvkit/log/cnvkit.antitarget.{ext}", "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.{ext}", - "output/{mapper}.cnvkit/log/{mapper}.cnvkit.report.{ext}", ] for tpl in tpls: - result_files.extend(self._expand_result_files(tpl, log_ext_list)) - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{ext}" - result_files.extend(self._expand_result_files(tpl, ("", ".md5"))) + result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh", "sh.md5"])) + # tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{ext}" + # result_files.extend(self._expand_result_files(tpl, ("", ".md5"))) if "access" in set(self.config.tools) & set(TOOLS): - tpl = "output/cnvkit.access/out/cnvkit.access.bed" + tpl = "output/access/out/access.bed" result_files.extend([tpl + md5 for md5 in ("", ".md5")]) - tpl = "output/cnvkit.access/log/cnvkit.access.{ext}" - result_files.extend(self._expand_result_files(tpl, log_ext_list)) + tpl = "output/access/log/access.{ext}" + result_files.extend(self._expand_result_files(tpl, 
log_ext_list + ["sh", "sh.md5"])) if "purecn" in set(self.config.tools) & set(TOOLS): tpl = "output/{mapper}.purecn/out/{mapper}.purecn.panel_of_normals.{ext}" diff --git a/snappy_pipeline/workflows/panel_of_normals/model.py b/snappy_pipeline/workflows/panel_of_normals/model.py index ec68233e3..b7995e32c 100644 --- a/snappy_pipeline/workflows/panel_of_normals/model.py +++ b/snappy_pipeline/workflows/panel_of_normals/model.py @@ -60,6 +60,11 @@ class Mutect2(SnappyModel): """running time multiplier for merging""" +class CnvkitSex(enum.StrEnum): + MALE = "male" + FEMALE = "female" + + class CnvKit(SnappyModel): path_normals_list: str = "" """Optional file listing libraries to include in panel""" @@ -70,22 +75,19 @@ class CnvKit(SnappyModel): access: str = "" """Access bed file (output/cnvkit.access/out/cnvkit.access.bed when create_cnvkit_acces was run)""" - annotate: str = "" - """[target] Optional targets annotations""" + target_avg_size: float | None = None + """[target] Average size of split target bins (None: use default value, or use autobin for wgs)""" - target_avg_size: int = 0 - """[target] Average size of split target bins (0: use default value)""" + split: bool = False + """[target] Split large intervals into smaller ones""" - bp_per_bin: int = 50000 + bp_per_bin: float = 50000 """[autobin] Expected base per bin""" - split: bool = True - """[target] Split large intervals into smaller ones""" - - antitarget_avg_size: int = 0 + antitarget_avg_size: float = 0 """[antitarget] Average size of antitarget bins (0: use default value)""" - min_size: int = 0 + min_size: float = 0 """[antitarget] Min size of antitarget bins (0: use default value)""" min_mapq: int = 0 @@ -97,7 +99,7 @@ class CnvKit(SnappyModel): min_cluster_size: int = 0 """[reference] Minimum cluster size to keep in reference profiles. 0 for no clustering""" - gender: str = "" + sample_sex: CnvkitSex | None = None """[reference] Specify the chromosomal sex of all given samples as male or female. Guess when missing""" male_reference: bool = False @@ -106,8 +108,8 @@ class CnvKit(SnappyModel): gc_correction: bool = True """[reference] Use GC correction""" - edge_correction: bool = True - """[reference] Use edge correction""" + edge_correction: bool | None = None + """[reference] Use edge correction (automatic when None, edge correction for WES only)""" rmask_correction: bool = True """[reference] Use rmask correction""" @@ -207,6 +209,6 @@ class PanelOfNormals(SnappyStepModel, validators.ToolsMixin): cnvkit: CnvKit | None = None - access: Access = Access() + access: Access | None = None purecn: PureCn | None = None diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index fa483d41f..5954500e9 100644 --- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -1,57 +1,29 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py access""" -from snakemake.shell import shell +import os +import sys -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. 
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) -config = snakemake.config["step_config"][snakemake.config["pipeline_step"]["name"]]["access"] +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper -exclude = " --exclude " + " -x ".join(config["exclude"]) if config["exclude"] else "" - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. -conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} - -set -x - -# ----------------------------------------------------------------------------- +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" +cmd = r""" cnvkit.py access \ -o {snakemake.output.access} \ - $(if [[ {config[min_gap_size]} -gt 0 ]]; then \ - echo --min-gap-size {config[min_gap_size]} - fi) \ + --min-gap-size {snakemake.params.min_gap_size} \ {exclude} \ - {snakemake.config[static_data_config][reference][path]} - -fn=$(basename "{snakemake.output.access}") -d=$(dirname "{snakemake.output.access}") -pushd $d -md5sum $fn > $fn.md5 -popd -""" + {snakemake.params.reference} +""".format( + snakemake=snakemake, + exclude=" ".join([f"--exclude {x}" for x in snakemake.params.exclude]) if snakemake.params.exclude else "", ) -# Compute MD5 sums of logs. -shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index e79639a3b..da3b440f0 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -1,68 +1,20 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py antitarget""" -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step]["cnvkit"] - -target = snakemake.input.get("target", "") - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. 
-conda list >{snakemake.log.conda_list}
-conda info >{snakemake.log.conda_info}
-md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
-md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
-
-set -x
-
-# -----------------------------------------------------------------------------
-
-if [[ -n "{config[path_target_regions]}" ]]
-then
-    cnvkit.py antitarget \
-        --output {snakemake.output.antitarget} \
-        $(if [[ -n "{config[access]}" ]]; then \
-            echo --access {config[access]}
-        fi) \
-        $(if [[ {config[antitarget_avg_size]} -gt 0 ]]; then \
-            echo --avg-size {config[antitarget_avg_size]}
-        fi) \
-        $(if [[ {config[min_size]} -gt 0 ]]; then \
-            echo --min-size {config[min_size]}
-        fi) \
-        {target}
-else
-    touch {snakemake.output.antitarget}
-fi
-
-fn=$(basename "{snakemake.output.antitarget}")
-d=$(dirname "{snakemake.output.antitarget}")
-pushd $d
-md5sum $fn > $fn.md5
-popd
-"""
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
+
+__author__ = "Eric Blanc"
+__email__ = "eric.blanc@bih-charite.de"
+
+cmd = r"""
+cnvkit.py antitarget \
+    -o {snakemake.output.antitarget} \
+    --avg-size {snakemake.params.avg_size} --min-size {snakemake.params.min_size} \
+    {access} \
+    {snakemake.input.target}
+""".format(
+    snakemake=snakemake,
+    access=f"--access {snakemake.params.access}" if "access" in snakemake.params else "",
)
 
-# Compute MD5 sums of logs.
-shell(
-    r"""
-md5sum {snakemake.log.log} >{snakemake.log.log_md5}
-"""
-)
+CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py
index b07d4536a..678a7e348 100644
--- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py
@@ -1,89 +1,20 @@
 # -*- coding: utf-8 -*-
 """Wrapper for cnvkit.py coverage"""
 
-from snakemake.shell import shell
-
-__author__ = "Manuel Holtgrewe"
-__email__ = "manuel.holtgrewe@bih-charite.de"
-
-step = snakemake.config["pipeline_step"]["name"]
-config = snakemake.config["step_config"][step]["cnvkit"]
-
-# During panel_of_normals step, the target regions are created by the target substep.
-# During somatic CNV calling (both exome & wgs), the target regions are obtained from the configuration
-if "target" in snakemake.input.keys():
-    target = snakemake.input.target
-elif "path_target" in config.keys():
-    target = config["path_target"]
-else:
-    raise Exception("Unsupported naming")
-
-# Same for antitarget regions
-if "antitarget" in snakemake.input.keys():
-    antitarget = snakemake.input.antitarget
-elif "path_antitarget" in config.keys():
-    antitarget = config["path_antitarget"]
-else:
-    raise Exception("Unsupported naming")
-
-shell(
-    r"""
-# Also pipe everything to log file
-if [[ -n "{snakemake.log.log}" ]]; then
-    if [[ "$(set +e; tty; set -e)" != "" ]]; then
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        exec &> >(tee -a "{snakemake.log.log}" >&2)
-    else
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        echo "No tty, logging disabled" >"{snakemake.log.log}"
-    fi
-fi
-
-# Write out information about conda installation. 
-conda list >{snakemake.log.conda_list}
-conda info >{snakemake.log.conda_info}
-md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
-md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
-
-set -x
-
-# Function definitions ---------------------------------------------------------
-
-coverage()
-{{
-    cnvkit.py coverage \
-        --fasta {snakemake.config[static_data_config][reference][path]} \
-        --min-mapq {config[min_mapq]} \
-        --processes {snakemake.threads} \
-        {snakemake.input.bam} \
-        --output $2 $1
-}}
-
-md5()
-{{
-    set -x
-
-    fn=$1
-    f=$(basename $fn)
-    d=$(dirname $fn)
-    pushd $d
-    md5sum $f > $f.md5
-    popd
-}}
-
-# -----------------------------------------------------------------------------
-
-coverage {target} {snakemake.output.target}
-md5 {snakemake.output.target}
-
-coverage {antitarget} {snakemake.output.antitarget}
-md5 {snakemake.output.antitarget}
-"""
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
+
+__author__ = "Eric Blanc"
+__email__ = "eric.blanc@bih-charite.de"
+
+cmd = r"""
+cnvkit.py coverage --processes {snakemake.params.processes} \
+    -o {snakemake.output.coverage} \
+    --fasta {snakemake.params.reference} \
+    --min-mapq {snakemake.params.min_mapq} {count} \
+    {snakemake.input.bam} {snakemake.input.intervals}
+""".format(
+    snakemake=snakemake,
+    count="--count" if snakemake.params.count else "",
+)
 
-# Compute MD5 sums of logs.
-shell(
-    r"""
-md5sum {snakemake.log.log} >{snakemake.log.log_md5}
-"""
-)
+CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py
index 57b8ee02e..4d53e9508 100644
--- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py
@@ -1,90 +1,32 @@
 # -*- coding: utf-8 -*-
 """Wrapper for cnvkit.py reference"""
 
-from snakemake.shell import shell
-
-__author__ = "Manuel Holtgrewe"
-__email__ = "manuel.holtgrewe@bih-charite.de"
-
-step = snakemake.config["pipeline_step"]["name"]
-config = snakemake.config["step_config"][step]["cnvkit"]
-
-# NOTE: snakemake.input.target and snakemake.input.antitarget contain
-# the output of target & antitarget substeps when there is no bam files
-# the bam files lists when the list of normals is not empty
-
-cluster = (
-    " --cluster --min-cluster-size {}".format(config["min_cluster_size"])
-    if config["min_cluster_size"] > 0
-    else ""
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
+
+__author__ = "Eric Blanc"
+__email__ = "eric.blanc@bih-charite.de"
+
+cmd = r"""
+cnvkit.py reference \
+    -o {snakemake.output.reference} \
+    --fasta {snakemake.params.reference} \
+    {cluster} {min_cluster_size} \
+    {sample_sex} {male_reference} {diploid_parx_genome} \
+    {no_gc} {no_edge} {no_rmask} \
+    {target} {antitarget} {normals}
+""".format(
+    snakemake=snakemake,
+    cluster="--cluster" if snakemake.params.cluster else "",
+    min_cluster_size=f"--min-cluster-size {snakemake.params.min_cluster_size}" if snakemake.params.cluster and "min_cluster_size" in snakemake.params else "",
+    no_gc="--no-gc" if snakemake.params.no_gc else "",
+    no_edge="--no-edge" if snakemake.params.no_edge else "",
+    no_rmask="--no-rmask" if snakemake.params.no_rmask else "",
+    sample_sex=f"--sample-sex {snakemake.params.sample_sex}" if "sample_sex" in snakemake.params else "",
+    male_reference="--male-reference" if snakemake.params.male_reference else "",
+    diploid_parx_genome=f"--diploid-parx-genome 
{snakemake.params.diploid_parx_genome}" if "diploid_parx_genome" in snakemake.params else "", + target=f"--target {snakemake.input.target}" if "target" in snakemake.input else "", + antitarget=f"--antitarget {snakemake.input.antitarget}" if "antitarget" in snakemake.input else "", + normals=" ".join(snakemake.input.normals) if "normals" in snakemake.input else "", ) -gender = " --gender {}".format(config["gender"]) if config["gender"] else "" -male = " --male-reference" if config["male_reference"] else "" -no_gc = " --no-gc" if not config["gc_correction"] else "" -no_edge = " --no-edge" if not config["edge_correction"] or not config["path_target_regions"] else "" -no_rmask = " --no-rmask" if not config["rmask_correction"] else "" - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. -conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} - -set -x -# ----------------------------------------------------------------------------- - -if [[ "{snakemake.params.args[flat]}" = "True" ]] -then - cnvkit.py reference \ - --output {snakemake.output.panel} \ - --fasta {snakemake.config[static_data_config][reference][path]} \ - {cluster} {gender} {male} {no_gc} {no_edge} {no_rmask} \ - --targets {snakemake.input.target} --antitargets {snakemake.input.antitarget} -else - cnvkit.py reference \ - --output {snakemake.output.panel} \ - --fasta {snakemake.config[static_data_config][reference][path]} \ - {cluster} {gender} {male} {no_gc} {no_edge} {no_rmask} \ - {snakemake.input.target} {snakemake.input.antitarget} -fi - -if [[ -n "{snakemake.input.logs}" ]] -then - tar -zcvf {snakemake.output.log} {snakemake.input.logs} -else - touch {snakemake.output.log} -fi - -fn=$(basename "{snakemake.output.panel}") -d=$(dirname "{snakemake.output.panel}") -pushd $d -md5sum $fn > $fn.md5 -popd -fn=$(basename "{snakemake.output.log}") -d=$(dirname "{snakemake.output.log}") -pushd $d -md5sum $fn > $fn.md5 -popd -""" -) - -# Compute MD5 sums of logs. -shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index acfc41ff6..457718684 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -1,108 +1,46 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py target""" -from snakemake.shell import shell +import os +import re +import sys -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. 
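+# (This wrapper file sits in snappy_wrappers/wrappers/cnvkit/target/, so four
+# ".." levels up is the repository root containing the snappy_wrappers package.)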
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+sys.path.insert(0, base_dir)
 
-step = snakemake.config["pipeline_step"]["name"]
-config = snakemake.config["step_config"][step]["cnvkit"]
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
 
-bams = " ".join(snakemake.input.get("bams", [""]))
+__author__ = "Eric Blanc"
+__email__ = "eric.blanc@bih-charite.de"
 
-shell(
-    r"""
-# Also pipe everything to log file
-if [[ -n "{snakemake.log.log}" ]]; then
-    if [[ "$(set +e; tty; set -e)" != "" ]]; then
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        exec &> >(tee -a "{snakemake.log.log}" >&2)
-    else
-        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
-        echo "No tty, logging disabled" >"{snakemake.log.log}"
-    fi
-fi
+# WGS: targets are all accessible regions, WES: targets are baits
+interval = snakemake.input.access if "access" in snakemake.input else snakemake.params.target
 
-# Write out information about conda installation.
-conda list >{snakemake.log.conda_list}
-conda info >{snakemake.log.conda_info}
-md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
-md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
+avg_size = None
+if "avg_size" in snakemake.input:
+    pattern = re.compile(r"^Target:[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)$")
+    with open(snakemake.input.avg_size) as f:
+        for line in f:
+            m = pattern.match(line)
+            if m:
+                avg_size = float(m.groups()[4])
+                break
 
-set -x
-
-# -----------------------------------------------------------------------------
-
-access()
-{{
-    cnvkit.py access \
-        -o $tmpdir/access.bed \
-        {snakemake.config[static_data_config][reference][path]}
-}}
-
-# -----------------------------------------------------------------------------
-
-target="{config[path_target_regions]}"
-target_avg_size={config[target_avg_size]}
-
-if [[ -z "$target" ]] && [[ $target_avg_size -eq 0 ]]
-then
-    tmpdir=$(mktemp -d)
-
-    if [[ -n "{bams}" ]]
-    then
-        access
-        cnvkit.py autobin --method wgs \
-            --fasta {snakemake.config[static_data_config][reference][path]} \
-            --access $tmpdir/access.bed \
-            --bp-per-bin {config[bp_per_bin]} \
-            --target-output-bed $tmpdir/target.bed --antitarget-output-bed $tmpdir/antitarget.bed \
-            {bams} > $tmpdir/autobin.txt
-        target_avg_size=$(cat $tmpdir/autobin.txt | grep "Target:" | cut -f 3)
-
-        if [[ -z "{config[access]}" ]]
-        then
-            target=$tmpdir/access.bed
-        else
-            target="{config[access]}"
-        fi
-    else
-        if [[ -z "{config[access]}" ]]
-        then
-            access
-            target=$tmpdir/access.bed
-        else
-            target="{config[access]}"
-        fi
-        target_avg_size=5000
-    fi
-fi
+elif "avg_size" in snakemake.params:
+    avg_size = snakemake.params.avg_size
 
+cmd = r"""
 cnvkit.py target \
-    --output {snakemake.output.target} \
-    $(if [[ -n "{config[annotate]}" ]]; then \
-        echo --short-names --annotate {config[annotate]}
-    fi) \
-    $(if [[ "{config[split]}" = "True" ]]; then \
-        echo --split
-    fi) \
-    $(if [[ $target_avg_size -gt 0 ]]; then \
-        echo --avg-size $target_avg_size
-    fi) \
-    $target
-
-fn=$(basename "{snakemake.output.target}")
-d=$(dirname "{snakemake.output.target}")
-pushd $d
-md5sum $fn > $fn.md5
-popd
-"""
+    -o {snakemake.output.target} \
+    {annotate} {avg_size} {split} \
+    {interval}
+""".format(
+    snakemake=snakemake,
+    interval=interval,
+    annotate=f"--short-names --annotate {snakemake.params.annotate}" if "annotate" in snakemake.params else "",
+    avg_size=f"--avg-size {avg_size}" if avg_size is not None else "",
+    split="--split" if snakemake.params.split else "",
 )
 
-# Compute MD5 sums of logs. 
-shell(
-    r"""
-md5sum {snakemake.log.log} >{snakemake.log.log_md5}
-"""
-)
+CnvkitWrapper(snakemake, cmd).run()
diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py
index 12cf27c90..c596a734b 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py
@@ -27,6 +27,8 @@ def minimal_config():
             path: /path/to/cosmic.vcf.gz
           dbsnp:
             path: /path/to/dbsnp.vcf.gz
+          features:
+            path: /path/to/annotations.gtf
 
         step_config:
           ngs_mapping:
@@ -42,13 +44,15 @@ def minimal_config():
               germline_resource: /path/to/germline_resource.vcf
               path_normals_list: ""
             cnvkit:
-              path_target_regions: /path/to/regions.bed  # WES mode
+              path_target_regions: ""
               path_normals_list: ""
             purecn:
               path_normals_list: ""
               path_bait_regions: /path/to/baits/regions.bed
               path_genomicsDB: /path/to/mutect2/genomicsDB
               genome_name: "unknown"
+            access:
+              exclude: [/path/to/exclude.bed]
 
         data_sets:
           first_batch:
@@ -198,6 +202,34 @@ def test_mutect2_step_part_get_resource_usage(panel_of_normals_workflow):
 # Tests for CnvkitStepPart ------------------------------------------------------------------------
 
 
+def test_cnvkit_step_part_get_input_files_access(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_access()"""
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
+        }
+    )
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "access")(wildcards)
+    assert actual == {}
+
+
+def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_input_files_autobin()"""
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
+        }
+    )
+    expected = {
+        "bams": [
+            "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam",
+            "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam",
+        ],
+        "access": "work/bwa.cnvkit/out/cnvkit.access.bed",
+    }
+    actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards)
+    assert actual == expected
+
+
 def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow):
     """Tests CnvkitStepPart._get_input_files_target()"""
     wildcards = Wildcards(
@@ -205,8 +237,12 @@ def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow):
             "mapper": "bwa",
         }
     )
+    expected = {
+        "access": "work/bwa.cnvkit/out/cnvkit.access.bed",
+        "avg_size": "work/bwa.cnvkit/out/cnvkit.autobin.txt",
+    }
     actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards)
-    assert actual == {}
+    assert actual == expected
 
 
 def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow):
@@ -216,11 +252,8 @@ def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow):
             "mapper": "bwa",
         }
     )
-    expected = {
-        "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed",
-    }
     actual = panel_of_normals_workflow.get_input_files("cnvkit", "antitarget")(wildcards)
-    assert actual == expected
+    assert actual == {}
 
 
 def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow):
@@ -229,13 +262,13 @@ def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow):
         fromdict={
             "mapper": "bwa",
             "normal_library": "P001-N1-DNA1-WGS1",
+            "interval": "target",
         }
     )
     expected = {
         "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam",
         "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai",
-        "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed",
-        "antitarget": 
"work/bwa.cnvkit/out/bwa.cnvkit.antitarget.bed", + "intervals": "work/bwa.cnvkit/out/cnvkit.target.bed", } actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) assert actual == expected @@ -249,53 +282,92 @@ def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow } ) expected = { - "target": [ + "references": [ "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", ], - "antitarget": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", - ], - "logs": [ - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.log", - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.conda_list.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.conda_info.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.log", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.conda_list.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.conda_info.txt", - ], } actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_input_files_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_input_files_report()""" +def test_cnvkit_step_part_get_params_access(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_access()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) - expected = { - "target": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", - ], - "antitarget": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", - ], - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "report")(wildcards) + expected = {"reference": "/path/to/ref.fa"} + actual = panel_of_normals_workflow.get_params("cnvkit", "access")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_params_autobin(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_autobin()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = {"method": "wgs"} + actual = panel_of_normals_workflow.get_params("cnvkit", "autobin")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_params_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_target()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = {"annotate": "/path/to/annotations.gtf"} + actual = panel_of_normals_workflow.get_params("cnvkit", "target")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_params_antitarget(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_antitarget()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = {"avg_size": 0, "min_size": 0} + actual = panel_of_normals_workflow.get_params("cnvkit", "antitarget")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_params_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_coverage()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = {"reference": "/path/to/ref.fa", "min_mapq": 0} + actual = panel_of_normals_workflow.get_params("cnvkit", 
"coverage")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_params_create_panel(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_params_create_panel()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = {"reference": "/path/to/ref.fa", "no_edge": True} + actual = panel_of_normals_workflow.get_params("cnvkit", "create_panel")(wildcards) assert actual == expected def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_target()""" expected = { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed", - "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5", + "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed", + "target_md5": "work/{mapper}.cnvkit/out/cnvkit.target.bed.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "target") assert actual == expected @@ -304,8 +376,8 @@ def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_antitarget()""" expected = { - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed", - "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5", + "antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed", + "antitarget_md5": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "antitarget") assert actual == expected @@ -314,10 +386,8 @@ def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow) def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_coverage()""" expected = { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", - "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", - "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", + "coverage": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn", + "coverage_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "coverage") assert actual == expected @@ -328,45 +398,37 @@ def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflo expected = { "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", - "log": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz", - "log_md5": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "create_panel") assert actual == expected -def test_cnvkit_step_part_get_output_files_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_report()""" - expected = { - "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", - "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5", - "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv", - "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", - } - actual = 
panel_of_normals_workflow.get_output_files("cnvkit", "report") - assert actual == expected - - def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_target()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.target" + base_name_out = "work/{mapper}.cnvkit/log/cnvkit.target" expected = get_expected_log_files_dict(base_out=base_name_out) + expected["sh"] = "work/{mapper}.cnvkit/log/cnvkit.target.sh" + expected["sh_md5"] = expected["sh"] + ".md5" actual = panel_of_normals_workflow.get_log_file("cnvkit", "target") assert actual == expected def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_antitarget()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget" + base_name_out = "work/{mapper}.cnvkit/log/cnvkit.antitarget" expected = get_expected_log_files_dict(base_out=base_name_out) + expected["sh"] = "work/{mapper}.cnvkit/log/cnvkit.antitarget.sh" + expected["sh_md5"] = expected["sh"] + ".md5" actual = panel_of_normals_workflow.get_log_file("cnvkit", "antitarget") assert actual == expected def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_coverage()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage" expected = get_expected_log_files_dict(base_out=base_name_out) + expected["sh"] = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage.sh" + expected["sh_md5"] = expected["sh"] + ".md5" actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage") assert actual == expected @@ -375,18 +437,12 @@ def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_create_panel()""" base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" expected = get_expected_log_files_dict(base_out=base_name_out) + expected["sh"] = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.sh" + expected["sh_md5"] = expected["sh"] + ".md5" actual = panel_of_normals_workflow.get_log_file("cnvkit", "create_panel") assert actual == expected -def test_cnvkit_step_part_get_log_file_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_report()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "report") - assert actual == expected - - def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): """Tests CvnkitStepPart.get_resource_usage()""" # Define expected: default defined workflow.abstract @@ -414,12 +470,6 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): "memory": "16G", "partition": "medium", } - report_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } # Evaluate action `target` for resource, expected in target_expected_dict.items(): @@ -445,12 +495,6 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource)() assert actual == expected, msg_error - # Evaluate action `report` - for resource, expected in report_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 
'report'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource)() - assert actual == expected, msg_error - # Tests for AccessStepPart ------------------------------------------------------------------------- @@ -460,11 +504,22 @@ def test_access_step_part_get_input_files_run(panel_of_normals_workflow): assert panel_of_normals_workflow.get_input_files("access", "run") is None +def test_access_step_part_get_params_run(panel_of_normals_workflow): + """Tests AccessStepPart._get_params_run()""" + expected = { + "reference": "/path/to/ref.fa", + "exclude": ["/path/to/exclude.bed"], + "min_gap_size": 0 + } + actual = panel_of_normals_workflow.get_params("access", "run") + assert actual == expected + + def test_access_step_part_get_output_files_run(panel_of_normals_workflow): """Tests AccessStepPart._get_output_files_run()""" expected = { - "access": "work/cnvkit.access/out/cnvkit.access.bed", - "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", + "access": "work/access/out/access.bed", + "access_md5": "work/access/out/access.bed.md5", } actual = panel_of_normals_workflow.get_output_files("access", "run") assert actual == expected @@ -472,7 +527,9 @@ def test_access_step_part_get_output_files_run(panel_of_normals_workflow): def test_access_step_part_get_log_file_run(panel_of_normals_workflow): """Tests AccessStepPart._get_log_file_run()""" - expected = get_expected_log_files_dict(base_out="work/cnvkit.access/log/cnvkit.access") + expected = get_expected_log_files_dict(base_out="work/access/log/access") + expected["sh"] = "work/access/log/access.sh" + expected["sh_md5"] = expected["sh"] + ".md5" actual = panel_of_normals_workflow.get_log_file("access", "run") assert actual == expected @@ -657,39 +714,33 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): expected += [ tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) ] - tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{substep}.{ext}" + tpl = "output/{mapper}.cnvkit/out/cnvkit.{substep}.{ext}" for substep in ("target", "antitarget"): expected += [ tpl.format(substep=substep, mapper=mapper, ext=ext) for ext in ("bed", "bed.md5") for mapper in ("bwa",) ] - tpl = "output/{mapper}.cnvkit/report/{mapper}.cnvkit.{substep}.{ext}" - for substep in ("sex", "metrics"): - expected += [ - tpl.format(substep=substep, mapper=mapper, ext=ext) - for ext in ("tsv", "tsv.md5") - for mapper in ("bwa",) - ] # add log files - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" - for substep in ("target", "antitarget", "panel_of_normals", "report"): + tpl = "output/{mapper}.cnvkit/log/cnvkit.{substep}" + for substep in ("target", "antitarget"): for mapper in ("bwa",): - expected += get_expected_log_files_dict( - base_out=tpl.format(mapper=mapper, substep=substep) - ).values() - # add merged log - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{chksum}" + base_out = tpl.format(mapper=mapper, substep=substep) + expected += get_expected_log_files_dict(base_out=base_out).values() + expected += [base_out + ".sh", base_out + ".sh.md5"] + tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" for mapper in ("bwa",): - for chksum in ("", ".md5"): - expected += [tpl.format(mapper=mapper, chksum=chksum)] + base_out = tpl.format(mapper=mapper, substep=substep) + expected += get_expected_log_files_dict(base_out=base_out).values() + expected += [base_out + ".sh", base_out + ".sh.md5"] # Access - tpl = "output/cnvkit.access/out/cnvkit.access.{ext}" + tpl = 
"output/access/out/access.{ext}" expected += [tpl.format(ext=ext) for ext in ("bed", "bed.md5")] expected += get_expected_log_files_dict( - base_out="output/cnvkit.access/log/cnvkit.access" + base_out="output/access/log/access" ).values() + expected += ["output/access/log/access.sh", "output/access/log/access.sh.md5"] # PureCN tpl = "output/{mapper}.purecn/out/{mapper}.purecn.panel_of_normals.rds{chksum}" From 53df41a9331bed5cf4523f50c4d554f7f776ece3 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 4 Nov 2024 12:22:51 +0100 Subject: [PATCH 29/46] feat: CNV calling step (WIP) --- .../workflows/somatic_cnv_calling/Snakefile | 61 + .../workflows/somatic_cnv_calling/__init__.py | 1005 +++++++++++++++++ .../somatic_cnv_calling/cnvkit.rules | 279 +++++ .../workflows/somatic_cnv_calling/model.py | 508 +++++++++ 4 files changed, 1853 insertions(+) create mode 100644 snappy_pipeline/workflows/somatic_cnv_calling/Snakefile create mode 100644 snappy_pipeline/workflows/somatic_cnv_calling/__init__.py create mode 100644 snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules create mode 100644 snappy_pipeline/workflows/somatic_cnv_calling/model.py diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/Snakefile b/snappy_pipeline/workflows/somatic_cnv_calling/Snakefile new file mode 100644 index 000000000..23e608e34 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_cnv_calling/Snakefile @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +"""CUBI Pipeline somatic_cnv_calling step Snakefile""" + +import os + +from snappy_pipeline import expand_ref +from snappy_pipeline.workflows.somatic_cnv_calling import ( + SomaticCnvCallingWorkflow, +) + +__author__ = "Eric Blanc " + + +# Configuration =============================================================== + + +configfile: "config.yaml" + + +# Expand "$ref" JSON pointers in configuration (also works for YAML) +config, lookup_paths, config_paths = expand_ref("config.yaml", config) + +# WorkflowImpl Object Setup =================================================== + +wf = SomaticCnvCallingWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) + +# Rules ======================================================================= + + +localrules: + # Linking files from work/ to output/ should be done locally + somatic_cnv_calling_link_out_run, + + +rule all: + input: + wf.get_result_files(), + + +# House-Keeping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Generic linking out --------------------------------------------------------- + + +rule somatic_cnv_calling_link_out_run: + input: + wf.get_input_files("link_out", "run"), + output: + wf.get_output_files("link_out", "run"), + run: + shell(wf.get_shell_cmd("link_out", "run", wildcards)) + + +# Somatic Targeted Sequencing CNV Calling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# cnvkit pipeline ------------------------------------------------------------- + + +# cnvkit requires a large number of rules, thus externalized +include: "cnvkit.rules" + diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py new file mode 100644 index 000000000..70bd2c447 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py @@ -0,0 +1,1005 @@ +# -*- coding: utf-8 -*- +"""Implementation of the ``somatic_cnv_calling`` step + +This step allows for the detection of CNV events for cancer samples from targeted sequenced (e.g., +exomes or large panels) or whole genome sequencing. 
+The wrapped tools start from the aligned reads (thus off ``ngs_mapping``) and generate somatic CNV calls.
+
+The wrapped tools implement different strategies. Some work "reference free" and just use the
+somatic BAM files for their input, some work in "matched cancer normal mode" and need the cancer
+and normal BAM files, others again use cancer BAM files, and additionally a
+set of non-cancer BAM files for their background (the panel of normals).
+
+Some tools may also use germline & somatic variants to estimate allele-specific copy number changes,
+and resolve loss-of-heterozygosity. In this case, the small variants need to be computed separately from the ``somatic_variant_calling`` step.
+
+Finally, some tools can use external estimation of tumor purity and ploidy.
+This estimation can either be provided in the sample sheet, or computed from the sequencing data by dedicated tools.
+
+==========
+Step Input
+==========
+
+Somatic CNV calling starts off the aligned reads, i.e.,
+``ngs_mapping``.
+
+Tools that use panel of normals can obtain their input in two different ways:
+
+- A static file, from another cohort or from public datasets.
+  In this case, the user is responsible for making sure that the data & methods used to create the panel are compatible with the cohort's.
+- The ``panel_of_normals`` step.
+  The panel will be created if necessary, using the same conditions as for the cohort (genome release, exome kit assignment, ...)
+
+When requested, the optional germline and somatic small variant calls are created using a modified version of the ``somatic_variant_calling`` step.
+The ``somatic_cnv_calling`` step generates the small variants (TODO: how exactly) and stores them (TODO: where exactly).
+
+Likewise, purity estimations can be automatically computed by the ``somatic_cnv_calling`` step,
+to supplement or replace the estimations that may be provided in the sample sheet.
+
+===========
+Step Output
+===========
+
+TODO: The whole section of output needs revision. Main question is: what is the best format to encode CNAs?
+
+There is no widely used standard to report copy number alterations.
+In the absence of a better solution, all CNV tools implemented in the somatic pipeline output the segmentation table loosely following the `DNAcopy format `_.
+The copy number call may or may not be present, and the chromosome number is replaced by its name.
+The segmentation output is in file ``output/<mapper>.<caller>.<library>/out/<mapper>.<caller>.<library>_dnacopy.seg``.
+
+::
+
+    output/
+    +-- bwa.cnvkit.P001-N1-DNA1-WES1
+    |   |-- out
+    |   |   |-- bwa.cnvkit.P001-N1-DNA1-WES1_dnacopy.seg
+    [...]
+
+Note that tool ``cnvetti`` doesn't follow the snappy convention above:
+the tool name is followed by an underscore & the action, where the action is one of ``coverage``, ``segment`` and ``postprocess``.
+For example, the output directory would contain a directory named ``bwa.cnvetti_coverage.P002-T1-DNA1-WES1``.
+
+.. note:: Tool-Specific Output
+
+    Each tool produces its own set of outputs, generally not in standard format.
+    Some of these files are linked from ``work`` to ``output``, but not necessarily all of them.
+    Some tools (for example ``cnvkit``) also produce a report, with tables and figures.
+
+
+=====================
+Default Configuration
+=====================
+
+The default configuration is as follows.
+
+.. 
include:: DEFAULT_CONFIG_somatic_cnv_calling.rst
+
+=============================
+Available Somatic CNV Callers
+=============================
+
+- ``cnvkit`` (for both WGS & WES)
+- ``sequenza`` (only WES)
+- ``purecn`` (only WES)
+- ``Control-FREEC`` (only WGS - this tool might not be supported)
+
+================================
+Logic of the step for ``cnvkit``
+================================
+
+--------
+Overview
+--------
+
+``cnvkit`` was designed to call CNVs on whole exome data. It has the concept of _targets_ (the regions enriched by the exome kit),
+and the _antitargets_ (those regions outside of enrichment).
+The coverage of _targets_ and _antitargets_ is expected to be very different,
+but there is still information to be gained in the _antitarget_ regions,
+albeit at a much lower resolution than for _target_ regions.
+
+``cnvkit`` was later used with some success on whole genome data.
+WGS data was defined as _target_ regions covering the whole genome, with empty _antitarget_ regions.
+
+------------------------
+Sample-independent files
+------------------------
+
+``cnvkit`` allows the user to define _accessible_ regions (_via_ the ``access`` bed file).
+This excludes repeats, low-complexity or PAR regions, which cannot be properly mapped and therefore cannot be used for CNV calling.
+
+For exome data, the _target_ regions are supposed to be well curated, so they are not affected by the _access_ regions.
+The _antitarget_ regions, however, are only defined within _accessible_ regions.
+For WGS data, the _antitarget_ regions are empty, and the _target_ regions are set to the _accessible_ regions, when present.
+Even in the absence of user-defined _accessible_ regions, the _target_ and _antitarget_ regions will not contain long ``N`` sequences.
+
+Finally, the pipeline builds separate ``bed`` files for _target_ and _antitarget_ regions, for each exome kit present in the cohort,
+and for WGS data if there is any.
+
+---------
+Reference
+---------
+
+The ``cnvkit`` authors recommend using a panel of normals to normalize the coverage over bins.
+This is usually created by running the ``panel_of_normals`` step.
+The ``somatic_cnv_calling`` step will create a reference (panel of normals) if requested.
+Otherwise, it is possible to use references created for different cohorts, but the user
+must ensure that the data & methods used for the current cohort are compatible with those used to create the reference.
+In particular, the exome enrichment kit must be identical, and the sex of the donors should be
+similar (a female-only reference should not be used for a male cohort, for example).
+
+If there are not enough normal samples to create such a reference, the corresponding normal sample
+can be used, in a normal/tumor pair setting similar to the somatic small variant calling situation.
+
+In case no normals are available at all, a flat prior can be used.
+
+------------
+Calling CNVs
+------------
+
+The _target_ and _antitarget_ ``bed`` files created in the earlier sub-steps are used as input,
+based on the exome kit (or WGS status).
+
+The coverage is computed for the tumor sample, and normalized using the reference.
+As seen previously, the reference can be either exome kit-based, or sample-specific.
+
+The normalized coverage is then segmented, and copy numbers are called, optionally using
+small variants and/or purity estimates.
+
+If B-allele fractions are used, the pipeline will create the small variants, but only for samples
+with a corresponding normal. 
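+
+As an illustration, the coverage/fix/segment/call cascade for one tumor library corresponds
+roughly to the commands below (a sketch only, with hypothetical file names; the wrappers
+assemble the exact flags and paths from the configuration)::
+
+    cnvkit.py coverage -o tumor.targetcoverage.cnn --fasta ref.fa tumor.bam target.bed
+    cnvkit.py coverage -o tumor.antitargetcoverage.cnn --fasta ref.fa tumor.bam antitarget.bed
+    cnvkit.py fix -o tumor.cnr tumor.targetcoverage.cnn tumor.antitargetcoverage.cnn panel_of_normals.cnn
+    cnvkit.py segment -o tumor.cns tumor.cnr
+    cnvkit.py call -o tumor.calls.cns tumor.cns
+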
+If purity is used, the user can choose to override the values in the sample sheet (when present) +with the output of the tool of her choice. +""" + +import os +import os.path +import re +import typing + +from biomedsheets.models import BioEntity, BioSample, TestSample, NGSLibrary +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background +from snakemake.io import OutputFiles, Wildcards + +from snappy_pipeline.utils import dictify +from snappy_pipeline.workflows.abstract import ( + BaseStep, + BaseStepPart, + LinkOutStepPart, + ResourceUsage, +) +from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow + +from .model import SomaticCnvCalling as SomaticCnvCallingConfigModel +from .model import Sex, LibraryKitDefinition, PanelOfNormalsOrigin + +__author__ = "Eric Blanc " + +#: Default configuration for the somatic_targeted_seq_cnv_calling step +DEFAULT_CONFIG = SomaticCnvCallingConfigModel.default_config_yaml_string() + +#: JSON key for "isCancer" +KEY_IS_CANCER = "isCancer" + +#: Value for "libraryType" is whole exome sequencing +VALUE_WES = "WES" + +#: Value for "libraryType" is panel sequencing +VALUE_PANEL = "Panel-seq" + +#: Values for targeted sequencing +VALUES_TARGETED_SEQ = (VALUE_WES, VALUE_PANEL) + +#: Standard key/extension values for BCF files +BCF_KEY_EXTS = ( + ("bcf", ".bcf"), + ("bcf_md5", ".bcf.md5"), + ("bcf_csi", ".bcf.csi"), + ("bcf_csi_md5", ".bcf.csi.md5"), +) + + +class SomaticCnvCallingStepPart(BaseStepPart): + """Shared code for all caller classes in somatic_targeted_seq_cnv_calling""" + + def __init__(self, parent: "SomaticCnvCallingWorkflow"): + super().__init__(parent) + + def _get_sample_sex(self, library_name: str) -> Sex: + if self.config.sex == Sex.MALE or self.config.sex == Sex.FEMALE: + sample_sex = self.config.sex + elif self.config.sex == Sex.SAMPLESHEET and library_name in self.parent.sex: + sample_sex = self.parent.sex[library_name] + else: + sample_sex = Sex.UNKNOWN + return sample_sex + + @staticmethod + @dictify + def _get_log_file_from_prefix(prefix: str) -> typing.Iterator[typing.Dict[str, str]]: + key_ext = ( + ("log", ".log"), + ("sh", ".sh"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + +class CnvKitStepPart(SomaticCnvCallingStepPart): + """Perform somatic targeted CNV calling using cnvkit""" + + #: Step name + name = "cnvkit" + + #: Class available actions + actions = ( + "access", + "target", + "antitarget", + "coverage", + "reference", + "flat_reference_panel", + "flat_reference_wgs", + "fix", + "segment", + "call", + "bintest", + "plot/diagram", + "plot/scatter", + "report/metrics", + "report/segmetrics", + ) + + # Overwrite defaults + default_resource_usage = ResourceUsage(threads=1, time="03:59:59", memory="7680M") # 4h + + def __init__(self, parent: SomaticCnvCallingStepPart): + super().__init__(parent) + + def get_input_files(self, action: str) -> typing.Callable: + """Return input paths input function, dependent on rule""" + # Validate action + self._validate_action(action) + return getattr(self, "_get_input_files_{}".format(action.replace("/", "_"))) + + def get_params(self, action: str) -> typing.Callable: + """Return parameters input function, dependent on rule""" + # Validate action + self._validate_action(action) + return getattr(self, "_get_params_{}".format(action.replace("/", "_"))) + + def get_output_files(self, action: str) -> typing.Callable: + 
"""Return input paths input function, dependent on rule""" + # Validate action + self._validate_action(action) + f = getattr(self, "_get_output_files_{}".format(action.replace("/", "_"))) + return f() + + def get_log_file(self, action: str) -> typing.Dict[str, str]: + """Return log files, dependent on rule""" + # Validate action + self._validate_action(action) + base_name = os.path.join("work", f"{{mapper}}.{self.name}.{{library_name}}", "log") + # Access, target & antitarget steps are cohort-wide, the others are library-dependent + if action in ("access",): + prefix = f"work/{self.name}/log/{action}" + elif action in ("target", "antitarget"): + prefix = f"work/{self.name}/log/{action}" + ".{panel_name}" + elif action in ("coverage",): + prefix = os.path.join(base_name, action + ".{region}") + elif action in ( + "reference", + "fix", + "segment", + "call", + "bintest", + "report/metrics", + "report/segmetrics", + ): + prefix = os.path.join(base_name, action.replace("/", "_")) + elif action in ("plot/diagram", "plot/scatter"): + prefix = os.path.join(base_name, action.replace("/", "_") + ".{contig_name}") + elif action == "flat_reference_panel": + prefix = f"work/{{mapper}}.{self.name}/log/reference.{{panel_name}}" + elif action == "flat_reference_wgs": + prefix = f"work/{{mapper}}.{self.name}/log/reference" + return SomaticCnvCallingStepPart._get_log_file_from_prefix(prefix) + + def get_result_files(self, library_name: str, mapper: str) -> typing.List[str]: + """Files to symlink to output""" + base_name = f"{mapper}.{self.name}.{library_name}" + result_files = [] + # Tumor samples + if library_name in self.parent.normal_library: + # Main results + prefix = os.path.join("output", base_name, "out", base_name) + for suffix in ("cnr", "segments.cns", "cns", "bintest.cnr"): + result_files.append(prefix + "." + suffix) + # Log files + prefix = os.path.join("output", base_name, "log") + for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): + result_files.append(os.path.join(prefix, f"coverage.target.{ext}")) + result_files.append(os.path.join(prefix, f"coverage.antitarget.{ext}")) + for suffix in ("fix", "segment", "call", "bintest"): + for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): + result_files.append(prefix + "/" + suffix + "." + ext) + # Log of reference is no panel of normals + # if not self.config[self.name]["panel_of_normals"]["enabled"]: + # normal_library = self.parent.normal_library[library_name] + # prefix = os.path.join("output", f"{mapper}.{self.name}.{normal_library}", "log", f"{mapper}.{self.name}.{normal_library}.reference") + # for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): + # result_files.append(prefix + "." + ext) + # Reports + if "reports" in self.config[self.name]: + prefix = os.path.join("output", base_name, "report", base_name) + for report in ("metrics", "segmetrics"): + if report in self.config[self.name]["reports"]: + result_files.append(prefix + "." + report + ".tsv") + # Plots (per chromosome) + if "plots" in self.config[self.name]: + prefix = os.path.join("output", base_name, "plot") + for plot in ("diagram", "scatter"): + if plot in self.config[self.name]["plots"]: + for contig in self.parent.contigs: + result_files.append(os.path.join(prefix, plot, contig + ".png")) + # else: # Normal samples + # prefix = os.path.join("output", base_name, "log", "reference") + # for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): + # result_files.append(prefix + "." 
+ ext)
+        return result_files
+
+    # ----- Access --------------------------------------------------------------------------------
+
+    def _get_input_files_access(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        # ``cnvkit.py access`` takes no input files, the reference is passed as a parameter
+        return None
+
+    def _get_params_access(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        params = {"reference": self.w_config.static_data_config.reference.path}
+        params["min_gap_size"] = self.config[self.name]["access"]["min_gap_size"]
+        access = self.config[self.name]["access"].get("exclude", None)
+        if access:
+            params["access"] = access
+        return params
+
+    def _get_output_files_access(self) -> typing.Dict[str, str]:
+        return {"access": f"work/{self.name}/out/access.bed"}
+
+    # ----- Target --------------------------------------------------------------------------------
+
+    def _get_input_files_target(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        for panel in self.config.path_target_interval_list_mapping:
+            if panel.name == wildcards.panel_name:
+                return {"region": panel.path}
+
+    def _get_params_target(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return {
+            "split": self.config[self.name]["target"]["split"],
+            "avg_size": self.config[self.name]["target"]["avg_size"],
+        }
+
+    def _get_output_files_target(self) -> typing.Dict[str, str]:
+        return {"region": f"work/{self.name}/out/{{panel_name}}_target.bed"}
+
+    # ----- Antitarget ----------------------------------------------------------------------------
+
+    def _get_input_files_antitarget(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        # No antitarget for WGS
+        return {
+            "target": f"work/{self.name}/out/{wildcards.panel_name}_target.bed",
+            "access": f"work/{self.name}/out/access.bed",
+        }
+
+    def _get_params_antitarget(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return {
+            "avg_size": self.config[self.name]["antitarget"]["avg_size"],
+            "min_size": self.config[self.name]["antitarget"]["min_size"],
+        }
+
+    def _get_output_files_antitarget(self) -> typing.Dict[str, str]:
+        return {"region": f"work/{self.name}/out/{{panel_name}}_antitarget.bed"}
+
+    # ----- Coverage ------------------------------------------------------------------------------
+
+    def _get_input_files_coverage(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        # BAM/BAI file
+        ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
+        base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards)
+        input_files = {
+            "bam": ngs_mapping(base_path + ".bam"),
+            "bai": ngs_mapping(base_path + ".bam.bai"),
+        }
+
+        # Region (target or antitarget) file
+        panel = self.parent.libraryKit.get(wildcards.library_name, None)
+        if panel is None:
+            input_files["region"] = f"work/{self.name}/out/access.bed"
+        else:
+            input_files["region"] = f"work/{self.name}/out/{panel.name}_{wildcards.region}.bed"
+        return input_files
+
+    def _get_params_coverage(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return {
+            "fasta": self.w_config.static_data_config.reference.path,
+            "count": self.config[self.name]["coverage"]["count"],
+            "min_mapq": self.config[self.name]["coverage"]["min_mapq"],
+            "processes": self.default_resource_usage.threads,
+        }
+
+    def _get_output_files_coverage(self) -> typing.Dict[str, str]:
+        return {"coverage": f"work/{{mapper}}.{self.name}.{{library_name}}/out/{{region}}.cnn"}
+
+    # ----- Reference -----------------------------------------------------------------------------
+
+    def _get_input_files_reference(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        """Builds reference from the paired normal, or flat prior in absence of normal"""
+        input_files = {}
+        normal_library = self.parent.normal_library.get(wildcards.library_name, None)
+        input_files["normals"] = [
+            f"work/{wildcards.mapper}.{self.name}.{normal_library}/out/target.cnn",
+            f"work/{wildcards.mapper}.{self.name}.{normal_library}/out/antitarget.cnn",
+        ]
+        return input_files
+
+    def _get_params_reference(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        params = {
+            "fasta": self.w_config.static_data_config.reference.path,
+            "cluster": self.config[self.name]["reference"]["cluster"],
+            "min_cluster_size": self.config[self.name]["reference"]["min_cluster_size"],
+            "male_reference": self.config[self.name]["use_male_reference"],
+            "no_gc": self.config[self.name]["reference"]["no_gc"],
+            "no_edge": self.config[self.name]["reference"]["no_edge"],
+            "no_rmask": self.config[self.name]["reference"]["no_rmask"],
+        }
+        sample_sex = self._get_sample_sex(wildcards.library_name)
+        if sample_sex == Sex.MALE or sample_sex == Sex.FEMALE:
+            params["sample_sex"] = str(sample_sex)
+        return params
+
+    def _get_output_files_reference(self) -> typing.Dict[str, str]:
+        """TODO: flat prior reference should be library-independent"""
+        return {"reference": f"work/{{mapper}}.{self.name}.{{library_name}}/out/reference.cnn"}
+
+    def _get_input_files_flat_reference_panel(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        """Input files to build a flat (prior-only) reference, in the absence of normals"""
+        input_files = {}
+        panel = self.parent.libraryKit.get(wildcards.library_name, None)
+        if panel is None:  # WGS, target is access, no antitarget
+            input_files["target"] = f"work/{self.name}/out/access.bed"
+        else:  # WES, both target & antitarget
+            input_files["target"] = f"work/{self.name}/out/{panel.name}_target.bed"
+            input_files["antitarget"] = f"work/{self.name}/out/{panel.name}_antitarget.bed"
+        return input_files
+
+    def _get_params_flat_reference_panel(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return self._get_params_reference(wildcards)
+
+    def _get_output_files_flat_reference_panel(self) -> typing.Dict[str, str]:
+        """TODO: flat prior reference should be library-independent"""
+        return {"reference": f"work/{{mapper}}.{self.name}/out/reference.{{panel_name}}.cnn"}
+
+    def _get_input_files_flat_reference_wgs(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return self._get_input_files_flat_reference_panel(wildcards)
+
+    def _get_params_flat_reference_wgs(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        return self._get_params_reference(wildcards)
+
+    def _get_output_files_flat_reference_wgs(self) -> typing.Dict[str, str]:
+        """TODO: flat prior reference should be library-independent"""
+        return {"reference": f"work/{{mapper}}.{self.name}/out/reference.cnn"}
+
+    # ----- Fix -----------------------------------------------------------------------------------
+
+    def _get_input_files_fix(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+        # Coverage on targets
+        input_files = {
+            "target": f"work/{wildcards.mapper}.{self.name}.{wildcards.library_name}/out/target.cnn"
+        }
+        # Coverage on antitargets when present (absent for WGS)
+        panel = self.parent.libraryKit.get(wildcards.library_name, None)
+        if panel is not None:  # WES, antitarget coverage is present
+            input_files["antitarget"] = (
f"work/{wildcards.mapper}.{self.name}.{wildcards.library_name}/out/antitarget.cnn" + ) + # Get reference from panel of normals if available, otherwise from normal or flat when no normal + if not self.config[self.name]["panel_of_normals"]["enabled"]: # Paired normal or flat + normal_library = self.parent.normal_library.get(wildcards.library_name, None) + if normal_library: + input_files["reference"] = ( + f"work/{{mapper}}.{self.name}.{normal_library}/out/reference.cnn" + ) + else: + if panel: + input_files["reference"] = ( + f"work/{{mapper}}.{self.name}/out/reference.{panel.name}.cnn" + ) + else: + input_files["reference"] = f"work/{{mapper}}.{self.name}/out/reference.cnn" + elif ( + self.config[self.name]["panel_of_normals"]["origin"] + == PanelOfNormalsOrigin.PREVIOUS_STEP + ): # Panel_of_normals step + input_files["reference"] = self.parent._get_panel_of_normals_path(self.name, panel) + else: + input_files["reference"] = self.config[self.name]["panel_of_normals"][ + "path_panel_of_normals" + ] + return input_files + + def _get_params_fix(self, wildcards: Wildcards) -> typing.Dict[str, str]: + return { + "sample_id": wildcards.library_name, + "cluster": self.config[self.name]["fix"]["cluster"], + "no_gc": self.config[self.name]["fix"]["no_gc"], + "no_edge": self.config[self.name]["fix"]["no_edge"], + "no_rmask": self.config[self.name]["fix"]["no_rmask"], + } + + def _get_output_files_fix(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"coverage": os.path.join("work", base_name, "out", base_name + ".cnr")} + + # ----- Segment ------------------------------------------------------------------------------- + + def _get_input_files_segment(self, wildcards: Wildcards) -> typing.Dict[str, str]: + # Coverage + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + input_files = {"coverage": f"work/{base_name}/out/{base_name}.cnr"} + # Segmentation using SNVs if requested and available (normal must be present) + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + return input_files + + def _get_params_segment(self, wildcards: Wildcards) -> typing.Dict[str, str]: + # Segmentation parameters + params = { + "method": self.config[self.name]["segment"]["method"], + "threshold": self.config[self.name]["segment"]["threshold"], + "drop_low_coverage": self.config[self.name]["segment"]["drop_low_coverage"], + "drop_outliers": self.config[self.name]["segment"]["drop_outliers"], + } + if self.config[self.name]["segment"]["method"] == "cbs": + params["smooth_cbs"] = self.config[self.name]["segment"]["smooth_cbs"] + params["processes"] = self.default_resource_usage.threads + # Normal & tumor sample ids if SNVs + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + params["sample_id"] = wildcards.library_name + params["normal_id"] = self.normal_library[wildcards.library_name] + params["min_variant_depth"] = self.config[self.name]["segment"]["min_variant_depth"] + params["zygocity_freq"] = self.config[self.name]["segment"]["zygocity_freq"] + return params + + def _get_output_files_segment(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return { + "segments": os.path.join("work", base_name, "out", base_name + ".segments.cns"), + "dataframe": os.path.join("work", base_name, "out", 
"dataframe.rds"), + } + + # ----- Call ---------------------------------------------------------------------------------- + + def _get_input_files_call(self, wildcards: Wildcards) -> typing.Dict[str, str]: + # Segmentation + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + input_files = {"segments": f"work/{base_name}/out/{base_name}.segments.cns"} + # SNVs if requested and available (normal must be present) + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + # Purity from the tool if requested and not from the samplesheet + if ( + self.config[self.name]["purity"]["enabled"] and self.config[self.name]["purity"]["tool"] + ): # Need purity, and can use tool to obain it + if ( + self.config[self.name]["purity"]["ignore_samplesheet"] + or wildcards.library_name not in self.parent.purity + ): + # Don't use samplesheet + input_files["purity"] = ( + f"work/{base_name}/out/{wildcards.mapper}.{self.config.purity.tool}.txt" + ) + return input_files + + def _get_params_call(self, wildcards: Wildcards) -> typing.Dict[str, str]: + # Call parameters + params = { + "method": self.config[self.name]["call"]["method"], + "thresholds": self.config[self.name]["call"]["thresholds"], + "filter": self.config[self.name]["call"]["filter"], + "drop_low_coverage": self.config[self.name]["call"]["drop_low_coverage"], + "male_reference": self.config[self.name]["use_male_reference"], + } + # If center_at defined, use it, otherwise use the center method + center = self.config[self.name]["call"].get("center_at", None) + if center is not None: + params["center_at"] = center + else: + params["center"] = self.config[self.name]["call"].get("center", "None") + # Normal & tumor sample ids if SNVs + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + params["sample_id"] = wildcards.library_name + params["normal_id"] = self.normal_library[wildcards.library_name] + # Sample sex if known, otherwise guessed by the tool + sample_sex = self._get_sample_sex(wildcards.library_name) + if sample_sex == Sex.MALE or sample_sex == Sex.FEMALE: + params["sample_sex"] = sample_sex + # If requested, purity from samplesheet or from default if no tool + if self.config[self.name]["purity"]["enabled"]: + purity = self.parent.purity.get( + wildcards.library_name, self.config.purity.default_purity + ) + if purity is not None and not self.config[self.name]["purity"]["ignore_samplesheet"]: + params["purity"] = purity + if self.config.default_ploidy: + params["ploidy"] = self.config.default_ploidy + return params + + def _get_output_files_call(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"calls": os.path.join("work", base_name, "out", base_name + ".cns")} + + # ----- Bintest ------------------------------------------------------------------------------- + + def _get_input_files_bintest(self, wildcards: Wildcards) -> typing.Dict[str, str]: + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + return { + "coverage": f"work/{base_name}/out/{base_name}.cnr", + "segments": f"work/{base_name}/out/{base_name}.segments.cns", + } + + def _get_params_bintest(self, wildcards: Wildcards) -> typing.Dict[str, str]: + return { + "alpha": self.config[self.name]["bintest"]["alpha"], + "target": self.config[self.name]["bintest"]["target"], + } + + def 
_get_output_files_bintest(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"coverage": os.path.join("work", base_name, "out", base_name + ".bintest.cnr")} + + # ----- Plots -------------------------------------------------------------------------------- + + def _get_input_files_plot_diagram(self, wildcards: Wildcards) -> typing.Dict[str, str]: + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + input_files = { + "coverage": f"work/{base_name}/out/{base_name}.cnr", + "segments": f"work/{base_name}/out/{base_name}.segments.cns", + } + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + return input_files + + def _get_params_plot_diagram(self, wildcards: Wildcards) -> typing.Dict[str, str]: + return { + "threshold": self.config[self.name]["plots"]["diagram"]["threshold"], + "min_probes": self.config[self.name]["plots"]["diagram"]["min_probes"], + "no_shift_xy": self.config[self.name]["plots"]["diagram"]["no_shift_xy"], + } + + def _get_output_files_plot_diagram(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"figure": os.path.join("work", base_name, "plot", "diagram", "{contig_name}.pdf")} + + def _get_input_files_plot_scatter(self, wildcards: Wildcards) -> typing.Dict[str, str]: + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + input_files = { + "coverage": f"work/{base_name}/out/{base_name}.cnr", + "segments": f"work/{base_name}/out/{base_name}.segments.cns", + } + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + return input_files + + def _get_params_plot_scatter(self, wildcards: Wildcards) -> typing.Dict[str, str]: + params = { + "chromosome": wildcards.contig_name, + "antitarget_marker": self.config[self.name]["plots"]["scatter"]["antitarget_marker"], + "by_bin": self.config[self.name]["plots"]["scatter"]["by_bin"], + "segment_color": self.config[self.name]["plots"]["scatter"]["segment_color"], + "trend": self.config[self.name]["plots"]["scatter"]["trend"], + "y_max": self.config[self.name]["plots"]["scatter"]["y_max"], + "y_min": self.config[self.name]["plots"]["scatter"]["y_min"], + "fig_size": self.config[self.name]["plots"]["scatter"]["fig_size"], + "sample_id": wildcards.library_name, + } + variants = self.config[self.name].get("variants", None) + if variants and wildcards.library_name in self.normal_library: + params["normal_id"] = self.normal_library[wildcards.library_name] + params["min_variant_depth"] = self.config[self.name]["plots"]["scatter"][ + "min_variant_depth" + ] + params["zygocity_freq"] = self.config[self.name]["plots"]["scatter"]["zygocity_freq"] + return params + + def _get_output_files_plot_scatter(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"figure": os.path.join("work", base_name, "plot", "scatter", "{contig_name}.pdf")} + + # ----- Metrics (metrics & segmetrics) -------------------------------------------------------- + + def _get_input_files_report_metrics(self, wildcards: Wildcards) -> typing.Dict[str, str]: + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + return { + "coverage": f"work/{base_name}/out/{base_name}.cnr", + "segments": 
f"work/{base_name}/out/{base_name}.segments.cns", + } + + def _get_params_report_metrics(self, wildcards: Wildcards) -> typing.Dict[str, str]: + return {"drop_low_coverage": self.config[self.name]["reports"]["drop_low_coverage"]} + + def _get_output_files_report_metrics(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"report": os.path.join("work", base_name, "report", base_name + ".metrics.tsv")} + + def _get_input_files_report_segmetrics(self, wildcards: Wildcards) -> typing.Dict[str, str]: + base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + return { + "coverage": f"work/{base_name}/out/{base_name}.cnr", + "segments": f"work/{base_name}/out/{base_name}.segments.cns", + } + + def _get_params_report_segmetrics(self, wildcards: Wildcards) -> typing.Dict[str, str]: + return { + "drop_low_coverage": self.config[self.name]["reports"]["drop_low_coverage"], + "stats": ( + "mean", + "median", + "mode", + "t-test", + "stdev", + "sem", + "mad", + "mse", + "iqr", + "bivar", + "ci", + "pi", + ), + "alpha": self.config[self.name]["reports"]["alpha"], + "bootstrap": self.config[self.name]["reports"]["bootstrap"], + } + + def _get_output_files_report_segmetrics(self) -> typing.Dict[str, str]: + base_name = f"{{mapper}}.{self.name}.{{library_name}}" + return {"report": os.path.join("work", base_name, "report", base_name + ".segmetrics.tsv")} + + +class SomaticCnvCallingWorkflow(BaseStep): + """Perform somatic targeted sequencing CNV calling""" + + #: Workflow name + name = "somatic_cnv_calling" + + #: Default biomed sheet class + sheet_shortcut_class = CancerCaseSheet + + sheet_shortcut_kwargs = { + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=False) + } + + @classmethod + def default_config_yaml(cls): + """Return default config YAML, to be overwritten by project-specific one""" + return DEFAULT_CONFIG + + def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + config_model_class=SomaticCnvCallingConfigModel, + previous_steps=(NgsMappingWorkflow,), + ) + # Register sub step classes so the sub steps are available + self.register_sub_step_classes( + ( + CnvKitStepPart, + # ControlfreecStepPart, + # SequenzaStepPart, + # PureCNStepPart, + LinkOutStepPart, + ) + ) + # Initialize sub-workflows + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) + self.registered_pons = self._optionally_register_pon() + + # Collect extra information per library + self.normal_library = self._get_normal_library() + self.libraryKit = self._get_panel_information() + self.sex = self._get_sex() + self.purity = self._get_purity() + + def get_result_files(self) -> OutputFiles: + fns = [] + for seq_type, tools in self.config.tools: + for library in self._get_libraries(): + if library.extra_infos.get("libraryType", "").lower() != seq_type: + continue + test_sample = library.test_sample + if test_sample.extra_infos.get("extractionType", "") != "DNA": + continue + bio_sample = test_sample.bio_sample + is_tumor = bio_sample.extra_infos.get("isTumor", True) + if is_tumor: + for tool in tools: + f = self.substep_getattr(tool, "get_result_files") + for mapper in self.w_config.step_config["ngs_mapping"]["tools"]["dna"]: + for fn in f(library.name, mapper): + fns.append(fn) + return OutputFiles(fns) + + def _get_libraries(self) -> typing.Iterator[NGSLibrary]: + for sheet in self.shortcut_sheets: + 
for donor in sheet.sheet.bio_entities.values(): + for bio_sample in donor.bio_samples.values(): + for test_sample in bio_sample.test_samples.values(): + for library in test_sample.ngs_libraries.values(): + yield library + + def _get_normal_library(self) -> typing.Dict[str, str]: + normal_for_donor = {} + for library in self._get_libraries(): + test_sample = library.test_sample + if test_sample.extra_infos.get("extractionType", "") != "DNA": + continue + bio_sample = test_sample.bio_sample + is_tumor = bio_sample.extra_infos.get("isTumor", None) + if is_tumor is None: + raise ValueError(f"Missing 'isTumor' value for library '{library.name}'") + if is_tumor: + continue + donor = bio_sample.bio_entity + if donor.name in normal_for_donor: + raise ValueError(f"Multiple normals for donor '{donor.name}'") + normal_for_donor[donor.name] = library.name + + normal_library = {} + for library in self._get_libraries(): + test_sample = library.test_sample + if test_sample.extra_infos.get("extractionType", "") != "DNA": + continue + bio_sample = test_sample.bio_sample + donor = bio_sample.bio_entity + if bio_sample.extra_infos.get("isTumor", True): + normal_library[library.name] = normal_for_donor[donor.name] + return normal_library + + def _optionally_register_pon(self) -> typing.Dict[str, str]: + """ + Register all possible combination of panel of normals: + - WGS PON for all configured WGS tools which require/can use it + - WES PON for all configured WES tools which require/can use it, one for each enrichment kit + + Note that there is no need to specify the genome release, + because the panel_of_normals step used here MUST be in the same project, + so it has the same configuration, and only one genome release is allowed per configuration. + """ + registered_pons = list() + for tool in self.config.tools.wgs: + pon_name = f"wgs.{tool}" + if pon_name in registered_pons: + continue + if self.config[tool].get("panel_of_normals", None) and self.config[ + tool + ].panel_of_normals.get("path_panel_of_normals_step", None): + self.register_sub_workflow( + "panel_of_normals", + self.config[tool].panel_of_normals.path_panel_of_normals_step, + pon_name, + ) + registered_pons.append(pon_name) + for tool in self.config.tools.wes: + for panel in self.config.path_target_interval_list_mapping: + pon_name = f"wes.{tool}.{panel.name}" + if pon_name in registered_pons: + continue + if self.config[tool].get("panel_of_normals", None) and self.config[ + tool + ].panel_of_normals.get("path_panel_of_normals_step", None): + self.register_sub_workflow( + "panel_of_normals", + self.config[tool].panel_of_normals.path_panel_of_normals_step, + pon_name, + ) + registered_pons.append(pon_name) + return registered_pons + + def _get_panel_information(self) -> typing.Dict[str, str]: + # Set default panel + default = None + for panel in self.config.path_target_interval_list_mapping: + if panel.name == "__default__": + default = panel + break + + # Extract library pattern (the "libraryKit" column in samplesheet) + # On output: + # - the panel name and panel path if libraryKit is present & known + # - the default panel path if libraryKit is undefined or not found + # - None for WGS + # - ValueError if libraryType is missing or unknown (not WES nor WGS) + libraryKit = {} + for library in self._get_libraries(): + test_sample = library.test_sample + if test_sample.extra_infos.get("extractionType", "") != "DNA": + continue + + libraryType = library.extra_infos.get("libraryType", None) + if libraryType is None: + raise ValueError(f"Missing 
library type for library '{library.name}'")
+            elif libraryType == "WES":
+                if library.extra_infos.get("libraryKit", None):
+                    for panel in self.config.path_target_interval_list_mapping:
+                        if re.match(panel.pattern, library.extra_infos.get("libraryKit")):
+                            libraryKit[library.name] = panel
+                            break
+                    if library.name not in libraryKit:
+                        libraryKit[library.name] = default
+                else:
+                    libraryKit[library.name] = default
+                if libraryKit[library.name] is None:
+                    raise ValueError(f"Undefined panel for library '{library.name}'")
+            elif libraryType == "WGS":
+                libraryKit[library.name] = None
+            else:
+                raise ValueError(
+                    f"Unknown library type '{libraryType}' for library '{library.name}'"
+                )
+
+        return libraryKit
+
+    def _get_purity(self) -> typing.Dict[str, float]:
+        """Returns the purity value from the 'purity' library extra_infos. Missing otherwise"""
+        purity = {}
+        for library in self._get_libraries():
+            p = library.extra_infos.get("purity", None)
+            if p:
+                try:
+                    p = float(p)
+                    if 0 <= p <= 1:
+                        purity[library.name] = p
+                except ValueError:
+                    # Silently ignore purity values that cannot be parsed as numbers
+                    pass
+        return purity
+
+    def _get_sex(self) -> typing.Dict[str, Sex]:
+        sex = {}
+        for library in self._get_libraries():
+            donor = library.test_sample.bio_sample.bio_entity
+            donor_sex = donor.extra_infos.get("sex", None)
+            if donor_sex == "male":
+                donor_sex = Sex.MALE
+            elif donor_sex == "female":
+                donor_sex = Sex.FEMALE
+            else:
+                donor_sex = Sex.UNKNOWN
+            sex[library.name] = donor_sex
+        return sex
+
+    def _get_panel_of_normals_path(self, tool: str, panel: LibraryKitDefinition | None) -> str:
+        pon_path = None
+        assert self.config[tool]["panel_of_normals"][
+            "enabled"
+        ], f"Panel of normals not enabled for '{tool}'"
+        assert (
+            self.config[tool]["panel_of_normals"]["origin"] == PanelOfNormalsOrigin.PREVIOUS_STEP
+        ), f"'{tool}' panel of normals not from previous step"
+        if panel is None:
+            pon_id = f"wgs.{tool}"
+        else:
+            pon_id = f"wes.{tool}.{panel.name}"
+        assert pon_id in self.registered_pons, f"Requested panel '{pon_id}' not registered"
+        pon = self.sub_workflows[pon_id]
+        pon_path = pon(f"output/{{mapper}}.{tool}/out/{panel.name}.ext")
+        return pon_path
diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules
new file mode 100644
index 000000000..8ec5d13fe
--- /dev/null
+++ b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules
@@ -0,0 +1,279 @@
+rule somatic_targeted_seq_cnv_calling_cnvkit_access:
+    output:
+        **wf.get_output_files("cnvkit", "access"),
+    params:
+        wf.get_params("cnvkit", "access"),
+    log:
+        **wf.get_log_file("cnvkit", "access"),
+    threads: wf.get_resource("cnvkit", "access", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "access", "time"),
+        memory=wf.get_resource("cnvkit", "access", "memory"),
+        partition=wf.get_resource("cnvkit", "access", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "access", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/access")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_target:
+    input:
+        unpack(wf.get_input_files("cnvkit", "target")),
+    params:
+        wf.get_params("cnvkit", "target"),
+    output:
+        **wf.get_output_files("cnvkit", "target"),
+    log:
+        **wf.get_log_file("cnvkit", "target"),
+    threads: wf.get_resource("cnvkit", "target", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "target", "time"),
+        memory=wf.get_resource("cnvkit", "target", "memory"),
+        partition=wf.get_resource("cnvkit", "target", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "target", "tmpdir"),
+    wrapper:
+        
wf.wrapper_path("cnvkit/target") + + +rule somatic_targeted_seq_cnv_calling_cnvkit_antitarget: + input: + unpack(wf.get_input_files("cnvkit", "antitarget")), + params: + wf.get_params("cnvkit", "antitarget"), + output: + **wf.get_output_files("cnvkit", "antitarget"), + log: + **wf.get_log_file("cnvkit", "antitarget"), + threads: wf.get_resource("cnvkit", "antitarget", "threads") + resources: + time=wf.get_resource("cnvkit", "antitarget", "time"), + memory=wf.get_resource("cnvkit", "antitarget", "memory"), + partition=wf.get_resource("cnvkit", "antitarget", "partition"), + tmpdir=wf.get_resource("cnvkit", "antitarget", "tmpdir"), + wrapper: + wf.wrapper_path("cnvkit/antitarget") + + +rule somatic_targeted_seq_cnv_calling_cnvkit_coverage: + input: + unpack(wf.get_input_files("cnvkit", "coverage")), + params: + wf.get_params("cnvkit", "coverage"), + output: + **wf.get_output_files("cnvkit", "coverage"), + log: + **wf.get_log_file("cnvkit", "coverage"), + threads: wf.get_resource("cnvkit", "coverage", "threads") + resources: + time=wf.get_resource("cnvkit", "coverage", "time"), + memory=wf.get_resource("cnvkit", "coverage", "memory"), + partition=wf.get_resource("cnvkit", "coverage", "partition"), + tmpdir=wf.get_resource("cnvkit", "coverage", "tmpdir"), + wrapper: + wf.wrapper_path("cnvkit/coverage") + + +rule somatic_targeted_seq_cnv_calling_cnvkit_reference: + input: + unpack(wf.get_input_files("cnvkit", "reference")), + params: + wf.get_params("cnvkit", "reference"), + output: + **wf.get_output_files("cnvkit", "reference"), + log: + **wf.get_log_file("cnvkit", "reference"), + threads: wf.get_resource("cnvkit", "reference", "threads") + resources: + time=wf.get_resource("cnvkit", "reference", "time"), + memory=wf.get_resource("cnvkit", "reference", "memory"), + partition=wf.get_resource("cnvkit", "reference", "partition"), + tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), + wrapper: + wf.wrapper_path("cnvkit/reference") + + +# rule somatic_targeted_seq_cnv_calling_cnvkit_flat_reference_panel: +# input: +# unpack(wf.get_input_files("cnvkit", "flat_reference_panel")), +# params: +# wf.get_params("cnvkit", "reference"), +# output: +# **wf.get_output_files("cnvkit", "flat_reference_panel"), +# log: +# **wf.get_log_file("cnvkit", "reference"), +# threads: wf.get_resource("cnvkit", "reference", "threads") +# resources: +# time=wf.get_resource("cnvkit", "reference", "time"), +# memory=wf.get_resource("cnvkit", "reference", "memory"), +# partition=wf.get_resource("cnvkit", "reference", "partition"), +# tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), +# wrapper: +# wf.wrapper_path("cnvkit/reference") + + +# rule somatic_targeted_seq_cnv_calling_cnvkit_flat_reference_wgs: +# input: +# unpack(wf.get_input_files("cnvkit", "flat_reference_wgs")), +# params: +# wf.get_params("cnvkit", "reference"), +# output: +# **wf.get_output_files("cnvkit", "flat_reference_wgs"), +# log: +# **wf.get_log_file("cnvkit", "reference"), +# threads: wf.get_resource("cnvkit", "reference", "threads") +# resources: +# time=wf.get_resource("cnvkit", "reference", "time"), +# memory=wf.get_resource("cnvkit", "reference", "memory"), +# partition=wf.get_resource("cnvkit", "reference", "partition"), +# tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), +# wrapper: +# wf.wrapper_path("cnvkit/reference") + + +rule somatic_targeted_seq_cnv_calling_cnvkit_fix: + input: + unpack(wf.get_input_files("cnvkit", "fix")), + params: + wf.get_params("cnvkit", "fix"), + output: + **wf.get_output_files("cnvkit", "fix"), + 
log:
+        **wf.get_log_file("cnvkit", "fix"),
+    threads: wf.get_resource("cnvkit", "fix", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "fix", "time"),
+        memory=wf.get_resource("cnvkit", "fix", "memory"),
+        partition=wf.get_resource("cnvkit", "fix", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "fix", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/fix")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_segment:
+    input:
+        unpack(wf.get_input_files("cnvkit", "segment")),
+    params:
+        wf.get_params("cnvkit", "segment"),
+    output:
+        **wf.get_output_files("cnvkit", "segment"),
+    log:
+        **wf.get_log_file("cnvkit", "segment"),
+    threads: wf.get_resource("cnvkit", "segment", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "segment", "time"),
+        memory=wf.get_resource("cnvkit", "segment", "memory"),
+        partition=wf.get_resource("cnvkit", "segment", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "segment", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/segment")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_call:
+    input:
+        unpack(wf.get_input_files("cnvkit", "call")),
+    params:
+        wf.get_params("cnvkit", "call"),
+    output:
+        **wf.get_output_files("cnvkit", "call"),
+    log:
+        **wf.get_log_file("cnvkit", "call"),
+    threads: wf.get_resource("cnvkit", "call", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "call", "time"),
+        memory=wf.get_resource("cnvkit", "call", "memory"),
+        partition=wf.get_resource("cnvkit", "call", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "call", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/call")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_bintest:
+    input:
+        unpack(wf.get_input_files("cnvkit", "bintest")),
+    output:
+        **wf.get_output_files("cnvkit", "bintest"),
+    params:
+        wf.get_params("cnvkit", "bintest"),
+    log:
+        **wf.get_log_file("cnvkit", "bintest"),
+    threads: wf.get_resource("cnvkit", "bintest", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "bintest", "time"),
+        memory=wf.get_resource("cnvkit", "bintest", "memory"),
+        partition=wf.get_resource("cnvkit", "bintest", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "bintest", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/bintest")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_plot_diagram:
+    input:
+        unpack(wf.get_input_files("cnvkit", "plot/diagram")),
+    params:
+        wf.get_params("cnvkit", "plot/diagram"),
+    output:
+        **wf.get_output_files("cnvkit", "plot/diagram"),
+    log:
+        **wf.get_log_file("cnvkit", "plot/diagram"),
+    threads: wf.get_resource("cnvkit", "plot/diagram", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "plot/diagram", "time"),
+        memory=wf.get_resource("cnvkit", "plot/diagram", "memory"),
+        partition=wf.get_resource("cnvkit", "plot/diagram", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "plot/diagram", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/plot/diagram")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_plot_scatter:
+    input:
+        unpack(wf.get_input_files("cnvkit", "plot/scatter")),
+    params:
+        wf.get_params("cnvkit", "plot/scatter"),
+    output:
+        **wf.get_output_files("cnvkit", "plot/scatter"),
+    log:
+        **wf.get_log_file("cnvkit", "plot/scatter"),
+    threads: wf.get_resource("cnvkit", "plot/scatter", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "plot/scatter", "time"),
+        memory=wf.get_resource("cnvkit", "plot/scatter", "memory"),
+        partition=wf.get_resource("cnvkit", "plot/scatter", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "plot/scatter", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/plot/scatter")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_report_metrics:
+    input:
+        unpack(wf.get_input_files("cnvkit", "report/metrics")),
+    params:
+        wf.get_params("cnvkit", "report/metrics"),
+    output:
+        **wf.get_output_files("cnvkit", "report/metrics"),
+    log:
+        **wf.get_log_file("cnvkit", "report/metrics"),
+    threads: wf.get_resource("cnvkit", "report/metrics", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "report/metrics", "time"),
+        memory=wf.get_resource("cnvkit", "report/metrics", "memory"),
+        partition=wf.get_resource("cnvkit", "report/metrics", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "report/metrics", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/report/metrics")
+
+
+rule somatic_targeted_seq_cnv_calling_cnvkit_report_segmetrics:
+    input:
+        unpack(wf.get_input_files("cnvkit", "report/segmetrics")),
+    params:
+        wf.get_params("cnvkit", "report/segmetrics"),
+    output:
+        **wf.get_output_files("cnvkit", "report/segmetrics"),
+    log:
+        **wf.get_log_file("cnvkit", "report/segmetrics"),
+    threads: wf.get_resource("cnvkit", "report/segmetrics", "threads")
+    resources:
+        time=wf.get_resource("cnvkit", "report/segmetrics", "time"),
+        memory=wf.get_resource("cnvkit", "report/segmetrics", "memory"),
+        partition=wf.get_resource("cnvkit", "report/segmetrics", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "report/segmetrics", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("cnvkit/report/segmetrics")
diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_cnv_calling/model.py
new file mode 100644
index 000000000..ef10a9983
--- /dev/null
+++ b/snappy_pipeline/workflows/somatic_cnv_calling/model.py
@@ -0,0 +1,508 @@
+import enum
+import typing
+from typing import Annotated
+
+from pydantic import Field, model_validator  # , validator
+
+from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, Parallel
+
+
+class WgsCaller(enum.StrEnum):
+    CNVKIT = "cnvkit"
+    CONTROL_FREEC = "control_freec"
+
+
+class WesCaller(enum.StrEnum):
+    CNVKIT = "cnvkit"
+    PURECN = "purecn"
+    SEQUENZA = "sequenza"
+
+
+class Tools(SnappyModel):
+    wgs: Annotated[typing.List[WgsCaller], EnumField(WgsCaller, [])]
+    """WGS calling tools"""
+
+    wes: Annotated[typing.List[WesCaller], EnumField(WesCaller, [])]
+    """WES calling tools"""
+
+
+class Sex(enum.StrEnum):
+    SAMPLESHEET = "samplesheet"
+    """Obtain the sex from the samplesheet"""
+    DIPLOID_ONLY = "diploid_only"
+    """Compute CNV for diploid chromosomes only"""
+    AUTO = "auto"
+    """Automatic sex detection using X/Y coverage"""
+    FEMALE = "female"
+    """Assume all samples are female"""
+    MALE = "male"
+    """Assume all samples are male"""
+    UNKNOWN = "unknown"
+    """Sex is unknown"""
+
+
+class SequencingMethod(enum.StrEnum):
+    WES = "hybrid"
+    PANEL = "amplicon"
+    WGS = "wgs"
+
+
+class LibraryKitDefinition(SnappyModel):
+    """
+    Mapping from enrichment kit to target region BED file, for either computing per-target
+    region coverage or selecting targeted exons.
+
+    The following will match both the stock IDT library kit and the ones
+    with spike-ins seen from Yale genomics. The path above would be
+    mapped to the name "default".
+ - name: IDT_xGen_V1_0 + pattern: "xGen Exome Research Panel V1\\.0*" + path: "path/to/targets.bed" + """ + + name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])] + + pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])] + + path: Annotated[str, Field(examples=["path/to/targets.bed"])] + + +class PanelOfNormalsOrigin(enum.StrEnum): + PREVIOUS_STEP = "previous_step" + """Use (& possibly create) a panel of normals from the current cohort of normals in the panel_of_normals step""" + STATIC = "static" + """Use an panel of normals from another cohort or from public data""" + + +class PanelOfNormals(SnappyModel): + enabled: bool = False + origin: PanelOfNormalsOrigin = PanelOfNormalsOrigin.PREVIOUS_STEP + path_panel_of_normals: str = "../panel_of_normals" + """ + Path to panel of normals created in current project + + The panel of normals can be either a file (typically from another project), + or from the current project's panel_of_normals step. + + In the latter case, the missing one(s) (in case there are more than one panel, or if there are WES & WGS) + will be created when not present. + The matching of genome release & exome baits is done on genome name & exome baits md5 checksum. + These are computed in the panel of normals step, and saved with the panel itself. + + There is no such matching if a panel of normal file is provided. The panel of normals validity is left to the user. + """ + + +class Mutect2(Parallel): + panel_of_normals: PanelOfNormals | None = None + """ + Panel of normals created by the PanelOfNormals program. + """ + + germline_resource: str + + common_variants: str | None = "" + """Common germline variants for contamination estimation""" + + arguments_for_purecn: bool = True + """ + PureCN requires that Mutect2 be called with arguments: + --genotype-germline-sites true --genotype-pon-sites true + """ + + extra_arguments: Annotated[ + typing.List[str], + # AfterValidator(argument), + Field( + examples=[ + "--read-filter CigarContainsNoNOperator", + "--annotation AssemblyComplexity BaseQuality", + ] + ), + ] = [] + """ + List additional Mutect2 arguments. + Each additional argument must be of the form: + "-- " + For example, to filter reads prior to calling & to add annotations to the output vcf: + - "--read-filter CigarContainsNoNOperator" + - "--annotation AssemblyComplexity BaseQuality" + """ + + window_length: int = 300000000 + + +class VariantTool(enum.StrEnum): + MUTECT2 = "mutect2" + + +class Variant(SnappyModel): + enabled: bool = False + tool: VariantTool | None = None + + mutect2: Mutect2 | None = None + + +class Ascat(SnappyModel): + pass + """TODO: configure purity tools (except for PureCN)""" + + +class Sequenza(SnappyModel): + pass + + +class ControlFreec(SnappyModel): + pass + + +class PureCn(SnappyModel): + panel_of_normals: PanelOfNormals + """ + Panel of normals created by the NormalDB.R script. + This is required even if the normal/tumor paired mode won't use it. 
+ """ + + variants: VariantTool + + mappability: str = "" + """ + GRCh38: + /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw + """ + + reptiming: str = "" + """Nothing for GRCh38""" + + seed: int = 1234567 + extra_commands: typing.Dict[str, typing.Any] = { + "model": "betabin", + "fun-segmentation": "PSCBS", + "post-optimize": "", + } + """Recommended extra arguments for PureCN, extra_commands: {} to clear them all""" + + path_container: Annotated[ + str, Field(examples=["../panel_of_normals/work/containers/out/purecn.simg"]) + ] + """Conda installation not working well, container is required""" + + path_intervals: Annotated[ + str, + Field( + examples=[ + "../panel_of_normals/output/purecn/out/_.list" + ] + ), + ] + + +class PurityTool(enum.StrEnum): + ASCAT = "ascat" + PURECN = "purecn" + + +class Purity(SnappyModel): + enabled: bool = False + + ignore_samplesheet: bool = False + """Discard purity values in samplesheet when they exist""" + default_value: float | None = None + """Purity value for all samples""" + + tool: PurityTool | None = None + """Tool used for purity estimation, if not set, try samplesheet, otherwise default_value""" + + ascat: Ascat | None = None + + +class CnvkitSegmentationMethod(enum.StrEnum): + CBS = "cbs" + FLASSO = "flasso" + HAAR = "haar" + HMM = "hmm" + HMM_TUMOR = "hmm-tumor" + HMM_GERMLINE = "hmm-germline" + NONE = "none" + + +class CnvkitCallingMethod(enum.StrEnum): + THRESHOLD = "threshold" + CLONAL = "clonal" + NONE = "none" + + +class CnvkitCenterMethod(enum.StrEnum): + MEAN = "mean" + MEDIAN = "median" + MODE = "mode" + BIWEIGHT = "biweight" + + +class CnvkitFilterMethod(enum.StrEnum): + AMPDEL = "ampdel" + CN = "cn" + CI = "ci" + SEM = "sem" + + +class CnvkitAccess(SnappyModel): + exclude: Annotated[ + str | None, + Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/cnvkit/access-5k-mappable.grch37.bed" + ] + ), + ] = None + """Regions accessible to mapping""" + + min_gap_size: int = 5000 + """Minimum gap size between accessible sequence regions. Regions separated by less than this distance will be joined together.""" + + +class CnvkitTarget(SnappyModel): + split: bool = False + """Split large tiled intervals into smaller, consecutive targets.""" + avg_size: float = 800 / 3 + """Average size of split target bins (results are approximate)""" + + +class CnvkitAntitarget(SnappyModel): + avg_size: float = 150000 + """Average size of split antitarget bins (results are approximate)""" + min_size: float | None = None + """Minimum size of antitarget bins (smaller regions are dropped). When missing, 1/16 avg size""" + + +class CnvkitCoverage(SnappyModel): + count: bool = False + """Get read depths by counting read midpoints within each bin.""" + min_mapq: int = 0 + """Minimum mapping quality score (phred scale 0-60) to count a read for coverage depth.""" + + +class CnvkitReference(SnappyModel): + cluster: bool = False + """Calculate and store summary stats for clustered subsets of the normal samples with similar coverage profiles.""" + min_cluster_size: int = 4 + """Minimum cluster size to keep in reference profiles.""" + no_gc: bool = False + """Skip GC correction.""" + no_edge: bool = None + """Skip edge correction. 
Automatic selection when None (True for WGS & Panel, False for WES)""" + no_rmask: bool = False + """Skip RepeatMasker correction.""" + + +class CnvkitFix(SnappyModel): + cluster: bool = False + """Compare and use cluster-specific values present in the reference profile.""" + no_gc: bool = False + """Skip GC correction.""" + no_edge: bool = False + """Skip edge correction.""" + no_rmask: bool = False + """Skip RepeatMasker correction.""" + + +class CnvkitSegment(SnappyModel): + method: CnvkitSegmentationMethod = CnvkitSegmentationMethod.CBS + """Segmentation method, or 'NONE' for chromosome arm-level averages as segments""" + threshold: float = 0.0001 + """Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation. For HMM methods, this is the smoothing window size.""" + drop_low_coverage: bool = False + """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" + drop_outliers: float = 10 + """Drop outlier bins more than this many multiples of the 95th quantile away from the average within a rolling window. Set to 0 for no outlier filtering.""" + smooth_cbs: bool = False + + min_variant_depth: float = 20 + """Minimum read depth for a SNV to be displayed in the b-allele frequency plot.""" + zygocity_freq: float = 0.25 + """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" + + @model_validator(mode="after") + def ensure_smooth_for_cbs_only(self) -> typing.Self: + if self.smooth_cbs and self.method != CnvkitSegmentationMethod.CBS: + raise ValueError("'smooth_cbs' option can be used only with 'CBS' segmentation method") + return self + + +class CnvkitCall(SnappyModel): + method: CnvkitCallingMethod = CnvkitCallingMethod.THRESHOLD + """Calling method.""" + thresholds: str | None = None + """Hard thresholds for calling each integer copy number, separated by commas""" + center: CnvkitCenterMethod | None = CnvkitCenterMethod.MEDIAN + """Re-center the log2 ratio values using this estimator of the center or average value. ('median' if no argument given.)""" + center_at: float | None = None + """Subtract a constant number from all log2 ratios. 
For "manual" re-centering.""" + filter: CnvkitFilterMethod | None = None + """Merge segments flagged by the specified filter(s) with the adjacent segment(s).""" + ploidy: float | None = 2 + """Ploidy of the sample cells.""" + drop_low_coverage: bool = False + """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" + + min_variant_depth: float = 20 + """Minimum read depth for a SNV to be displayed in the b-allele frequency calculation.""" + zygocity_freq: float = 0.25 + """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" + + +class CnvkitBintest(SnappyModel): + alpha: float = 0.005 + """Significance threhold.""" + target: bool = False + """Test target bins only; ignore off-target bins.""" + + +class CnvkitPlotDiagram(SnappyModel): + threshold: float = 0.5 + """Copy number change threshold to label genes.""" + min_probes: int = 3 + """Minimum number of covered probes to label a gene.""" + no_shift_xy: bool = False + + +class CnvkitPlotScatter(SnappyModel): + antitarget_marker: str | None = None + """Plot antitargets using this symbol when plotting in a selected chromosomal region.""" + by_bin: bool = False + """Plot data x-coordinates by bin indices instead of genomic coordinates.""" + segment_color: str | None = None + """Plot segment lines in this color. Value can be any string accepted by matplotlib.""" + trend: bool = False + """Draw a smoothed local trendline on the scatter plot.""" + y_max: float | None = None + """y-axis upper limit.""" + y_min: float | None = None + """y-axis lower limit.""" + fig_size: typing.Tuple[float, float] | None = None + """Width and height of the plot in inches.""" + + min_variant_depth: float = 20 + """Minimum read depth for a SNV to be displayed in the b-allele frequency calculation.""" + zygocity_freq: float = 0.25 + """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" + + +class CnvkitPlot(SnappyModel): + diagram: CnvkitPlotDiagram = CnvkitPlotDiagram() + scatter: CnvkitPlotScatter = CnvkitPlotScatter() + + +class CnvkitReportMetrics(SnappyModel): + drop_low_coverage: bool = False + """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" + + +class CnvkitReportSegmetrics(SnappyModel): + drop_low_coverage: bool = False + """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" + alpha: float = 0.05 + """Level to estimate confidence and prediction intervals; use with --ci and --pi.""" + bootstrap: int = 100 + """Number of bootstrap iterations to estimate confidence interval; use with --ci.""" + + +class CnvkitReport(enum.StrEnum): + METRICS = "metrics" + SEGMETRICS = "segmetrics" + + +class Cnvkit(SnappyModel): + panel_of_normals: PanelOfNormals | None = None + + variants: VariantTool | None = None + + purity: Purity + """ + When present, purity estimates can be used for calling segments. The requested tool must be configured. + Or the purity can be provided in the samplesheet, as an extra information attached to the library. + + Note that PureCN cannot be used to estimate purity for WGS samples (because PureCN is WES & Panel-only). + TODO: This should be tested by a validation method, I don't know how to do (Till help!!) + TODO: The exact name is not yet set. 
+ """ + + access: CnvkitAccess = CnvkitAccess() + target: CnvkitTarget = CnvkitTarget() + antitarget: CnvkitAntitarget = CnvkitAntitarget() + coverage: CnvkitCoverage = CnvkitCoverage() + + reference: CnvkitReference | None = None + + @model_validator(mode="after") + def set_default_reference(self) -> typing.Self: + if self.reference is None and not self.panel_of_normals.enabled: + self.reference = CnvkitReference() + return self + + fix: CnvkitFix = CnvkitFix() + segment: CnvkitSegment = CnvkitSegment() + call: CnvkitCall = CnvkitCall() + bintest: CnvkitBintest = CnvkitBintest() + + use_male_reference: bool = False + """Create/use a male reference. Must be identical to panel of normals creation, when using one""" + + plots: typing.List[CnvkitPlot] = [] + + reports: typing.List[CnvkitReport] = [] + metrics: CnvkitReportMetrics | None = None + + # @validator("metrics") + # def get_default_reference(cls, v, values) -> CnvkitReportMetrics | None: + # if v is None and "metrics" in values["reports"]: + # return CnvkitReportMetrics() + # return None + + segmetrics: CnvkitReportSegmetrics | None = None + + # @validator("segmetrics") + # def get_default_reference(cls, v, values) -> CnvkitReportSegmetrics | None: + # if v is None and "segmetrics" in values["reports"]: + # return CnvkitReportSegmetrics() + # return None + + +class SomaticCnvCalling(SnappyStepModel): + path_ngs_mapping: str + """Path to bam files""" + + tools: Tools + """Tools for WGS & WES data""" + + path_target_interval_list_mapping: typing.List[LibraryKitDefinition] | None = None + + sex: Sex = Sex.DIPLOID_ONLY + + cnvkit: Cnvkit + purecn: PureCn | None = None + sequenza: Sequenza | None = None + control_freec: ControlFreec | None = None + + mutect2: Mutect2 | None = None + + default_ploidy: float | None = None + + # @model_validator(mode="after") + # def ensure_single_pon_step(self) -> typing.Self: + # """ + # I am not sure this is absolutely required. 
+ # I am trying to avoid registering the panel_of_normals step when initializing SomaticCnvCalling + # """ + # pon_steps = set() + # for tool in itertools.chain(self.tools.wgs, self.tools.wes): + # tool_config = getattr(self, tool) + # if ( + # tool_config + # and getattr(tool_config, "use_panel_of_normals") + # and tool_config.use_panel_of_normals == PanelOfNormalsUse.PREVIOUS_STEP + # ): + # pon_steps.add(str(tool_config.panel_of_normals.panel_of_normals)) + # if len(pon_steps) > 1: + # raise ValueError("Too many panel_of_normals steps") + # return self From b2193d6e423a926c2dc1d088190bc38d319f6bea Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 6 Nov 2024 14:40:10 +0100 Subject: [PATCH 30/46] refactor: renamed params to args, automation of md5 checksum handling, fix path in cnvkit wrappers, better handling of arguments & fix method to obtain input, parameters & output from snakemake --- .../workflows/panel_of_normals/Snakefile | 46 +- .../workflows/panel_of_normals/__init__.py | 158 +++---- .../workflows/panel_of_normals/model.py | 13 +- .../wrappers/cnvkit/access/wrapper.py | 9 +- .../wrappers/cnvkit/antitarget/wrapper.py | 35 +- .../wrappers/cnvkit/autobin/environment.yaml | 1 + .../wrappers/cnvkit/autobin/wrapper.py | 36 ++ .../wrappers/cnvkit/cnvkit_wrapper.py | 30 +- .../wrappers/cnvkit/coverage/wrapper.py | 20 +- .../wrappers/cnvkit/reference/wrapper.py | 38 +- .../wrappers/cnvkit/target/wrapper.py | 16 +- .../test_workflows_panel_of_normals.py | 51 +-- .../test_workflows_panel_of_normals_wgs.py | 395 ------------------ 13 files changed, 255 insertions(+), 593 deletions(-) create mode 120000 snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py delete mode 100644 tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index 5acd85d39..923431644 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -114,7 +114,7 @@ rule panel_of_normals_access_run: log: **wf.get_log_file("access", "run"), params: - **wf.get_params("access", "run"), + **wf.get_args("access", "run"), wrapper: wf.wrapper_path("cnvkit/access") @@ -135,7 +135,7 @@ rule panel_of_normals_cnvkit_access: log: **wf.get_log_file("cnvkit", "access"), params: - **{"args": wf.get_params("cnvkit", "access")}, + **{"args": wf.get_args("cnvkit", "access")}, wrapper: wf.wrapper_path("cnvkit/access") @@ -153,7 +153,7 @@ rule panel_of_normals_cnvkit_autobin: log: **wf.get_log_file("cnvkit", "autobin"), params: - **{"args": wf.get_params("cnvkit", "autobin")}, + **{"args": wf.get_args("cnvkit", "autobin")}, wrapper: wf.wrapper_path("cnvkit/autobin") @@ -171,7 +171,7 @@ rule panel_of_normals_cnvkit_target: log: **wf.get_log_file("cnvkit", "target"), params: - **{"args": wf.get_params("cnvkit", "target")}, + **{"args": wf.get_args("cnvkit", "target")}, wrapper: wf.wrapper_path("cnvkit/target") @@ -189,7 +189,7 @@ rule panel_of_normals_cnvkit_antitarget: log: **wf.get_log_file("cnvkit", "antitarget"), params: - **{"args": wf.get_params("cnvkit", "target")}, + **{"args": wf.get_args("cnvkit", "antitarget")}, wrapper: wf.wrapper_path("cnvkit/antitarget") @@ -207,7 +207,7 @@ rule panel_of_normals_cnvkit_coverage: log: **wf.get_log_file("cnvkit", "coverage"), params: - **{"args": wf.get_params("cnvkit", "target")}, + **{"args": wf.get_args("cnvkit", 
"coverage")}, wrapper: wf.wrapper_path("cnvkit/coverage") @@ -228,27 +228,27 @@ rule panel_of_normals_cnvkit_create_panel: log: **wf.get_log_file("cnvkit", "create_panel"), params: - **{"args": wf.get_params("cnvkit", "target")}, + **{"args": wf.get_args("cnvkit", "create_panel")}, wrapper: wf.wrapper_path("cnvkit/reference") -rule panel_of_normals_cnvkit_report: - input: - unpack(wf.get_input_files("cnvkit", "report")), - output: - **wf.get_output_files("cnvkit", "report"), - threads: wf.get_resource("cnvkit", "report", "threads") - resources: - time=wf.get_resource("cnvkit", "report", "time"), - memory=wf.get_resource("cnvkit", "report", "memory"), - partition=wf.get_resource("cnvkit", "report", "partition"), - log: - **wf.get_log_file("cnvkit", "report"), - params: - **{"args": wf.get_params("cnvkit", "target")}, - wrapper: - wf.wrapper_path("cnvkit/report") +# rule panel_of_normals_cnvkit_report: +# input: +# unpack(wf.get_input_files("cnvkit", "report")), +# output: +# **wf.get_output_files("cnvkit", "report"), +# threads: wf.get_resource("cnvkit", "report", "threads") +# resources: +# time=wf.get_resource("cnvkit", "report", "time"), +# memory=wf.get_resource("cnvkit", "report", "memory"), +# partition=wf.get_resource("cnvkit", "report", "partition"), +# log: +# **wf.get_log_file("cnvkit", "report"), +# params: +# **{"args": wf.get_args("cnvkit", "report")}, +# wrapper: +# wf.wrapper_path("cnvkit/report") # Panel of normals (PureCN) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index 77d76abc9..07656d4a2 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -273,19 +273,12 @@ def _get_extra_info(library): @dictify def _get_log_file(tpl, has_sh=False): """Return all log files files""" - ext_dict = { - "conda_list": "conda_list.txt", - "conda_list_md5": "conda_list.txt.md5", - "conda_info": "conda_info.txt", - "conda_info_md5": "conda_info.txt.md5", - "log": "log", - "log_md5": "log.md5", - } + ext_dict = {"conda_list": "conda_list.txt", "conda_info": "conda_info.txt", "log": "log"} if has_sh: ext_dict["sh"] = "sh" - ext_dict["sh_md5"] = ext_dict["sh"] + ".md5" for key, ext in ext_dict.items(): yield key, tpl + "." + ext + yield key + "_md5", tpl + "." 
+ ext + ".md5" class PureCnStepPart(PanelOfNormalsStepPart): @@ -564,47 +557,49 @@ def get_input_files(self, action): "antitarget": self._get_input_files_antitarget, "coverage": self._get_input_files_coverage, "create_panel": self._get_input_files_create_panel, - "report": self._get_input_files_report, } return mapping[action] - def get_params(self, action): + def get_args(self, action): """Return panel of normal files""" if action == "access": - return self._get_params_access + return self._get_args_access elif action == "autobin": - return self._get_params_autobin + return self._get_args_autobin elif action == "target": - return self._get_params_target + return self._get_args_target elif action == "antitarget": - return self._get_params_antitarget + return self._get_args_antitarget elif action == "coverage": - return self._get_params_coverage + return self._get_args_coverage elif action == "create_panel": - return self._get_params_create_panel - elif action == "report": - return self._get_params_report + return self._get_args_create_panel else: self._validate_action(action) def get_output_files(self, action): """Return panel of normal files""" + output_files = None if action == "access": - return self._get_output_files_access() + output_files = self._get_output_files_access() elif action == "autobin": - return self._get_output_files_autobin() + output_files = self._get_output_files_autobin() elif action == "target": - return self._get_output_files_target() + output_files = self._get_output_files_target() elif action == "antitarget": - return self._get_output_files_antitarget() + output_files = self._get_output_files_antitarget() elif action == "coverage": - return self._get_output_files_coverage() + output_files = self._get_output_files_coverage() elif action == "create_panel": - return self._get_output_files_create_panel() - elif action == "report": - return self._get_output_files_report() + output_files = self._get_output_files_create_panel() else: self._validate_action(action) + return dict( + zip( + list(output_files.keys()) + [k + "_md5" for k in output_files.keys()], + list(output_files.values()) + [v + ".md5" for v in output_files.values()], + ) + ) @classmethod def get_log_file(cls, action): @@ -616,7 +611,6 @@ def get_log_file(cls, action): "antitarget": "work/{mapper}.cnvkit/log/cnvkit.antitarget", "coverage": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage", "create_panel": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals", - "report": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report", } assert action in cls.actions return cls._get_log_file(tpls[action], has_sh=True) @@ -624,8 +618,11 @@ def get_log_file(cls, action): def _get_input_files_access(self, wildcards): return {} - def _get_params_access(self, wildcards): - return {"reference": self.w_config.static_data_config.reference.path} + def _get_args_access(self, wildcards): + return { + "reference": self.w_config.static_data_config.reference.path, + "min_gap_size": self.config.cnvkit.min_gap_size, + } def _get_output_files_access(self): return {"access": "work/{mapper}.cnvkit/out/cnvkit.access.bed"} @@ -645,11 +642,11 @@ def _get_input_files_autobin(self, wildcards): input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) return input_files - def _get_params_autobin(self, wildcards): + def _get_args_autobin(self, wildcards): assert ( self.libraryType == LibraryType.WGS ), "Trying to estimate average target size for non-WGS samples" - params = {} + 
params = {"bp_per_bin": 50000} if self.name in self.config.tools and self.config.cnvkit: if self.config.cnvkit.get("access", "") == "": params["method"] = "wgs" @@ -659,11 +656,7 @@ def _get_params_autobin(self, wildcards): return params def _get_output_files_autobin(self): - return { - "result": "work/{mapper}.cnvkit/out/cnvkit.autobin.txt", - "target": "$TMPDIR/{mapper}.targets.bed", - "antitarget": "$TMPDIR/{mapper}.antitarget.bed", - } + return {"result": "work/{mapper}.cnvkit/out/cnvkit.autobin.txt"} def _get_input_files_target(self, wildcards): """Helper wrapper function to estimate target average size in wgs mode""" @@ -676,26 +669,23 @@ def _get_input_files_target(self, wildcards): ) return input_files - def _get_params_target(self, wildcards): + def _get_args_target(self, wildcards): params = {} if self.name in self.config.tools: if self.libraryType == LibraryType.WES: params["target"] = self.config.cnvkit.path_target_regions if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": params["target"] = self.config.cnvkit.get("access") - if "features" in self.w_config.static_data_config: + if self.w_config.static_data_config.get("features", None): params["annotate"] = self.w_config.static_data_config.features.path - if self.config.cnvkit.get("split", False): + if self.config.cnvkit.get("split", True): params["split"] = True if self.config.cnvkit.get("target_avg_size", None): params["avg_size"] = self.config.cnvkit.get("target_avg_size") return params def _get_output_files_target(self): - return { - "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed", - "target_md5": "work/{mapper}.cnvkit/out/cnvkit.target.bed.md5", - } + return {"target": "work/{mapper}.cnvkit/out/cnvkit.target.bed"} def _get_input_files_antitarget(self, wildcards): """Helper wrapper function for computing antitarget locations""" @@ -705,22 +695,19 @@ def _get_input_files_antitarget(self, wildcards): "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed".format(**wildcards), } - def _get_params_antitarget(self, widlcards): + def _get_args_antitarget(self, wildcards): params = {} if self.name in self.config.tools: params = { "avg_size": self.config.cnvkit.antitarget_avg_size, "min_size": self.config.cnvkit.min_size, } - if self.config.cnvkit.get("access", "") == "": + if self.config.cnvkit.get("access", "") != "": params["access"] = self.config.cnvkit.get("access") return params def _get_output_files_antitarget(self): - return { - "antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed", - "antitarget_md5": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed.md5", - } + return {"antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed"} def _get_input_files_coverage(self, wildcards): """Helper wrapper function for computing coverage""" @@ -733,7 +720,7 @@ def _get_input_files_coverage(self, wildcards): "bai": bam + ".bai", } - def _get_params_coverage(self, wildcards): + def _get_args_coverage(self, wildcards): params = {} if self.name in self.config.tools: params = { @@ -747,7 +734,6 @@ def _get_params_coverage(self, wildcards): def _get_output_files_coverage(self): return { "coverage": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn", - "coverage_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn.md5", } def _get_input_files_create_panel(self, wildcards): @@ -763,9 +749,9 @@ def _get_input_files_create_panel(self, wildcards): ] else: antitargets = [] - return {"references": targets + antitargets} + return 
{"normals": targets + antitargets} - def _get_params_create_panel(self, wildcards): + def _get_args_create_panel(self, wildcards): params = {} if self.name in self.config.tools: params = { @@ -774,8 +760,8 @@ def _get_params_create_panel(self, wildcards): if self.config.cnvkit.get("cluster", False): params["cluster"] = True params["min_cluster_size"] = self.config.cnvkit.min_cluster_size - if "sample_sex" in self.config.cnvkit: - params["sample_sex"] = self.config.cnvkit.gender + if self.config.cnvkit.get("sample_sex"): + params["sample_sex"] = self.config.cnvkit.sample_sex if self.config.cnvkit.get("male_reference", False): params["male_reference"] = True if self.config.cnvkit.get("diploid_parx_genome", None): @@ -792,33 +778,7 @@ def _get_params_create_panel(self, wildcards): return params def _get_output_files_create_panel(self): - return { - "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", - "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", - } - - def _get_input_files_report(self, wildcards): - """Helper wrapper function for the panel of normals report""" - tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn" - targets = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries - ] - tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" - antitargets = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries - ] - return { - "target": targets, - "antitarget": antitargets, - } - - def _get_output_files_report(self): - return { - "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", - "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5", - "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv", - "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", - } + return {"panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn"} class AccessStepPart(PanelOfNormalsStepPart): @@ -847,7 +807,7 @@ def get_output_files(self, action): tpl = "work/access/out/access.bed" return {"access": tpl, "access_md5": tpl + ".md5"} - def get_params(self, action): + def get_args(self, action): # Validate action self._validate_action(action) if self.name in self.config.tools: @@ -914,32 +874,25 @@ def get_result_files(self): """ result_files = [] - log_ext_list = [ - "log", - "log.md5", - "conda_list.txt", - "conda_list.txt.md5", - "conda_info.txt", - "conda_info.txt.md5", - ] + log_ext_list = ["log", "conda_list.txt", "conda_info.txt"] if "mutect2" in set(self.config.tools) & set(TOOLS): tpl = "output/{mapper}.mutect2/out/{mapper}.mutect2.panel_of_normals.{ext}" - ext_list = ("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") + ext_list = ("vcf.gz", "vcf.gz.tbi") result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/{mapper}.mutect2/out/{mapper}.mutect2.genomicsDB.{ext}" - ext_list = ("tar.gz", "tar.gz.md5") + ext_list = ("tar.gz",) result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/{mapper}.mutect2/log/{mapper}.mutect2.panel_of_normals.{ext}" result_files.extend(self._expand_result_files(tpl, log_ext_list)) if "cnvkit" in set(self.config.tools) & set(TOOLS): tpls = [ - ("output/{mapper}.cnvkit/out/cnvkit.target.{ext}", ("bed", "bed.md5")), - ("output/{mapper}.cnvkit/out/cnvkit.antitarget.{ext}", ("bed", "bed.md5")), + ("output/{mapper}.cnvkit/out/cnvkit.target.{ext}", ("bed",)), + 
("output/{mapper}.cnvkit/out/cnvkit.antitarget.{ext}", ("bed",)), ( "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}", - ("cnn", "cnn.md5"), + ("cnn",), ), # ( # "output/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.{ext}", @@ -958,7 +911,7 @@ def get_result_files(self): "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.{ext}", ] for tpl in tpls: - result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh", "sh.md5"])) + result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh"])) # tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{ext}" # result_files.extend(self._expand_result_files(tpl, ("", ".md5"))) @@ -966,14 +919,14 @@ def get_result_files(self): tpl = "output/access/out/access.bed" result_files.extend([tpl + md5 for md5 in ("", ".md5")]) tpl = "output/access/log/access.{ext}" - result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh", "sh.md5"])) + result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh"])) if "purecn" in set(self.config.tools) & set(TOOLS): tpl = "output/{mapper}.purecn/out/{mapper}.purecn.panel_of_normals.{ext}" - ext_list = ("rds", "rds.md5") + ext_list = ("rds",) result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/{mapper}.purecn/out/{mapper}.purecn.mapping_bias.{ext}" - ext_list = ("rds", "rds.md5") + ext_list = ("rds",) result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/{mapper}.purecn/log/{mapper}.purecn.panel_of_normals.{ext}" result_files.extend(self._expand_result_files(tpl, log_ext_list)) @@ -981,7 +934,7 @@ def get_result_files(self): self.config.purecn.enrichment_kit_name, self.config.purecn.genome_name, ) - ext_list = ("list", "list.md5", "bed.gz", "bed.gz.md5", "bed.gz.tbi", "bed.gz.tbi.md5") + ext_list = ("list", "bed.gz", "bed.gz.tbi") result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/purecn/log/{}_{}.{{ext}}".format( self.config.purecn.enrichment_kit_name, @@ -995,3 +948,4 @@ def _expand_result_files(self, tpl, ext_list): for mapper in self.w_config.step_config["ngs_mapping"].tools.dna: for ext in ext_list: yield tpl.format(mapper=mapper, ext=ext) + yield tpl.format(mapper=mapper, ext=ext) + ".md5" diff --git a/snappy_pipeline/workflows/panel_of_normals/model.py b/snappy_pipeline/workflows/panel_of_normals/model.py index b7995e32c..802b5e417 100644 --- a/snappy_pipeline/workflows/panel_of_normals/model.py +++ b/snappy_pipeline/workflows/panel_of_normals/model.py @@ -75,19 +75,22 @@ class CnvKit(SnappyModel): access: str = "" """Access bed file (output/cnvkit.access/out/cnvkit.access.bed when create_cnvkit_acces was run)""" - target_avg_size: float | None = None + min_gap_size: int = 5000 + """[access] Minimum gap size between accessible regions""" + + target_avg_size: int | None = None """[target] Average size of split target bins (None: use default value, or use autobin for wgs)""" - split: bool = False + split: bool = True """[target] Split large intervals into smaller ones""" - bp_per_bin: float = 50000 + bp_per_bin: int = 50000 """[autobin] Expected base per bin""" - antitarget_avg_size: float = 0 + antitarget_avg_size: int = 0 """[antitarget] Average size of antitarget bins (0: use default value)""" - min_size: float = 0 + min_size: int = 0 """[antitarget] Min size of antitarget bins (0: use default value)""" min_mapq: int = 0 diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index 5954500e9..c93c981c5 100644 
--- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -15,15 +15,18 @@ __author__ = "Eric Blanc" __email__ = "eric.blanc@bih-charite.de" +args = snakemake.params.get("args", {}) + cmd = r""" cnvkit.py access \ -o {snakemake.output.access} \ - --min-gap-size {snakemake.params.min_gap_size} \ + --min-gap-size {args[min_gap_size]} \ {exclude} \ - {snakemake.params.reference} + {args[reference]} """.format( snakemake=snakemake, - exclude=" ".join([f"--exclude {x}" for x in snakemake.params.exclude]) if snakemake.params.exclude else "", + args=args, + exclude=" ".join([f"--exclude {x}" for x in args["exclude"]]) if "exclude" in args else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index da3b440f0..596626831 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -1,20 +1,35 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py antitarget""" +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper __author__ = "Eric Blanc" __email__ = "eric.blanc@bih-charite.de" -cmd = r""" -cnvkit.py antitarget \ - -o {snakemake.output.region} \ - --avg-size {snakemake.params.avg_size} --min-size {snakemake.params.min_size} \ - {access} \ - {snakemake.input.target} -""".format( - snakemake=snakemake, - access=f"--access {snakemake.params.access}" if snakemake.params.access else "", -) +args = snakemake.params.get("args", {}) + +if snakemake.input.get("target", "") != "": + cmd = r""" + cnvkit.py antitarget \ + -o {snakemake.output.antitarget} \ + --avg-size {args['avg_size']} --min-size {args['min_size']} \ + {access} \ + {snakemake.input.target} + """.format( + snakemake=snakemake, + args=args, + access=f"--access {args['access']}" if "access" in args else "", + ) +else: + cmd = f"touch {snakemake.output.antitarget}" CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml b/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py new file mode 100644 index 000000000..ce913b505 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py autobin (replicating cnvkit batch)""" + +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. 
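+# (The four ".." components climb from snappy_wrappers/wrappers/cnvkit/autobin/
+# up to the repository root, so the checkout's own "snappy_wrappers" package
+# becomes importable even though it is not installed in the wrapper's
+# conda environment.)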
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" + +args = snakemake.params.get("args", {}) + +cmd = r""" +cnvkit.py autobin --method {args[method]} \ + {out_target} {out_antitarget} \ + {access} {target} \ + --bp-per-bin {args[bp_per_bin]} \ + {snakemake.input.bams} \ + > {snakemake.output.result} +""".format( + snakemake=snakemake, + args=args, + out_target=f"--target-output-bed {snakemake.output.target}" if snakemake.output.get("target", "") != "" else "", + out_antitarget=f"--antitarget-output-bed {snakemake.output.antitarget}" if snakemake.output.get("antitarget", "") != "" else "", + access=f"--access {snakemake.input.access}" if snakemake.input.get("access", None) else "", + target=f"--targets {args['target']}" if "target" in args else "", +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py b/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py index a6a8accdf..08c734504 100644 --- a/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/cnvkit_wrapper.py @@ -1,5 +1,7 @@ """Abstract wrapper for cnvkit.py""" +import os +import stat import textwrap from snakemake.shell import shell @@ -10,7 +12,7 @@ class CnvkitWrapper: header = r""" - # Also pipe everything to log file + # Pipe everything to log file if [[ -n "{snakemake.log.log}" ]]; then if [[ "$(set +e; tty; set -e)" != "" ]]; then rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) @@ -21,13 +23,18 @@ class CnvkitWrapper: fi fi + # Compute md5 except when filename ends with .md5 compute_md5() {{ fn=$1 f=$(basename $fn) d=$(dirname $fn) - pushd $d 1> /dev/null 2>&1 - md5sum $f > $f.md5 - popd 1> /dev/null 2>&1 + ext="${{f##*.}}" + if [[ $ext != "md5" ]] + then + pushd $d 1> /dev/null 2>&1 + md5sum $f > $f.md5 + popd 1> /dev/null 2>&1 + fi }} # Write out information about conda installation. @@ -72,14 +79,23 @@ def preamble(self): def run(self) -> None: self.preamble() - with open(self.snakemake.log.sh, "wt") as f: + cmd_path = self.snakemake.log.sh + with open(cmd_path, "wt") as f: print( textwrap.dedent( - "\n".join((CnvkitWrapper.header, self.command, CnvkitWrapper.footer)) + "\n".join( + ( + CnvkitWrapper.header.format(snakemake=self.snakemake), + self.command, + CnvkitWrapper.footer.format(snakemake=self.snakemake), + ) + ) ), file=f, ) + current_permissions = stat.S_IMODE(os.lstat(cmd_path).st_mode) + os.chmod(cmd_path, current_permissions | stat.S_IXUSR) - shell(self.snakemake.log.sh) + shell(cmd_path) shell(CnvkitWrapper.md5_log.format(log=str(self.snakemake.log.log))) diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py index 678a7e348..a71ef8e8e 100644 --- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py @@ -1,20 +1,32 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py coverage""" +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. 
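+# (Inserting at position 0 puts the repository root ahead of site-packages,
+# so this checkout's snappy_wrappers module takes precedence over any
+# previously installed copy.)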
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper __author__ = "Eric Blanc" __email__ = "eric.blanc@bih-charite.de" +args = snakemake.params.get("args", {}) + cmd = r""" -cnvkit.py coverage --processes {snakemake.params.processes} \ +cnvkit.py coverage --processes {snakemake.resources._cores} \ -o {snakemake.output.coverage} \ - --fasta {snakemake.params.reference} - --min-mapq {snakemake.params.min_mapq} {count} \ + --fasta {args[reference]} \ + --min-mapq {args[min_mapq]} {count} \ {snakemake.input.bam} {snakemake.input.intervals} """.format( snakemake=snakemake, - count="--count" if snakemake.params.count else "", + args=args, + count="--count" if "count" in args else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index 4d53e9508..82f71ce41 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -1,32 +1,44 @@ # -*- coding: utf-8 -*- """Wrapper for cnvkit.py reference""" +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper __author__ = "Eric Blanc" __email__ = "eric.blanc@bih-charite.de" +args = snakemake.params.get("args", {}) + cmd = r""" cnvkit.py reference \ - -o {snakemake.output.reference} \ - --fasta {snakemake.params.reference} \ + -o {snakemake.output.panel} \ + --fasta {args[reference]} \ {cluster} {min_cluster_size} \ {sample_sex} {male_reference} {diploid_parx_genome} \ {no_gc} {no_edge} {no_rmask} \ {target} {antitarget} {normals} """.format( snakemake=snakemake, - cluster="--cluster" if snakemake.params.cluster else "", - min_cluster_size=f"--min-cluster-size {snakemake.params.min_cluster_size}" if snakemake.params.cluster and "min_cluster_size" in snakemake.params else "", - no_gc="--no-gc" if snakemake.params.no_gc else "", - no_edge="--no-edge" if snakemake.params.no_edge else "", - no_rmask="--no-rmask" if snakemake.params.no_rmask else "", - sample_sex=f"--sample-sex {snakemake.params.sample_sex}" if "sample_sex" in snakemake.params else "", - male_reference="--male-reference" if snakemake.params.male_reference else "", - diploid_parx_genome=f"--diploid_parx_genome {snakemake.params.diploid_parx_genome}" if "diploid_parx_genome" in snakemake.params else "", - target=f"--target {snakemake.input.target}" if "target" in snakemake.input else "", - antitarget=f"--antitarget {snakemake.input.antitarget}" if "antitarget" in snakemake.input else "", - normals=" ".join(snakemake.input.normals) if "normals" in snakemake.input else "", + args=args, + cluster="--cluster" if "cluster" in args else "", + min_cluster_size=f"--min-cluster-size {args['min_cluster_size']}" if "cluster" in args and "min_cluster_size" in args else "", + no_gc="--no-gc" if "no_gc" in args else "", + no_edge="--no-edge" if "no_edge" in args else "", + no_rmask="--no-rmask" if "no_rmask" in args else "", + sample_sex=f"--sample-sex {args['sample_sex']}" if "sample_sex" in args else 
"", + male_reference="--male-reference" if "male_reference" in args else "", + diploid_parx_genome=f"--diploid_parx_genome {args['diploid_parx_genome']}" if "diploid_parx_genome" in args else "", + target=f"--target {snakemake.input.target}" if snakemake.input.get("target", None) else "", + antitarget=f"--antitarget {snakemake.input.antitarget}" if snakemake.input.get("antitarget", None) else "", + normals=" ".join(snakemake.input.normals) if snakemake.input.get("normals", None) else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index 457718684..fb32e1710 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -16,31 +16,35 @@ __author__ = "Eric Blanc" __email__ = "eric.blanc@bih-charite.de" +args = snakemake.params.get("args", {}) + # WGS: targets are all accessible regions, WES: targets are baits -interval = snakemake.input.access if "access" in snakemake.input else snakemake.params.target +interval = snakemake.input.access if snakemake.input.get("access", None) else args["target"] -if "avg_size" in snakemake.input: +if snakemake.input.get("avg_size", "") != "": pattern = re.compile("^Target:[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)$") with open(snakemake.input.avg_size) as f: for line in f: m = pattern.match(line) if m: - avg_size = float(m.groups()[4]) + avg_size = int(float(m.groups()[4])) break else: - avg_size = snakemake.params.avg_size + avg_size = args["avg_size"] cmd = r""" cnvkit.py target \ -o {snakemake.output.target} \ - {avg_size} {split} \ + {avg_size} {split} {annotate} \ {interval} """.format( snakemake=snakemake, + args=args, interval=interval, avg_size=f"--avg-size {avg_size}", - split=f"--split" if snakemake.params.split else "", + split=f"--split" if "split" in args and args["split"] else "", + annotate=f"--annotate {args['annotate']}" if "annotate" in args else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index c596a734b..46fd5a0fe 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -225,6 +225,7 @@ def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow): "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", ], + "access": "work/bwa.cnvkit/out/cnvkit.access.bed", } actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards) assert actual == expected @@ -282,7 +283,7 @@ def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow } ) expected = { - "references": [ + "normals": [ "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", ], @@ -291,75 +292,75 @@ def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow assert actual == expected -def test_cnvkit_step_part_get_params_access(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_access()""" +def test_cnvkit_step_part_get_args_access(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_access()""" wildcards = Wildcards( fromdict={ "mapper": 
"bwa", } ) - expected = {"reference": "/path/to/ref.fa"} - actual = panel_of_normals_workflow.get_params("cnvkit", "access")(wildcards) + expected = {"reference": "/path/to/ref.fa", "min_gap_size": 5000} + actual = panel_of_normals_workflow.get_args("cnvkit", "access")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_params_autobin(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_autobin()""" +def test_cnvkit_step_part_get_args_autobin(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_autobin()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) - expected = {"method": "wgs"} - actual = panel_of_normals_workflow.get_params("cnvkit", "autobin")(wildcards) + expected = {"method": "wgs", "bp_per_bin": 50000} + actual = panel_of_normals_workflow.get_args("cnvkit", "autobin")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_params_target(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_target()""" +def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_target()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) - expected = {"annotate": "/path/to/annotations.gtf"} - actual = panel_of_normals_workflow.get_params("cnvkit", "target")(wildcards) + expected = {"annotate": "/path/to/annotations.gtf", "split": True, "target": ""} + actual = panel_of_normals_workflow.get_args("cnvkit", "target")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_params_antitarget(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_antitarget()""" +def test_cnvkit_step_part_get_args_antitarget(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_antitarget()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) expected = {"avg_size": 0, "min_size": 0} - actual = panel_of_normals_workflow.get_params("cnvkit", "antitarget")(wildcards) + actual = panel_of_normals_workflow.get_args("cnvkit", "antitarget")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_params_coverage(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_coverage()""" +def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_coverage()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) expected = {"reference": "/path/to/ref.fa", "min_mapq": 0} - actual = panel_of_normals_workflow.get_params("cnvkit", "coverage")(wildcards) + actual = panel_of_normals_workflow.get_args("cnvkit", "coverage")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_params_create_panel(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_params_create_panel()""" +def test_cnvkit_step_part_get_args_create_panel(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_create_panel()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", } ) expected = {"reference": "/path/to/ref.fa", "no_edge": True} - actual = panel_of_normals_workflow.get_params("cnvkit", "create_panel")(wildcards) + actual = panel_of_normals_workflow.get_args("cnvkit", "create_panel")(wildcards) assert actual == expected @@ -504,14 +505,14 @@ def test_access_step_part_get_input_files_run(panel_of_normals_workflow): assert panel_of_normals_workflow.get_input_files("access", "run") is None -def test_access_step_part_get_params_run(panel_of_normals_workflow): - """Tests AccessStepPart._get_params_run()""" +def test_access_step_part_get_args_run(panel_of_normals_workflow): + """Tests 
AccessStepPart._get_args_run()""" expected = { "reference": "/path/to/ref.fa", "exclude": ["/path/to/exclude.bed"], "min_gap_size": 0 } - actual = panel_of_normals_workflow.get_params("access", "run") + actual = panel_of_normals_workflow.get_args("access", "run") assert actual == expected diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py deleted file mode 100644 index e1f4c2b26..000000000 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py +++ /dev/null @@ -1,395 +0,0 @@ -# -*- coding: utf-8 -*- -"""Tests for the panel_of_normals workflow module code""" - -import textwrap - -import pytest -import ruamel.yaml as ruamel_yaml -from snakemake.io import Wildcards - -from snappy_pipeline.workflows.panel_of_normals import PanelOfNormalsWorkflow - -from .common import get_expected_log_files_dict -from .conftest import patch_module_fs - - -@pytest.fixture(scope="module") # otherwise: performance issues -def minimal_config(): - """Return YAML parsing result for (cancer) configuration""" - yaml = ruamel_yaml.YAML() - return yaml.load( - textwrap.dedent( - r""" - static_data_config: - reference: - path: /path/to/ref.fa - cosmic: - path: /path/to/cosmic.vcf.gz - dbsnp: - path: /path/to/dbsnp.vcf.gz - - step_config: - ngs_mapping: - tools: - dna: ['bwa'] - bwa: - path_index: /path/to/bwa/index.fa - - panel_of_normals: - path_ngs_mapping: NGS_MAPPING/ - tools: ['cnvkit'] - cnvkit: - path_target_regions: "" # WGS mode - path_normals_list: "" - - data_sets: - first_batch: - file: sheet.tsv - search_patterns: - - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} - search_paths: ['/path'] - type: matched_cancer - naming_scheme: only_secondary_id - """ - ).lstrip() - ) - - -@pytest.fixture -def panel_of_normals_workflow( - dummy_workflow, - minimal_config, - config_lookup_paths, - work_dir, - config_paths, - cancer_sheet_fake_fs, - aligner_indices_fake_fs, - mocker, -): - """Return PanelOfNormalsWorkflow object pre-configured with germline sheet""" - # Patch out file-system related things in abstract (the crawling link in step is defined there) - patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) - patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) - # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we - # can obtain paths from the function as if we really had a NGSMappingPipelineStep there - dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} - # Construct the workflow object - return PanelOfNormalsWorkflow( - dummy_workflow, - minimal_config, - config_lookup_paths, - config_paths, - work_dir, - ) - - -# Tests for CnvkitStepPart ------------------------------------------------------------------------ - - -def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_target()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "bams": [ - "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", - ], - "bais": [ - "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam.bai", - ], - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) - 
assert actual == expected - - -def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_antitarget()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - actual = panel_of_normals_workflow.get_input_files("cnvkit", "antitarget")(wildcards) - assert actual == {} - - -def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_coverage()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - "normal_library": "P001-N1-DNA1-WGS1", - } - ) - expected = { - "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed", - "antitarget": "work/bwa.cnvkit/out/bwa.cnvkit.antitarget.bed", - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_input_files_create_panel()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "target": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", - ], - "antitarget": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", - ], - "logs": [ - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.log", - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.conda_list.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P001-N1-DNA1-WGS1.coverage.conda_info.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.log", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.conda_list.txt", - "work/bwa.cnvkit/log/bwa.cnvkit.P002-N1-DNA1-WGS1.coverage.conda_info.txt", - ], - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_input_files_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_input_files_report()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "target": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", - ], - "antitarget": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", - "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", - ], - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "report")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_target()""" - expected = { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed", - "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "target") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_antitarget()""" - expected = { - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed", - "antitarget_md5": 
"work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "antitarget") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_coverage()""" - expected = { - "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", - "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", - "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", - "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "coverage") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_create_panel()""" - expected = { - "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", - "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", - "log": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz", - "log_md5": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "create_panel") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_report()""" - expected = { - "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", - "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5", - "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv", - "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "report") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_target()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.target" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "target") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_antitarget()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "antitarget") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_coverage()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_create_panel()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "create_panel") - assert actual == expected - - -def 
test_cnvkit_step_part_get_log_file_report(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_report()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "report") - assert actual == expected - - -def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): - """Tests CvnkitStepPart.get_resource_usage()""" - # Define expected: default defined workflow.abstract - target_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "8G", - "partition": "medium", - } - antitarget_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "8G", - "partition": "medium", - } - coverage_expected_dict = { - "threads": 8, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } - reference_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } - report_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } - - # Evaluate action `target` - for resource, expected in target_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'target'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource)() - assert actual == expected, msg_error - - # Evaluate action `antitarget` - for resource, expected in antitarget_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'antitarget'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource)() - assert actual == expected, msg_error - - # Evaluate action `coverage` - for resource, expected in coverage_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'coverage'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource)() - assert actual == expected, msg_error - - # Evaluate action `create_panel` - for resource, expected in reference_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'create_panel'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource)() - assert actual == expected, msg_error - - # Evaluate action `report` - for resource, expected in report_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'report'." 
- actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource)() - assert actual == expected, msg_error - - -# PanelOfNormalsWorkflow -------------------------------------------------------------------------- - - -def test_panel_of_normals_workflow(panel_of_normals_workflow): - """Test simple functionality of the workflow""" - # Check created sub steps - expected = ["access", "cnvkit", "link_out", "mutect2", "purecn"] - actual = list(sorted(panel_of_normals_workflow.sub_steps.keys())) - assert actual == expected - expected = [] - - # Now for basic cnvkit files (panel of normal only) - tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}" - expected += [ - tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) - ] - tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{substep}.{ext}" - for substep in ("target", "antitarget"): - expected += [ - tpl.format(substep=substep, mapper=mapper, ext=ext) - for ext in ("bed", "bed.md5") - for mapper in ("bwa",) - ] - tpl = "output/{mapper}.cnvkit/report/{mapper}.cnvkit.{substep}.{ext}" - for substep in ("sex", "metrics"): - expected += [ - tpl.format(substep=substep, mapper=mapper, ext=ext) - for ext in ("tsv", "tsv.md5") - for mapper in ("bwa",) - ] - # add log files - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" - for substep in ("target", "antitarget", "panel_of_normals", "report"): - for mapper in ("bwa",): - expected += get_expected_log_files_dict( - base_out=tpl.format(mapper=mapper, substep=substep) - ).values() - # add merged log - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{chksum}" - for mapper in ("bwa",): - for chksum in ("", ".md5"): - expected += [tpl.format(mapper=mapper, chksum=chksum)] - - expected = list(sorted(expected)) - actual = list(sorted(panel_of_normals_workflow.get_result_files())) - assert actual == expected From 4ec562cf4087cea3ec008d9ebecaecae0fc8216d Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 6 Nov 2024 14:41:36 +0100 Subject: [PATCH 31/46] fix: allow python 3.12 syntax in models to co-exist with pre-3.12 syntax in wrappers --- snappy_wrappers/__init__.py | 2 ++ snappy_wrappers/wrappers/cnvkit/environment.yaml | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/snappy_wrappers/__init__.py b/snappy_wrappers/__init__.py index 1c66e6539..ded1b29ed 100644 --- a/snappy_wrappers/__init__.py +++ b/snappy_wrappers/__init__.py @@ -5,4 +5,6 @@ __author__ = """Manuel Holtgrewe""" __email__ = "manuel.holtgrewe@bih-charite.de" +__version__ = "0.2.1" + __all__ = ["__version__"] diff --git a/snappy_wrappers/wrappers/cnvkit/environment.yaml b/snappy_wrappers/wrappers/cnvkit/environment.yaml index 5c478d874..5a1152c0b 100644 --- a/snappy_wrappers/wrappers/cnvkit/environment.yaml +++ b/snappy_wrappers/wrappers/cnvkit/environment.yaml @@ -3,6 +3,6 @@ channels: - bioconda - nodefaults dependencies: - - python=3.12 - - cnvkit==0.9.8 - - htslib=1.21 + - python=3.10 + - cnvkit=0.9.11 + - htslib==1.21 From f4421f2c8ea199dba0bd5edcc11e67edd82ca561 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 6 Nov 2024 16:26:17 +0100 Subject: [PATCH 32/46] fix: fix autobin logic because normals & reference always present --- .../workflows/panel_of_normals/__init__.py | 34 +++++++------------ .../wrappers/cnvkit/target/wrapper.py | 7 ++-- .../test_workflows_panel_of_normals.py | 2 +- 3 files changed, 18 insertions(+), 25 deletions(-) diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py 
b/snappy_pipeline/workflows/panel_of_normals/__init__.py index 07656d4a2..240643323 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -633,27 +633,19 @@ def _get_input_files_autobin(self, wildcards): ), "Trying to estimate average target size for non-WGS samples" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" - bams = [ - ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) - for x in self.normal_libraries - ] - input_files = {"bams": bams} - if self.config.cnvkit.get("access", "") == "": - input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) - return input_files + return { + "bams": [ + ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) + for x in self.normal_libraries + ], + "access": "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards), + } def _get_args_autobin(self, wildcards): assert ( self.libraryType == LibraryType.WGS ), "Trying to estimate average target size for non-WGS samples" - params = {"bp_per_bin": 50000} - if self.name in self.config.tools and self.config.cnvkit: - if self.config.cnvkit.get("access", "") == "": - params["method"] = "wgs" - else: - params["method"] = "amplicon" - params["target"] = self.config.cnvkit.get("access") - return params + return {"method": "wgs", "bp_per_bin": 50000} def _get_output_files_autobin(self): return {"result": "work/{mapper}.cnvkit/out/cnvkit.autobin.txt"} @@ -663,10 +655,10 @@ def _get_input_files_target(self, wildcards): input_files = {} if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) - if self.config.cnvkit.get("target_avg_size", None) is None: - input_files["avg_size"] = "work/{mapper}.cnvkit/out/cnvkit.autobin.txt".format( - **wildcards - ) + if self.config.cnvkit.get("target_avg_size", None) is None: + input_files["avg_size"] = "work/{mapper}.cnvkit/out/cnvkit.autobin.txt".format( + **wildcards + ) return input_files def _get_args_target(self, wildcards): @@ -674,7 +666,7 @@ def _get_args_target(self, wildcards): if self.name in self.config.tools: if self.libraryType == LibraryType.WES: params["target"] = self.config.cnvkit.path_target_regions - if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": + if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") != "": params["target"] = self.config.cnvkit.get("access") if self.w_config.static_data_config.get("features", None): params["annotate"] = self.w_config.static_data_config.features.path diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index fb32e1710..fe08248ff 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -29,9 +29,10 @@ if m: avg_size = int(float(m.groups()[4])) break - -else: +elif "avg_size" in args: avg_size = args["avg_size"] +else: + avg_size = None cmd = r""" cnvkit.py target \ @@ -42,7 +43,7 @@ snakemake=snakemake, args=args, interval=interval, - avg_size=f"--avg-size {avg_size}", + avg_size=f"--avg-size {avg_size}" if avg_size is not None else "", split=f"--split" if "split" in args and args["split"] else "", annotate=f"--annotate {args['annotate']}" if "annotate" in args else "", ) diff --git 
a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py
index 46fd5a0fe..86870f4c8 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py
@@ -323,7 +323,7 @@ def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow):
             "mapper": "bwa",
         }
     )
-    expected = {"annotate": "/path/to/annotations.gtf", "split": True, "target": ""}
+    expected = {"annotate": "/path/to/annotations.gtf", "split": True}
     actual = panel_of_normals_workflow.get_args("cnvkit", "target")(wildcards)
     assert actual == expected
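Note: after this fix, the autobin step for WGS cohorts always receives the normal BAMs
together with the access track, and always runs in WGS mode. The wrapper is then
expected to issue a command along these lines (a sketch assuming the cnvkit 0.9.11
command line; the BAM file names are illustrative):

    cnvkit.py autobin \
        --method wgs \
        --access work/bwa.cnvkit/out/cnvkit.access.bed \
        --bp-per-bin 50000 \
        normal1.bam normal2.bam \
        > work/bwa.cnvkit/out/cnvkit.autobin.txt

The "method" and "bp_per_bin" entries returned by _get_args_autobin map directly onto
the --method and --bp-per-bin flags.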
From 4b390c3a8588a5014c9bc9dd2d8751ace0d12e0c Mon Sep 17 00:00:00 2001
From: Eric Blanc
Date: Wed, 13 Nov 2024 17:10:53 +0100
Subject: [PATCH 33/46] refactor: configuration for tools appearing in multiple
 steps added to the generic models

---
 snappy_pipeline/models/cnvkit.py      | 310 ++++++++-----
 snappy_pipeline/models/library_kit.py |  29 ++
 snappy_pipeline/models/mutect2.py     | 612 ++++++++++++++++++++++++++
 3 files changed, 848 insertions(+), 103 deletions(-)
 create mode 100644 snappy_pipeline/models/library_kit.py
 create mode 100644 snappy_pipeline/models/mutect2.py

diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py
index e80ae634f..f1782e588 100644
--- a/snappy_pipeline/models/cnvkit.py
+++ b/snappy_pipeline/models/cnvkit.py
@@ -1,11 +1,37 @@
 import enum
-from typing import Annotated
+from typing import Self
 
-from pydantic import Field
+from pydantic import model_validator
 
 from snappy_pipeline.models import SnappyModel
 
 
+class SexOrigin(enum.StrEnum):
+    AUTOMATIC = "auto"
+    """Sex determined from the data"""
+    SAMPLESHEET = "samplesheet"
+    """Donor sex obtained from sample sheet"""
+    CONFIG = "config"
+    """Donor sex obtained from the configuration (all donors have the same sex)"""
+
+
+class SexValue(enum.StrEnum):
+    MALE = "male"
+    FEMALE = "female"
+
+
+class Sex(SnappyModel):
+    source: SexOrigin = SexOrigin.AUTOMATIC
+
+    sample_sex: SexValue | None = None
+
+    @model_validator(mode="after")
+    def ensure_valid_sex_value(self):
+        if self.source == SexOrigin.CONFIG and self.sample_sex is None:
+            raise ValueError("No definition of donors' sex from the configuration")
+        return self
+
+
 class SegmentationMethod(enum.StrEnum):
     cbs = "cbs"
     flasso = "flasso"
@@ -16,14 +42,14 @@ class SegmentationMethod(enum.StrEnum):
     none = "none"
 
 
-class CenterMode(enum.StrEnum):
+class CenterMethod(enum.StrEnum):
     mean = "mean"
     median = "median"
     mode = "mode"
     biweight = "biweight"
 
 
-class FilterMode(enum.StrEnum):
+class FilterMethod(enum.StrEnum):
     ampdel = "ampdel"
     cn = "cn"
     ci = "ci"
@@ -36,129 +62,207 @@ class CallingMethod(enum.StrEnum):
     none = ""
 
 
+class Access(SnappyModel):
+    exclude: list[str] = []
+    """Regions accessible to mapping"""
+    min_gap_size: int = 5000
+    """Minimum gap size between accessible sequence regions.
+    Regions separated by less than this distance will be joined together."""
+
+
+class Target(SnappyModel):
+    path_baits: str | None = None
+    """Path to baits file (Agilent Covered), unset for WGS data"""
+    split: bool = True
+    """Split large tiled intervals into smaller, consecutive targets."""
+    avg_size: float = (800/3)
+    """Average size of split target bins (results are approximate)"""
+    short_names: bool = False
+    """Reduce multi-accession bait labels to be short and consistent"""
+
+
+class Antitarget(SnappyModel):
+    avg_size: float = 150000
+    """Average size of split antitarget bins (results are approximate)"""
+    min_size: float | None = None
+    """Minimum size of antitarget bins (smaller regions are dropped). When missing, 1/16 avg size"""
+
+
+class Coverage(SnappyModel):
+    count: bool = False
+    """Get read depths by counting read midpoints within each bin."""
+    min_mapq: int = 0
+    """Minimum mapping quality score (phred scale 0-60) to count a read for coverage depth."""
+
+
+class Fix(SnappyModel):
+    smoothing_window_fraction: float | None = None
+    """Smoothing window fraction for rolling median bias smoothing. Defaults to 1/sqrt(len(data))"""
+
+
+class Segment(SnappyModel):
+    method: SegmentationMethod = SegmentationMethod.cbs
+    """Segmentation method, or 'none' for chromosome arm-level averages as segments"""
+    threshold: float = 0.0001
+    """Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation.
+    For HMM methods, this is the smoothing window size."""
+    drop_outliers: int = 10
+    """Drop outlier bins more than this many multiples of the 95th quantile away from the average within a rolling window. Set to 0 for no outlier filtering."""
+    smooth_cbs: bool = False
+
+    @model_validator(mode="after")
+    def ensure_smooth_for_cbs_only(self) -> Self:
+        if self.smooth_cbs and self.method != SegmentationMethod.cbs:
+            raise ValueError("'smooth_cbs' option can be used only with 'cbs' segmentation method")
+        return self
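+
+# Note: the Segment model above mirrors the "cnvkit.py segment" command line;
+# e.g. method=cbs, threshold=0.0001, drop_outliers=10, smooth_cbs=True would be
+# expected to translate to (a sketch, assuming the cnvkit 0.9.11 CLI):
+#
+#   cnvkit.py segment -m cbs -t 0.0001 --drop-outliers 10 --smooth-cbs sample.cnr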
+
+
+class Call(SnappyModel):
+    method: CallingMethod | None = None
+    """Calling method."""
+    thresholds: list[float] = [-1.1, -0.25, 0.2, 0.7]
+    """Hard thresholds for calling each integer copy number, separated by commas"""
+    center: CenterMethod | None = None
+    """Re-center the log2 ratio values using this estimator of the center or average value. ('median' if no argument given.)"""
+    center_at: float | None = None
+    """Subtract a constant number from all log2 ratios. For "manual" re-centering."""
+    filter: FilterMethod | None = None
+    """Merge segments flagged by the specified filter(s) with the adjacent segment(s)."""
+
+    @model_validator(mode="after")
+    def avoid_center_center_at_conflict(self) -> Self:
+        if self.center is not None and self.center_at is not None:
+            raise ValueError("'call' options 'center' and 'center_at' cannot be used together")
+        return self
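+
+# Note: with the default thresholds above, cnvkit assigns a segment the first
+# integer copy number whose threshold its log2 ratio stays below (a sketch of the
+# convention, assuming a diploid genome):
+#   log2 < -1.1 -> cn 0, < -0.25 -> cn 1, < 0.2 -> cn 2, < 0.7 -> cn 3,
+#   and cn >= 4 above the last threshold.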
+
+
+class Bintest(SnappyModel):
+    alpha: float = 0.005
+    """Significance threshold."""
+    target: bool = False
+    """Test target bins only; ignore off-target bins."""
+
+
+class Plot(SnappyModel):
+    enabled: bool = False
+
+
+class PlotDiagram(Plot):
+    chromosome: str | None = None
+    """Chromosome to display (full genome when missing)"""
+    threshold: float = 0.5
+    """Copy number change threshold to label genes."""
+    min_probes: int = 3
+    """Minimum number of covered probes to label a gene."""
+    no_shift_xy: bool = False
+
+
+class PlotScatter(Plot):
+    path_range_list: str | None = None
+    """File listing the chromosomal ranges to display, as BED, interval list or 'chr:start-end' text"""
+    gene: str | None = None
+    """Name of gene or genes (comma-separated) to display."""
+    width: int = 1000000
+    """Width of margin to show around the selected gene(s)"""
+    antitarget_marker: str = "o"
+    """Plot antitargets using this symbol when plotting in a selected chromosomal region."""
+    by_bin: bool = False
+    """Plot data x-coordinates by bin indices instead of genomic coordinates."""
+    segment_color: str = "darkorange"
+    """Plot segment lines in this color.
+    Value can be any string accepted by matplotlib."""
+    trend: bool = False
+    """Draw a smoothed local trendline on the scatter plot."""
+    y_max: float | None = None
+    """y-axis upper limit."""
+    y_min: float | None = None
+    """y-axis lower limit."""
+    fig_size: tuple[float, float] = (6.4, 4.8)
+    """Width and height of the plot in inches."""
+
+    @model_validator(mode="after")
+    def ensure_range_list_with_gene(self) -> Self:
+        if self.gene is not None and not self.path_range_list:
+            raise ValueError("'gene' option requires a valid range list")
+        return self
+
+
+class Report(SnappyModel):
+    enabled: bool = True
+
+
+class ReportSegmetrics(Report):
+    alpha: float = 0.05
+    """Level to estimate confidence and prediction intervals; use with --ci and --pi."""
+    bootstrap: int = 100
+    """Number of bootstrap iterations to estimate confidence interval; use with --ci."""
+    smooth_bootstrap: bool = False
+    """Apply Gaussian noise to bootstrap samples, a.k.a. smoothed bootstrap, to estimate confidence interval"""
+
+
+class ReportGenemetrics(Report):
+    alpha: float = 0.05
+    """Level to estimate confidence and prediction intervals; use with --ci and --pi."""
+    bootstrap: int = 100
+    """Number of bootstrap iterations to estimate confidence interval; use with --ci."""
+    threshold: float = 0.2
+    """Copy number change threshold to report a gene gain/loss"""
+    min_probes: int = 3
+    """Minimum number of covered probes to report a gain/loss"""
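+
+# Note: ReportSegmetrics/ReportGenemetrics mirror the "cnvkit.py segmetrics" and
+# "cnvkit.py genemetrics" options; e.g. the ReportGenemetrics defaults would be
+# expected to translate to (a sketch, assuming the cnvkit 0.9.11 CLI):
+#
+#   cnvkit.py genemetrics -t 0.2 -m 3 --alpha 0.05 --bootstrap 100 sample.cnr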
+
+
+class ReportName(enum.StrEnum):
+    GENEMETRICS = "genemetrics"
+    SEGMETRICS = "segmetrics"
+
+
+class CnvkitToReference(SnappyModel):
+    # Substep-specific parameters
+    access: Access
+    target: Target
+    antitarget: Antitarget
+
+    coverage: Coverage
+
+    metrics: Report
+    segmetrics: ReportSegmetrics
+    genemetrics: ReportGenemetrics
+
+    # Generic parameters (used in different substeps & must agree)
+    sex: Sex = Sex()
+    """Donors' sex, or how to determine it (used for handling chrX & chrY)"""
+    male_reference: bool = False
+    """Create/use male reference (for shifting chrX & chrY)"""
+    diploid_parx_genome: str | None = None
+    """Considers the given human genome's PAR of chromosome X as autosomal. Example: 'grch38'"""
+
+    cluster: bool = False
+    """Calculate and store summary stats for clustered subsets of the normal samples with similar coverage profiles."""
+    min_cluster_size: int = 4
+    """Minimum cluster size to keep in reference profiles."""
+
+    gc: bool = False
+    """Skip GC correction."""
+    edge: bool | None = None
+    """Skip edge correction.
+    Automatic selection when None (True for WGS & Panel, False for WES)"""
+    rmask: bool = False
+    """Skip RepeatMasker correction."""
+
+    drop_low_coverage: bool = False
+    """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples."""
+
+    @model_validator(mode="after")
+    def ensure_males_for_reference(self):
+        if self.male_reference and self.sex.source == SexOrigin.CONFIG and self.sex.sample_sex == SexValue.FEMALE:
+            raise ValueError("Male reference requested for female cohort")
+        return self
+
+
+class Cnvkit(CnvkitToReference):
+    fix: Fix
+    segment: Segment
+    call: Call
+    bintest: Bintest
+
+    diagram: PlotDiagram
+    scatter: PlotScatter
+
+    min_variant_depth: int = 20
+    """Minimum read depth for a SNV to be displayed in the b-allele frequency plot."""
+    zygocity_freq: float = 0.25
+    """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies."""
diff --git a/snappy_pipeline/models/library_kit.py b/snappy_pipeline/models/library_kit.py
new file mode 100644
index 000000000..bd861aa72
--- /dev/null
+++ b/snappy_pipeline/models/library_kit.py
@@ -0,0 +1,29 @@
+from typing import Annotated
+from pydantic import Field
+
+from snappy_pipeline.models import SnappyModel
+
+
+class LibraryKitEntry(SnappyModel):
+    """
+    Mapping from enrichment kit to target region BED file, for either computing per-target
+    region coverage or selecting targeted exons.
+
+    The following will match both the stock IDT library kit and the ones
+    with spike-ins seen from Yale genomics. The path above would be
+    mapped to the name "default".
+ - name: IDT_xGen_V1_0 + pattern: "xGen Exome Research Panel V1\\.0*" + path: "path/to/targets.bed" + """ + + name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])] + + pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])] + + path: Annotated[str, Field(examples=["path/to/targets.bed"])] + + +class LibraryKit(SnappyModel): + path_target_interval_list_mapping: list[LibraryKitEntry] = [] + """Connects sample-based library kit in sample sheets with corresponding bed files""" diff --git a/snappy_pipeline/models/mutect2.py b/snappy_pipeline/models/mutect2.py new file mode 100644 index 000000000..d0ce7ec4b --- /dev/null +++ b/snappy_pipeline/models/mutect2.py @@ -0,0 +1,612 @@ +from enum import StrEnum + +from snappy_pipeline.models import SnappyModel + + +class Annotation(StrEnum): + AS_BASEQUALITYRANKSUMTEST = 'AS_BaseQualityRankSumTest' + AS_FISHERSTRAND = 'AS_FisherStrand' + AS_INBREEDINGCOEFF = 'AS_InbreedingCoeff' + AS_MAPPINGQUALITYRANKSUMTEST = 'AS_MappingQualityRankSumTest' + AS_QUALBYDEPTH = 'AS_QualByDepth' + AS_RMSMAPPINGQUALITY = 'AS_RMSMappingQuality' + AS_READPOSRANKSUMTEST = 'AS_ReadPosRankSumTest' + AS_STRANDBIASMUTECTANNOTATION = 'AS_StrandBiasMutectAnnotation' + AS_STRANDODDSRATIO = 'AS_StrandOddsRatio' + ALLELEFRACTION = 'AlleleFraction' + ALLELEPSEUDODEPTH = 'AllelePseudoDepth' + ASSEMBLYCOMPLEXITY = 'AssemblyComplexity' + BASEQUALITY = 'BaseQuality' + BASEQUALITYHISTOGRAM = 'BaseQualityHistogram' + BASEQUALITYRANKSUMTEST = 'BaseQualityRankSumTest' + CHROMOSOMECOUNTS = 'ChromosomeCounts' + CLIPPINGRANKSUMTEST = 'ClippingRankSumTest' + COUNTNS = 'CountNs' + COVERAGE = 'Coverage' + CYCLESKIPSTATUS = 'CycleSkipStatus' + DEPTHPERALLELEBYSAMPLE = 'DepthPerAlleleBySample' + DEPTHPERSAMPLEHC = 'DepthPerSampleHC' + EXCESSHET = 'ExcessHet' + FEATURIZEDREADSETS = 'FeaturizedReadSets' + FISHERSTRAND = 'FisherStrand' + FRAGMENTDEPTHPERALLELEBYSAMPLE = 'FragmentDepthPerAlleleBySample' + FRAGMENTLENGTH = 'FragmentLength' + GCCONTENT = 'GcContent' + GENOTYPESUMMARIES = 'GenotypeSummaries' + HAPLOTYPEFILTERINGANNOTATION = 'HaplotypeFilteringAnnotation' + HMERINDELLENGTH = 'HmerIndelLength' + HMERINDELNUC = 'HmerIndelNuc' + HMERMOTIFS = 'HmerMotifs' + INBREEDINGCOEFF = 'InbreedingCoeff' + INDELCLASSIFY = 'IndelClassify' + INDELLENGTH = 'IndelLength' + LIKELIHOODRANKSUMTEST = 'LikelihoodRankSumTest' + MAPPINGQUALITY = 'MappingQuality' + MAPPINGQUALITYRANKSUMTEST = 'MappingQualityRankSumTest' + MAPPINGQUALITYZERO = 'MappingQualityZero' + ORIENTATIONBIASREADCOUNTS = 'OrientationBiasReadCounts' + ORIGINALALIGNMENT = 'OriginalAlignment' + POSSIBLEDENOVO = 'PossibleDeNovo' + QUALBYDEPTH = 'QualByDepth' + RMSMAPPINGQUALITY = 'RMSMappingQuality' + RAWGTCOUNT = 'RawGtCount' + READPOSRANKSUMTEST = 'ReadPosRankSumTest' + READPOSITION = 'ReadPosition' + REFERENCEBASES = 'ReferenceBases' + SAMPLELIST = 'SampleList' + STRANDBIASBYSAMPLE = 'StrandBiasBySample' + STRANDODDSRATIO = 'StrandOddsRatio' + TANDEMREPEAT = 'TandemRepeat' + UNIQUEALTREADCOUNT = 'UniqueAltReadCount' + VARIANTTYPE = 'VariantType' + + + +class AnnotationGroup(StrEnum): + AS_STANDARDANNOTATION = 'AS_StandardAnnotation' + ALLELESPECIFICANNOTATION = 'AlleleSpecificAnnotation' + GENOTYPEANNOTATION = 'GenotypeAnnotation' + INFOFIELDANNOTATION = 'InfoFieldAnnotation' + JUMBOGENOTYPEANNOTATION = 'JumboGenotypeAnnotation' + JUMBOINFOANNOTATION = 'JumboInfoAnnotation' + REDUCIBLEANNOTATION = 'ReducibleAnnotation' + STANDARDANNOTATION = 'StandardAnnotation' + STANDARDFLOWBASEDANNOTATION = 'StandardFlowBasedAnnotation' 
+ STANDARDHCANNOTATION = 'StandardHCAnnotation' + STANDARDMUTECTANNOTATION = 'StandardMutectAnnotation' + VARIANTANNOTATION = 'VariantAnnotation' + + + +class AnnotationExclude(StrEnum): + AS_STRANDBIASMUTECTANNOTATION = 'AS_StrandBiasMutectAnnotation' + BASEQUALITY = 'BaseQuality' + COVERAGE = 'Coverage' + DEPTHPERALLELEBYSAMPLE = 'DepthPerAlleleBySample' + DEPTHPERSAMPLEHC = 'DepthPerSampleHC' + FRAGMENTDEPTHPERALLELEBYSAMPLE = 'FragmentDepthPerAlleleBySample' + FRAGMENTLENGTH = 'FragmentLength' + MAPPINGQUALITY = 'MappingQuality' + ORIENTATIONBIASREADCOUNTS = 'OrientationBiasReadCounts' + READPOSITION = 'ReadPosition' + STRANDBIASBYSAMPLE = 'StrandBiasBySample' + TANDEMREPEAT = 'TandemRepeat' + + + +class DisableReadFilter(StrEnum): + GOODCIGARREADFILTER = 'GoodCigarReadFilter' + MAPPEDREADFILTER = 'MappedReadFilter' + MAPPINGQUALITYAVAILABLEREADFILTER = 'MappingQualityAvailableReadFilter' + MAPPINGQUALITYNOTZEROREADFILTER = 'MappingQualityNotZeroReadFilter' + MAPPINGQUALITYREADFILTER = 'MappingQualityReadFilter' + NONCHIMERICORIGINALALIGNMENTREADFILTER = 'NonChimericOriginalAlignmentReadFilter' + NONZEROREFERENCELENGTHALIGNMENTREADFILTER = 'NonZeroReferenceLengthAlignmentReadFilter' + NOTDUPLICATEREADFILTER = 'NotDuplicateReadFilter' + NOTSECONDARYALIGNMENTREADFILTER = 'NotSecondaryAlignmentReadFilter' + PASSESVENDORQUALITYCHECKREADFILTER = 'PassesVendorQualityCheckReadFilter' + READLENGTHREADFILTER = 'ReadLengthReadFilter' + WELLFORMEDREADFILTER = 'WellformedReadFilter' + + + +class IntervalMergingRule(StrEnum): + ALL = 'ALL' + OVERLAPPING_ONLY = 'OVERLAPPING_ONLY' + + + +class IntervalSetRule(StrEnum): + INTERSECTION = 'INTERSECTION' + UNION = 'UNION' + + + +class ReadFilter(StrEnum): + ALIGNMENTAGREESWITHHEADERREADFILTER = 'AlignmentAgreesWithHeaderReadFilter' + ALLOWALLREADSREADFILTER = 'AllowAllReadsReadFilter' + AMBIGUOUSBASEREADFILTER = 'AmbiguousBaseReadFilter' + CIGARCONTAINSNONOPERATOR = 'CigarContainsNoNOperator' + EXCESSIVEENDCLIPPEDREADFILTER = 'ExcessiveEndClippedReadFilter' + FIRSTOFPAIRREADFILTER = 'FirstOfPairReadFilter' + FLOWBASEDTPATTRIBUTESYMETRICREADFILTER = 'FlowBasedTPAttributeSymetricReadFilter' + FLOWBASEDTPATTRIBUTEVALIDREADFILTER = 'FlowBasedTPAttributeValidReadFilter' + FRAGMENTLENGTHREADFILTER = 'FragmentLengthReadFilter' + GOODCIGARREADFILTER = 'GoodCigarReadFilter' + HASREADGROUPREADFILTER = 'HasReadGroupReadFilter' + HMERQUALITYSYMETRICREADFILTER = 'HmerQualitySymetricReadFilter' + INTERVALOVERLAPREADFILTER = 'IntervalOverlapReadFilter' + JEXLEXPRESSIONREADTAGVALUEFILTER = 'JexlExpressionReadTagValueFilter' + LIBRARYREADFILTER = 'LibraryReadFilter' + MAPPEDREADFILTER = 'MappedReadFilter' + MAPPINGQUALITYAVAILABLEREADFILTER = 'MappingQualityAvailableReadFilter' + MAPPINGQUALITYNOTZEROREADFILTER = 'MappingQualityNotZeroReadFilter' + MAPPINGQUALITYREADFILTER = 'MappingQualityReadFilter' + MATCHINGBASESANDQUALSREADFILTER = 'MatchingBasesAndQualsReadFilter' + MATEDIFFERENTSTRANDREADFILTER = 'MateDifferentStrandReadFilter' + MATEDISTANTREADFILTER = 'MateDistantReadFilter' + MATEONSAMECONTIGORNOMAPPEDMATEREADFILTER = 'MateOnSameContigOrNoMappedMateReadFilter' + MATEUNMAPPEDANDUNMAPPEDREADFILTER = 'MateUnmappedAndUnmappedReadFilter' + METRICSREADFILTER = 'MetricsReadFilter' + NONCHIMERICORIGINALALIGNMENTREADFILTER = 'NonChimericOriginalAlignmentReadFilter' + NONZEROFRAGMENTLENGTHREADFILTER = 'NonZeroFragmentLengthReadFilter' + NONZEROREFERENCELENGTHALIGNMENTREADFILTER = 'NonZeroReferenceLengthAlignmentReadFilter' + NOTDUPLICATEREADFILTER = 
'NotDuplicateReadFilter' + NOTOPTICALDUPLICATEREADFILTER = 'NotOpticalDuplicateReadFilter' + NOTPROPERLYPAIREDREADFILTER = 'NotProperlyPairedReadFilter' + NOTSECONDARYALIGNMENTREADFILTER = 'NotSecondaryAlignmentReadFilter' + NOTSUPPLEMENTARYALIGNMENTREADFILTER = 'NotSupplementaryAlignmentReadFilter' + OVERCLIPPEDREADFILTER = 'OverclippedReadFilter' + PAIREDREADFILTER = 'PairedReadFilter' + PASSESVENDORQUALITYCHECKREADFILTER = 'PassesVendorQualityCheckReadFilter' + PLATFORMREADFILTER = 'PlatformReadFilter' + PLATFORMUNITREADFILTER = 'PlatformUnitReadFilter' + PRIMARYLINEREADFILTER = 'PrimaryLineReadFilter' + PROPERLYPAIREDREADFILTER = 'ProperlyPairedReadFilter' + READGROUPBLACKLISTREADFILTER = 'ReadGroupBlackListReadFilter' + READGROUPHASFLOWORDERREADFILTER = 'ReadGroupHasFlowOrderReadFilter' + READGROUPREADFILTER = 'ReadGroupReadFilter' + READLENGTHEQUALSCIGARLENGTHREADFILTER = 'ReadLengthEqualsCigarLengthReadFilter' + READLENGTHREADFILTER = 'ReadLengthReadFilter' + READNAMEREADFILTER = 'ReadNameReadFilter' + READSTRANDFILTER = 'ReadStrandFilter' + READTAGVALUEFILTER = 'ReadTagValueFilter' + SAMPLEREADFILTER = 'SampleReadFilter' + SECONDOFPAIRREADFILTER = 'SecondOfPairReadFilter' + SEQISSTOREDREADFILTER = 'SeqIsStoredReadFilter' + SOFTCLIPPEDREADFILTER = 'SoftClippedReadFilter' + VALIDALIGNMENTENDREADFILTER = 'ValidAlignmentEndReadFilter' + VALIDALIGNMENTSTARTREADFILTER = 'ValidAlignmentStartReadFilter' + WELLFORMEDFLOWBASEDREADFILTER = 'WellformedFlowBasedReadFilter' + WELLFORMEDREADFILTER = 'WellformedReadFilter' + + + +class ValidationStringency(StrEnum): + LENIENT = 'LENIENT' + SILENT = 'SILENT' + STRICT = 'STRICT' + + + +class LogLevel(StrEnum): + DEBUG = 'DEBUG' + ERROR = 'ERROR' + INFO = 'INFO' + WARNING = 'WARNING' + + + +class WriterType(StrEnum): + ALL_POSSIBLE_HAPLOTYPES = 'ALL_POSSIBLE_HAPLOTYPES' + CALLED_HAPLOTYPES = 'CALLED_HAPLOTYPES' + CALLED_HAPLOTYPES_NO_READS = 'CALLED_HAPLOTYPES_NO_READS' + NO_HAPLOTYPES = 'NO_HAPLOTYPES' + + + +class ReferenceConfidenceMode(StrEnum): + BP_RESOLUTION = 'BP_RESOLUTION' + GVCF = 'GVCF' + NONE = 'NONE' + + + +class FlowMode(StrEnum): + ADVANCED = 'ADVANCED' + NONE = 'NONE' + STANDARD = 'STANDARD' + + + +class Implementation(StrEnum): + FLOWBASED = 'FlowBased' + FLOWBASEDHMM = 'FlowBasedHMM' + PAIRHMM = 'PairHMM' + + + +class PairHMMImplementation(StrEnum): + AVX_LOGLESS_CACHING = 'AVX_LOGLESS_CACHING' + AVX_LOGLESS_CACHING_OMP = 'AVX_LOGLESS_CACHING_OMP' + EXACT = 'EXACT' + FASTEST_AVAILABLE = 'FASTEST_AVAILABLE' + LOGLESS_CACHING = 'LOGLESS_CACHING' + ORIGINAL = 'ORIGINAL' + + + +class PCRErrorModel(StrEnum): + AGGRESSIVE = 'AGGRESSIVE' + CONSERVATIVE = 'CONSERVATIVE' + HOSTILE = 'HOSTILE' + NONE = 'NONE' + + + +class SmithWatermanImplementation(StrEnum): + AVX_ENABLED = 'AVX_ENABLED' + FASTEST_AVAILABLE = 'FASTEST_AVAILABLE' + JAVA = 'JAVA' + + + +class Mutect2(SnappyModel): + # Required arguments + # input: bams tumor + normal + # output: raw vcf + # reference: fasta + + # Arguments actually used + + genotype_germline_sites: bool = False + """Call all apparent germline site even though they will ultimately be filtered""" + # germline_resource: FeatureInput | None = None # No options for class FeatureInput + germline_resource: str | None = None # No options for class FeatureInput + """Population vcf of germline sequencing containing allele fractions""" + + # Arguments that must be set by derived classes (pon & calling) + + # Panel of normals arguments + + + # Calling-specific arguments + + # panel_of_normals: str | None = None # Was 
class FeatureInput + # """VCF file of sites observed in normal""" + # genotype_pon_sites: bool = False + # """Call sites in the PoN even though they will ultimately be filtered""" + # mitochondria_mode: bool = False + # """Mitochondria mode sets emission and initial LODs to 0""" + # add_output_sam_program_record: bool = True + # """If true, adds a PG tag to created SAM/BAM/CRAM files""" + # assembly_region_out: str | None = None + # """Output the assembly region to this IGV formatted file""" + # bam_writer_type: WriterType = WriterType.CALLED_HAPLOTYPES + # """Which haplotypes should be written to the BAM""" + # enable_all_annotations: bool = False + # """Use all possible annotations (not for the faint of heart)""" + # alleles: str | None = None # Was class FeatureInput + # """The set of alleles to force-call regardless of evidence""" + # bam_output: bool = False # Was class str + # """Write assembled haplotypes""" + # pair_hmm_results_file: bool = False # Was class GATKPath + # """Write exact pairHMM inputs/outputs to for debugging purposes""" + + + # Optional arguments + + add_output_vcf_command_line: bool = True + """If true, adds a command line header line to created VCF files""" + af_of_alleles_not_in_resource: float = -1.0 + """Population allele fraction assigned to alleles not found in germline resource. Please see docs/mutect/mutect2.pdf fora derivation of the default value""" + annotation: list[Annotation] = [] + """One or more specific annotations to add to variant calls""" + annotation_group: list[AnnotationGroup] = [] + """One or more groups of annotations to apply to variant calls""" + annotations_to_exclude: list[AnnotationExclude] = [] + """One or more specific annotations to exclude from variant calls""" + arguments_file: str | None = None # was class File + """read one or more arguments files and add them to the command line""" + assembly_region_padding: int = 100 + """Number of additional bases of context to include around each assembly region""" + base_quality_score_threshold: int = 18 + """Base qualities below this threshold will be reduced to the minimum (6)""" + callable_depth: int = 10 + """Minimum depth to be considered callable for Mutect stats. Does not affect genotyping""" + disable_bam_index_caching: bool = False + """If true, don't cache bam indexes, this will reduce memory requirements but may harm performance if many intervals are specified. Caching is automatically disabled if there are no intervals specified""" + disable_read_filter: list[DisableReadFilter] = [] + """Read filters to be disabled before analysis""" + dont_use_dragstr_pair_hmm_scores: bool = False + """disable DRAGstr pair-hmm score even when dragstr-params-path was provided""" + downsampling_stride: int = 1 + """Downsample a pool of reads starting within a range of one or more bases""" + dragstr_het_hom_ratio: int = 2 + """het to hom prior ratio use with DRAGstr on""" + enable_dynamic_read_disqualification_for_genotyping: bool = False + """Will enable less strict read disqualification low base quality reads""" + exclude_intervals: list[str] = [] + """One or more genomic intervals to exclude from processing""" + f1r2_max_depth: int = 200 + """sites with depth higher than this value will be grouped""" + f1r2_median_mq: int = 50 + """skip sites with median mapping quality below this value""" + f1r2_min_bq: int = 20 + """exclude bases below this quality from pileup""" + flow_order_for_annotations: list[str] = [] + """flow order used for this annotations. 
[readGroup:]flowOrder""" + founder_id: list[str] = [] + """Samples representing the population 'founders'""" + gatk_config_file: str | None = None + """A configuration file to use with the GATK""" + ignore_itr_artifacts: bool = False + """Turn off read transformer that clips artifacts associated with end repair insertions near inverted tandem repeats""" + initial_tumor_lod: float = 2.0 + """Log 10 odds threshold to consider pileup active""" + interval_exclusion_padding: int = 0 + """Amount of padding (in bp) to add to each interval you are excluding""" + interval_merging_rule: IntervalMergingRule = IntervalMergingRule.ALL + """Interval merging rule for abutting intervals""" + interval_padding: int = 0 + """Amount of padding (in bp) to add to each interval you are including""" + interval_set_rule: IntervalSetRule = IntervalSetRule.UNION + """Set merging approach to use for combining interval inputs""" + intervals: list[str] = [] + """One or more genomic intervals over which to operate""" + lenient: bool = False + """Lenient processing of VCF files""" + max_assembly_region_size: int = 300 + """Maximum size of an assembly region""" + max_population_af: float = 0.01 + """Maximum population allele frequency in tumor-only mode""" + max_reads_per_alignment_start: int = 50 + """Maximum number of reads to retain per alignment start position. Reads above this threshold will be downsampled. Set to 0 to disable""" + max_variants_per_shard: int = 0 + """If non-zero, partitions VCF output into shards, each containing up to the given number of records""" + min_assembly_region_size: int = 50 + """Minimum size of an assembly region""" + min_base_quality_score: int = 10 + """Minimum base quality required to consider a base for calling""" + native_pair_hmm_use_double_precision: bool = False + """use double precision in the native pairHmm. This is slower but matches the java implementation better""" + normal_lod: float = 2.2 + """Log 10 odds threshold for calling normal variant non-germline""" + pcr_indel_qual: int = 40 + """Phred-scaled PCR indel qual for overlapping fragments""" + pcr_snv_qual: int = 40 + """Phred-scaled PCR SNV qual for overlapping fragments""" + read_filter: list[ReadFilter] = [] + """Read filters to be applied before analysis""" + read_validation_stringency: ValidationStringency = ValidationStringency.SILENT + """Validation stringency for all SAM/BAM/CRAM/SRA files read by this program. 
The default stringency value SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded""" + sites_only_vcf_output: bool = False + """If true, don't emit genotype fields when writing vcf file output""" + tumor_lod_to_emit: float = 3.0 + """Log 10 odds threshold to emit variant to VCF""" + use_jdk_deflater: bool = False + """Whether to use the JdkDeflater (as opposed to IntelDeflater)""" + use_jdk_inflater: bool = False + """Whether to use the JdkInflater (as opposed to IntelInflater)""" + verbosity: LogLevel = LogLevel.INFO + """Control verbosity of logging""" + QUIET: bool = False + """Whether to suppress job-summary info on System.err""" + + # Advanced arguments + + active_probability_threshold: float = 0.002 + """Minimum probability for a locus to be considered active""" + adaptive_pruning_initial_error_rate: float = 0.001 + """Initial base error rate estimate for adaptive pruning""" + allele_informative_reads_overlap_margin: int = 2 + """Likelihood and read-based annotations will only take into consideration reads that overlap the variant or any base no further than this distance expressed in base pairs""" + allow_non_unique_kmers_in_ref: bool = False + """Allow graphs that have non-unique kmers in the reference""" + debug_assembly: bool = False + """Print out verbose debug information about each assembly region""" + disable_adaptive_pruning: bool = False + """Disable the adaptive algorithm for pruning paths in the graph""" + disable_cap_base_qualities_to_map_quality: bool = False + """If false this disables capping of base qualities in the HMM to the mapping quality of the read""" + disable_symmetric_hmm_normalizing: bool = False + """Toggle to revive legacy behavior of asymmetrically normalizing the arguments to the reference haplotype""" + disable_tool_default_annotations: bool = False + """Disable all tool default annotations""" + disable_tool_default_read_filters: bool = False + """Disable all tool default read filters (WARNING: many tools will not function correctly without their default read filters on)""" + dont_increase_kmer_sizes_for_cycles: bool = False + """Disable iterating over kmer sizes when graph cycles are detected""" + dragstr_params_path: str | None = None # Was class GATKPath + """location of the DRAGstr model parameters for STR error correction used in the Pair HMM. 
When provided, it overrides other PCR error correcting mechanisms""" + emit_ref_confidence: ReferenceConfidenceMode = ReferenceConfidenceMode.NONE + """Mode for emitting reference confidence scores (For Mutect2, this is a BETA feature)""" + expected_mismatch_rate_for_read_disqualification: float = 0.02 + """Error rate used to set expectation for post HMM read disqualification based on mismatches""" + flow_assembly_collapse_partial_mode: bool = False + """Collapse long flow-based hmers only up to difference in reference""" + flow_disallow_probs_larger_than_call: bool = False + """Cap probabilities of error to 1 relative to base call""" + flow_fill_empty_bins_value: float = 0.001 + """Value to fill the zeros of the matrix with""" + flow_filter_alleles: bool = False + """pre-filter alleles before genotyping""" + flow_filter_alleles_qual_threshold: float = 30.0 + """Threshold for prefiltering alleles on quality""" + flow_filter_alleles_sor_threshold: float = 3.0 + """Threshold for prefiltering alleles on SOR""" + flow_filter_lone_alleles: bool = False + """Remove also lone alleles during allele filtering""" + flow_lump_probs: bool = False + """Should all probabilities of insertion or deletion in the flow be combined together""" + flow_matrix_mods: str | None = None + """Modifications instructions to the read flow matrix. Format is src,dst{,src,dst}+. Example: 10,12,11,12 - these instructions will copy element 10 into 11 and 12""" + flow_mode: FlowMode = FlowMode.NONE + """Single argument for enabling the bulk of Flow Based features. NOTE: THIS WILL OVERWRITE PROVIDED ARGUMENT CHECK TOOL INFO TO SEE WHICH ARGUMENTS ARE SET)""" + flow_probability_scaling_factor: int = 10 + """probability scaling factor for (phred=10) for probability quantization""" + flow_probability_threshold: float = 0.003 + """Lowest probability ratio to be used as an option""" + flow_quantization_bins: int = 121 + """Number of bins for probability quantization""" + flow_remove_non_single_base_pair_indels: bool = False + """Should the probabilities of more then 1 indel be used""" + flow_remove_one_zero_probs: bool = False + """Remove probabilities of basecall of zero from non-zero genome""" + flow_report_insertion_or_deletion: bool = False + """Report either insertion or deletion, probability, not both""" + flow_retain_max_n_probs_base_format: bool = False + """Keep only hmer/2 probabilities (like in base format)""" + flow_symmetric_indel_probs: bool = False + """Should indel probabilities be symmetric in flow""" + flow_use_t0_tag: bool = False + """Use t0 tag if exists in the read to create flow matrix""" + force_active: bool = False + """If provided, all regions will be marked as active""" + force_call_filtered_alleles: bool = False + """Force-call filtered alleles included in the resource specified by --alleles""" + graph_output: bool = False # Was class str + """Write debug assembly graph information to this file""" + gvcf_lod_band: list[float] = [-2.5, -2.0, -1.5, -1.0, -0.5, 0.0, 0.5, 1.0] + """Exclusive upper bounds for reference confidence LOD bands (must be specified in increasing order)""" + independent_mates: bool = False + """Allow paired reads to independently support different haplotypes. 
Useful for validations with ill-designed synthetic data""" + keep_boundary_flows: bool = False + """prevent spreading of boundary flows""" + kmer_size: list[int] = [10, 25] + """Kmer size to use in the read threading assembler""" + likelihood_calculation_engine: Implementation = Implementation.PAIRHMM + """What likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes""" + linked_de_bruijn_graph: bool = False + """If enabled, the Assembly Engine will construct a Linked De Bruijn graph to recover better haplotypes""" + max_mnp_distance: int = 1 + """Two or more phased substitutions separated by this distance or less are merged into MNPs""" + max_num_haplotypes_in_population: int = 128 + """Maximum number of haplotypes to consider for your population""" + max_prob_propagation_distance: int = 50 + """Upper limit on how many bases away probability mass can be moved around when calculating the boundaries between active and inactive assembly regions""" + max_suspicious_reads_per_alignment_start: int = 0 + """Maximum number of suspicious reads (mediocre mapping quality or too many substitutions) allowed in a downsampling stride. Set to 0 to disable""" + max_unpruned_variants: int = 100 + """Maximum number of variants in graph the adaptive pruner will allow""" + min_dangling_branch_length: int = 4 + """Minimum length of a dangling branch to attempt recovery""" + min_pruning: int = 2 + """Minimum support to not prune paths in the graph""" + minimum_allele_fraction: float = 0.0 + """Lower bound of variant allele fractions to consider when calculating variant LOD""" + num_pruning_samples: int = 1 + """Number of samples that must pass the minPruning threshold""" + pair_hmm_gap_continuation_penalty: int = 10 + """Flat gap continuation penalty for use in the Pair HMM""" + pair_hmm_implementation: PairHMMImplementation = PairHMMImplementation.FASTEST_AVAILABLE + """The PairHMM implementation to use for genotype likelihood calculations""" + pcr_indel_model: PCRErrorModel = PCRErrorModel.CONSERVATIVE + """The PCR indel model to use""" + pedigree: str | None = None # Was class GATKPath + """Pedigree file for determining the population 'founders'""" + phred_scaled_global_read_mismapping_rate: int = 45 + """The global assumed mismapping rate for reads""" + pileup_detection: bool = False + """If enabled, the variant caller will create pileup-based haplotypes in addition to the assembly-based haplotype generation""" + pruning_lod_threshold: float = 2.302585092994046 + """Ln likelihood ratio threshold for adaptive pruning algorithm""" + pruning_seeding_lod_threshold: float = 9.210340371976184 + """Ln likelihood ratio threshold for seeding subgraph of good variation in adaptive pruning algorithm""" + recover_all_dangling_branches: bool = False + """Recover all dangling branches""" + reference_model_deletion_quality: int = 30 + """The quality of deletion in the reference model""" + smith_waterman: SmithWatermanImplementation = SmithWatermanImplementation.JAVA + """Which Smith-Waterman implementation to use, generally FASTEST_AVAILABLE is the right choice""" + smith_waterman_dangling_end_gap_extend_penalty: int = -6 + """Smith-Waterman gap-extend penalty for dangling-end recovery""" + smith_waterman_dangling_end_gap_open_penalty: int = -110 + """Smith-Waterman gap-open penalty for dangling-end recovery""" + smith_waterman_dangling_end_match_value: int = 25 + """Smith-Waterman match value for dangling-end recovery""" + smith_waterman_dangling_end_mismatch_penalty: int = -50 + 
"""Smith-Waterman mismatch penalty for dangling-end recovery""" + smith_waterman_haplotype_to_reference_gap_extend_penalty: int = -11 + """Smith-Waterman gap-extend penalty for haplotype-to-reference alignment""" + smith_waterman_haplotype_to_reference_gap_open_penalty: int = -260 + """Smith-Waterman gap-open penalty for haplotype-to-reference alignment""" + smith_waterman_haplotype_to_reference_match_value: int = 200 + """Smith-Waterman match value for haplotype-to-reference alignment""" + smith_waterman_haplotype_to_reference_mismatch_penalty: int = -150 + """Smith-Waterman mismatch penalty for haplotype-to-reference alignment""" + smith_waterman_read_to_haplotype_gap_extend_penalty: int = -5 + """Smith-Waterman gap-extend penalty for read-to-haplotype alignment""" + smith_waterman_read_to_haplotype_gap_open_penalty: int = -30 + """Smith-Waterman gap-open penalty for read-to-haplotype alignment""" + smith_waterman_read_to_haplotype_match_value: int = 10 + """Smith-Waterman match value for read-to-haplotype alignment""" + smith_waterman_read_to_haplotype_mismatch_penalty: int = -15 + """Smith-Waterman mismatch penalty for read-to-haplotype alignment""" + soft_clip_low_quality_ends: bool = False + """If enabled will preserve low-quality read ends as softclips (used for DRAGEN-GATK BQD genotyper model)""" + + maximum_mapping_quality: int | None = None + """Maximum mapping quality to keep (inclusive)""" + minimum_mapping_quality: int = 20 + """Minimum mapping quality to keep (inclusive)""" + max_read_length: int = 2147483647 + """Keep only reads with length at most equal to the specified value""" + min_read_length: int = 30 + """Keep only reads with length at least equal to the specified value""" + + # Arguments omitted + + # f1r2_tar_gz: File | None = None # No options for class File + # """If specified, collect F1R2 counts and output files into this tar.gz file""" + # gcs_max_retries: int = 20 + # """If the GCS bucket channel errors out, how many times it will attempt to re-initiate the connection""" + # gcs_project_for_requester_pays: str = "." + # """Project to bill when accessing "requester pays" buckets. If unset, these buckets cannot be accessed. User must have storage.buckets.get permission on the bucket being accessed""" + # mutect3_alt_downsample: int = 20 + # """Downsample alt reads to this count for Mutect3 training datasets""" + # mutect3_dataset: File | None = None # No options for class File + # """Destination for Mutect3 data collection""" + # mutect3_non_artifact_ratio: int = 20 + # """Number of non-artifact data per artifact datum in Mutect3 training""" + # mutect3_ref_downsample: int = 10 + # """Downsample ref reads to this count when generating a Mutect3 dataset""" + # mutect3_training_mode: bool = False + # """Collect Mutect3 data for learning""" + # mutect3_training_truth: FeatureInput | None = None # No options for class FeatureInput + # """VCF file of known variants for labeling Mutect3 training data""" + # native_pair_hmm_threads: int = 4 + # """How many threads should a native pairHMM implementation use""" + # read_index: list[GATKPath] = [] # No options for class GATKPath + # """Indices to use for the read inputs. If specified, an index must be provided for every read input and in the same order as the read inputs. 
If this argument is not specified, the path to the index for each input will be inferred automatically""" + # sequence_dictionary: GATKPath | None = None # No options for class GATKPath + # """Use the given sequence dictionary as the master/canonical sequence dictionary. Must be a .dict file""" + # tmp_dir: GATKPath | None = None # No options for class GATKPath + # """Temp directory to use""" + # cloud_index_prefetch_buffer: int = -1 + # """Size of the cloud-only prefetch buffer (in MB; 0 to disable). Defaults to cloudPrefetchBuffer if unset""" + # cloud_prefetch_buffer: int = 40 + # """Size of the cloud-only prefetch buffer (in MB; 0 to disable)""" + # create_output_bam_index: bool = True + # """If true, create a BAM/CRAM index when writing a coordinate-sorted BAM/CRAM file""" + # create_output_bam_md5: bool = False + # """If true, create a MD5 digest for any BAM/SAM/CRAM file created""" + # create_output_variant_index: bool = True + # """If true, create a VCF index when writing a coordinate-sorted VCF file""" + # create_output_variant_md5: bool = False + # """If true, create a a MD5 digest any VCF file created""" + # disable_sequence_dictionary_validation: bool = False + # """If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!""" + # help: bool = False + # """display the help message""" + # seconds_between_progress_updates: float = 10.0 + # """Output traversal statistics every time this many seconds elapse""" + # version: bool = False + # """display the version number for this tool""" + # showHidden: bool = False + # """display hidden arguments""" + # normal_sample: list[str] = [] + # """BAM sample name of normal(s), if any. May be URL-encoded as output by GetSampleName with -encode argument""" + # tumor_sample: str | None = None + # """BAM sample name of tumor. 
May be URL-encoded as output by GetSampleName with -encode argument""" + From a995721ccaf46a409fca2b7b0aca044c39b21a38 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 13 Nov 2024 17:36:00 +0100 Subject: [PATCH 34/46] style: making ruff happy --- snappy_pipeline/models/cnvkit.py | 8 +- snappy_pipeline/models/mutect2.py | 395 ++++++++++++++---------------- 2 files changed, 192 insertions(+), 211 deletions(-) diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py index f1782e588..402e567a6 100644 --- a/snappy_pipeline/models/cnvkit.py +++ b/snappy_pipeline/models/cnvkit.py @@ -74,7 +74,7 @@ class Target(SnappyModel): """Path to baits file (Agilent Covered), unset for WGS data""" split: bool = True """Split large tiled intervals into smaller, consecutive targets.""" - avg_size: float = (800/3) + avg_size: float = 800 / 3 """Average size of split target bins (results are approximate)""" short_names: bool = False """Reduce multi-accession bait labels to be short and consistent""" @@ -248,7 +248,11 @@ class CnvkitToReference(SnappyModel): @model_validator(mode="after") def ensure_males_for_reference(self): - if self.male_reference and self.sex.source == SexOrigin.CONFIG and self.sex.sample_sex == SexValue.FEMALE: + if ( + self.male_reference + and self.sex.source == SexOrigin.CONFIG + and self.sex.sample_sex == SexValue.FEMALE + ): raise ValueError("Male reference requested for female cohort") return self diff --git a/snappy_pipeline/models/mutect2.py b/snappy_pipeline/models/mutect2.py index d0ce7ec4b..5df87dcad 100644 --- a/snappy_pipeline/models/mutect2.py +++ b/snappy_pipeline/models/mutect2.py @@ -4,251 +4,235 @@ class Annotation(StrEnum): - AS_BASEQUALITYRANKSUMTEST = 'AS_BaseQualityRankSumTest' - AS_FISHERSTRAND = 'AS_FisherStrand' - AS_INBREEDINGCOEFF = 'AS_InbreedingCoeff' - AS_MAPPINGQUALITYRANKSUMTEST = 'AS_MappingQualityRankSumTest' - AS_QUALBYDEPTH = 'AS_QualByDepth' - AS_RMSMAPPINGQUALITY = 'AS_RMSMappingQuality' - AS_READPOSRANKSUMTEST = 'AS_ReadPosRankSumTest' - AS_STRANDBIASMUTECTANNOTATION = 'AS_StrandBiasMutectAnnotation' - AS_STRANDODDSRATIO = 'AS_StrandOddsRatio' - ALLELEFRACTION = 'AlleleFraction' - ALLELEPSEUDODEPTH = 'AllelePseudoDepth' - ASSEMBLYCOMPLEXITY = 'AssemblyComplexity' - BASEQUALITY = 'BaseQuality' - BASEQUALITYHISTOGRAM = 'BaseQualityHistogram' - BASEQUALITYRANKSUMTEST = 'BaseQualityRankSumTest' - CHROMOSOMECOUNTS = 'ChromosomeCounts' - CLIPPINGRANKSUMTEST = 'ClippingRankSumTest' - COUNTNS = 'CountNs' - COVERAGE = 'Coverage' - CYCLESKIPSTATUS = 'CycleSkipStatus' - DEPTHPERALLELEBYSAMPLE = 'DepthPerAlleleBySample' - DEPTHPERSAMPLEHC = 'DepthPerSampleHC' - EXCESSHET = 'ExcessHet' - FEATURIZEDREADSETS = 'FeaturizedReadSets' - FISHERSTRAND = 'FisherStrand' - FRAGMENTDEPTHPERALLELEBYSAMPLE = 'FragmentDepthPerAlleleBySample' - FRAGMENTLENGTH = 'FragmentLength' - GCCONTENT = 'GcContent' - GENOTYPESUMMARIES = 'GenotypeSummaries' - HAPLOTYPEFILTERINGANNOTATION = 'HaplotypeFilteringAnnotation' - HMERINDELLENGTH = 'HmerIndelLength' - HMERINDELNUC = 'HmerIndelNuc' - HMERMOTIFS = 'HmerMotifs' - INBREEDINGCOEFF = 'InbreedingCoeff' - INDELCLASSIFY = 'IndelClassify' - INDELLENGTH = 'IndelLength' - LIKELIHOODRANKSUMTEST = 'LikelihoodRankSumTest' - MAPPINGQUALITY = 'MappingQuality' - MAPPINGQUALITYRANKSUMTEST = 'MappingQualityRankSumTest' - MAPPINGQUALITYZERO = 'MappingQualityZero' - ORIENTATIONBIASREADCOUNTS = 'OrientationBiasReadCounts' - ORIGINALALIGNMENT = 'OriginalAlignment' - POSSIBLEDENOVO = 'PossibleDeNovo' - QUALBYDEPTH = 'QualByDepth' - 
RMSMAPPINGQUALITY = 'RMSMappingQuality' - RAWGTCOUNT = 'RawGtCount' - READPOSRANKSUMTEST = 'ReadPosRankSumTest' - READPOSITION = 'ReadPosition' - REFERENCEBASES = 'ReferenceBases' - SAMPLELIST = 'SampleList' - STRANDBIASBYSAMPLE = 'StrandBiasBySample' - STRANDODDSRATIO = 'StrandOddsRatio' - TANDEMREPEAT = 'TandemRepeat' - UNIQUEALTREADCOUNT = 'UniqueAltReadCount' - VARIANTTYPE = 'VariantType' - + AS_BASEQUALITYRANKSUMTEST = "AS_BaseQualityRankSumTest" + AS_FISHERSTRAND = "AS_FisherStrand" + AS_INBREEDINGCOEFF = "AS_InbreedingCoeff" + AS_MAPPINGQUALITYRANKSUMTEST = "AS_MappingQualityRankSumTest" + AS_QUALBYDEPTH = "AS_QualByDepth" + AS_RMSMAPPINGQUALITY = "AS_RMSMappingQuality" + AS_READPOSRANKSUMTEST = "AS_ReadPosRankSumTest" + AS_STRANDBIASMUTECTANNOTATION = "AS_StrandBiasMutectAnnotation" + AS_STRANDODDSRATIO = "AS_StrandOddsRatio" + ALLELEFRACTION = "AlleleFraction" + ALLELEPSEUDODEPTH = "AllelePseudoDepth" + ASSEMBLYCOMPLEXITY = "AssemblyComplexity" + BASEQUALITY = "BaseQuality" + BASEQUALITYHISTOGRAM = "BaseQualityHistogram" + BASEQUALITYRANKSUMTEST = "BaseQualityRankSumTest" + CHROMOSOMECOUNTS = "ChromosomeCounts" + CLIPPINGRANKSUMTEST = "ClippingRankSumTest" + COUNTNS = "CountNs" + COVERAGE = "Coverage" + CYCLESKIPSTATUS = "CycleSkipStatus" + DEPTHPERALLELEBYSAMPLE = "DepthPerAlleleBySample" + DEPTHPERSAMPLEHC = "DepthPerSampleHC" + EXCESSHET = "ExcessHet" + FEATURIZEDREADSETS = "FeaturizedReadSets" + FISHERSTRAND = "FisherStrand" + FRAGMENTDEPTHPERALLELEBYSAMPLE = "FragmentDepthPerAlleleBySample" + FRAGMENTLENGTH = "FragmentLength" + GCCONTENT = "GcContent" + GENOTYPESUMMARIES = "GenotypeSummaries" + HAPLOTYPEFILTERINGANNOTATION = "HaplotypeFilteringAnnotation" + HMERINDELLENGTH = "HmerIndelLength" + HMERINDELNUC = "HmerIndelNuc" + HMERMOTIFS = "HmerMotifs" + INBREEDINGCOEFF = "InbreedingCoeff" + INDELCLASSIFY = "IndelClassify" + INDELLENGTH = "IndelLength" + LIKELIHOODRANKSUMTEST = "LikelihoodRankSumTest" + MAPPINGQUALITY = "MappingQuality" + MAPPINGQUALITYRANKSUMTEST = "MappingQualityRankSumTest" + MAPPINGQUALITYZERO = "MappingQualityZero" + ORIENTATIONBIASREADCOUNTS = "OrientationBiasReadCounts" + ORIGINALALIGNMENT = "OriginalAlignment" + POSSIBLEDENOVO = "PossibleDeNovo" + QUALBYDEPTH = "QualByDepth" + RMSMAPPINGQUALITY = "RMSMappingQuality" + RAWGTCOUNT = "RawGtCount" + READPOSRANKSUMTEST = "ReadPosRankSumTest" + READPOSITION = "ReadPosition" + REFERENCEBASES = "ReferenceBases" + SAMPLELIST = "SampleList" + STRANDBIASBYSAMPLE = "StrandBiasBySample" + STRANDODDSRATIO = "StrandOddsRatio" + TANDEMREPEAT = "TandemRepeat" + UNIQUEALTREADCOUNT = "UniqueAltReadCount" + VARIANTTYPE = "VariantType" class AnnotationGroup(StrEnum): - AS_STANDARDANNOTATION = 'AS_StandardAnnotation' - ALLELESPECIFICANNOTATION = 'AlleleSpecificAnnotation' - GENOTYPEANNOTATION = 'GenotypeAnnotation' - INFOFIELDANNOTATION = 'InfoFieldAnnotation' - JUMBOGENOTYPEANNOTATION = 'JumboGenotypeAnnotation' - JUMBOINFOANNOTATION = 'JumboInfoAnnotation' - REDUCIBLEANNOTATION = 'ReducibleAnnotation' - STANDARDANNOTATION = 'StandardAnnotation' - STANDARDFLOWBASEDANNOTATION = 'StandardFlowBasedAnnotation' - STANDARDHCANNOTATION = 'StandardHCAnnotation' - STANDARDMUTECTANNOTATION = 'StandardMutectAnnotation' - VARIANTANNOTATION = 'VariantAnnotation' - + AS_STANDARDANNOTATION = "AS_StandardAnnotation" + ALLELESPECIFICANNOTATION = "AlleleSpecificAnnotation" + GENOTYPEANNOTATION = "GenotypeAnnotation" + INFOFIELDANNOTATION = "InfoFieldAnnotation" + JUMBOGENOTYPEANNOTATION = "JumboGenotypeAnnotation" + JUMBOINFOANNOTATION = 
"JumboInfoAnnotation" + REDUCIBLEANNOTATION = "ReducibleAnnotation" + STANDARDANNOTATION = "StandardAnnotation" + STANDARDFLOWBASEDANNOTATION = "StandardFlowBasedAnnotation" + STANDARDHCANNOTATION = "StandardHCAnnotation" + STANDARDMUTECTANNOTATION = "StandardMutectAnnotation" + VARIANTANNOTATION = "VariantAnnotation" class AnnotationExclude(StrEnum): - AS_STRANDBIASMUTECTANNOTATION = 'AS_StrandBiasMutectAnnotation' - BASEQUALITY = 'BaseQuality' - COVERAGE = 'Coverage' - DEPTHPERALLELEBYSAMPLE = 'DepthPerAlleleBySample' - DEPTHPERSAMPLEHC = 'DepthPerSampleHC' - FRAGMENTDEPTHPERALLELEBYSAMPLE = 'FragmentDepthPerAlleleBySample' - FRAGMENTLENGTH = 'FragmentLength' - MAPPINGQUALITY = 'MappingQuality' - ORIENTATIONBIASREADCOUNTS = 'OrientationBiasReadCounts' - READPOSITION = 'ReadPosition' - STRANDBIASBYSAMPLE = 'StrandBiasBySample' - TANDEMREPEAT = 'TandemRepeat' - + AS_STRANDBIASMUTECTANNOTATION = "AS_StrandBiasMutectAnnotation" + BASEQUALITY = "BaseQuality" + COVERAGE = "Coverage" + DEPTHPERALLELEBYSAMPLE = "DepthPerAlleleBySample" + DEPTHPERSAMPLEHC = "DepthPerSampleHC" + FRAGMENTDEPTHPERALLELEBYSAMPLE = "FragmentDepthPerAlleleBySample" + FRAGMENTLENGTH = "FragmentLength" + MAPPINGQUALITY = "MappingQuality" + ORIENTATIONBIASREADCOUNTS = "OrientationBiasReadCounts" + READPOSITION = "ReadPosition" + STRANDBIASBYSAMPLE = "StrandBiasBySample" + TANDEMREPEAT = "TandemRepeat" class DisableReadFilter(StrEnum): - GOODCIGARREADFILTER = 'GoodCigarReadFilter' - MAPPEDREADFILTER = 'MappedReadFilter' - MAPPINGQUALITYAVAILABLEREADFILTER = 'MappingQualityAvailableReadFilter' - MAPPINGQUALITYNOTZEROREADFILTER = 'MappingQualityNotZeroReadFilter' - MAPPINGQUALITYREADFILTER = 'MappingQualityReadFilter' - NONCHIMERICORIGINALALIGNMENTREADFILTER = 'NonChimericOriginalAlignmentReadFilter' - NONZEROREFERENCELENGTHALIGNMENTREADFILTER = 'NonZeroReferenceLengthAlignmentReadFilter' - NOTDUPLICATEREADFILTER = 'NotDuplicateReadFilter' - NOTSECONDARYALIGNMENTREADFILTER = 'NotSecondaryAlignmentReadFilter' - PASSESVENDORQUALITYCHECKREADFILTER = 'PassesVendorQualityCheckReadFilter' - READLENGTHREADFILTER = 'ReadLengthReadFilter' - WELLFORMEDREADFILTER = 'WellformedReadFilter' - + GOODCIGARREADFILTER = "GoodCigarReadFilter" + MAPPEDREADFILTER = "MappedReadFilter" + MAPPINGQUALITYAVAILABLEREADFILTER = "MappingQualityAvailableReadFilter" + MAPPINGQUALITYNOTZEROREADFILTER = "MappingQualityNotZeroReadFilter" + MAPPINGQUALITYREADFILTER = "MappingQualityReadFilter" + NONCHIMERICORIGINALALIGNMENTREADFILTER = "NonChimericOriginalAlignmentReadFilter" + NONZEROREFERENCELENGTHALIGNMENTREADFILTER = "NonZeroReferenceLengthAlignmentReadFilter" + NOTDUPLICATEREADFILTER = "NotDuplicateReadFilter" + NOTSECONDARYALIGNMENTREADFILTER = "NotSecondaryAlignmentReadFilter" + PASSESVENDORQUALITYCHECKREADFILTER = "PassesVendorQualityCheckReadFilter" + READLENGTHREADFILTER = "ReadLengthReadFilter" + WELLFORMEDREADFILTER = "WellformedReadFilter" class IntervalMergingRule(StrEnum): - ALL = 'ALL' - OVERLAPPING_ONLY = 'OVERLAPPING_ONLY' - + ALL = "ALL" + OVERLAPPING_ONLY = "OVERLAPPING_ONLY" class IntervalSetRule(StrEnum): - INTERSECTION = 'INTERSECTION' - UNION = 'UNION' - + INTERSECTION = "INTERSECTION" + UNION = "UNION" class ReadFilter(StrEnum): - ALIGNMENTAGREESWITHHEADERREADFILTER = 'AlignmentAgreesWithHeaderReadFilter' - ALLOWALLREADSREADFILTER = 'AllowAllReadsReadFilter' - AMBIGUOUSBASEREADFILTER = 'AmbiguousBaseReadFilter' - CIGARCONTAINSNONOPERATOR = 'CigarContainsNoNOperator' - EXCESSIVEENDCLIPPEDREADFILTER = 'ExcessiveEndClippedReadFilter' - 
FIRSTOFPAIRREADFILTER = 'FirstOfPairReadFilter' - FLOWBASEDTPATTRIBUTESYMETRICREADFILTER = 'FlowBasedTPAttributeSymetricReadFilter' - FLOWBASEDTPATTRIBUTEVALIDREADFILTER = 'FlowBasedTPAttributeValidReadFilter' - FRAGMENTLENGTHREADFILTER = 'FragmentLengthReadFilter' - GOODCIGARREADFILTER = 'GoodCigarReadFilter' - HASREADGROUPREADFILTER = 'HasReadGroupReadFilter' - HMERQUALITYSYMETRICREADFILTER = 'HmerQualitySymetricReadFilter' - INTERVALOVERLAPREADFILTER = 'IntervalOverlapReadFilter' - JEXLEXPRESSIONREADTAGVALUEFILTER = 'JexlExpressionReadTagValueFilter' - LIBRARYREADFILTER = 'LibraryReadFilter' - MAPPEDREADFILTER = 'MappedReadFilter' - MAPPINGQUALITYAVAILABLEREADFILTER = 'MappingQualityAvailableReadFilter' - MAPPINGQUALITYNOTZEROREADFILTER = 'MappingQualityNotZeroReadFilter' - MAPPINGQUALITYREADFILTER = 'MappingQualityReadFilter' - MATCHINGBASESANDQUALSREADFILTER = 'MatchingBasesAndQualsReadFilter' - MATEDIFFERENTSTRANDREADFILTER = 'MateDifferentStrandReadFilter' - MATEDISTANTREADFILTER = 'MateDistantReadFilter' - MATEONSAMECONTIGORNOMAPPEDMATEREADFILTER = 'MateOnSameContigOrNoMappedMateReadFilter' - MATEUNMAPPEDANDUNMAPPEDREADFILTER = 'MateUnmappedAndUnmappedReadFilter' - METRICSREADFILTER = 'MetricsReadFilter' - NONCHIMERICORIGINALALIGNMENTREADFILTER = 'NonChimericOriginalAlignmentReadFilter' - NONZEROFRAGMENTLENGTHREADFILTER = 'NonZeroFragmentLengthReadFilter' - NONZEROREFERENCELENGTHALIGNMENTREADFILTER = 'NonZeroReferenceLengthAlignmentReadFilter' - NOTDUPLICATEREADFILTER = 'NotDuplicateReadFilter' - NOTOPTICALDUPLICATEREADFILTER = 'NotOpticalDuplicateReadFilter' - NOTPROPERLYPAIREDREADFILTER = 'NotProperlyPairedReadFilter' - NOTSECONDARYALIGNMENTREADFILTER = 'NotSecondaryAlignmentReadFilter' - NOTSUPPLEMENTARYALIGNMENTREADFILTER = 'NotSupplementaryAlignmentReadFilter' - OVERCLIPPEDREADFILTER = 'OverclippedReadFilter' - PAIREDREADFILTER = 'PairedReadFilter' - PASSESVENDORQUALITYCHECKREADFILTER = 'PassesVendorQualityCheckReadFilter' - PLATFORMREADFILTER = 'PlatformReadFilter' - PLATFORMUNITREADFILTER = 'PlatformUnitReadFilter' - PRIMARYLINEREADFILTER = 'PrimaryLineReadFilter' - PROPERLYPAIREDREADFILTER = 'ProperlyPairedReadFilter' - READGROUPBLACKLISTREADFILTER = 'ReadGroupBlackListReadFilter' - READGROUPHASFLOWORDERREADFILTER = 'ReadGroupHasFlowOrderReadFilter' - READGROUPREADFILTER = 'ReadGroupReadFilter' - READLENGTHEQUALSCIGARLENGTHREADFILTER = 'ReadLengthEqualsCigarLengthReadFilter' - READLENGTHREADFILTER = 'ReadLengthReadFilter' - READNAMEREADFILTER = 'ReadNameReadFilter' - READSTRANDFILTER = 'ReadStrandFilter' - READTAGVALUEFILTER = 'ReadTagValueFilter' - SAMPLEREADFILTER = 'SampleReadFilter' - SECONDOFPAIRREADFILTER = 'SecondOfPairReadFilter' - SEQISSTOREDREADFILTER = 'SeqIsStoredReadFilter' - SOFTCLIPPEDREADFILTER = 'SoftClippedReadFilter' - VALIDALIGNMENTENDREADFILTER = 'ValidAlignmentEndReadFilter' - VALIDALIGNMENTSTARTREADFILTER = 'ValidAlignmentStartReadFilter' - WELLFORMEDFLOWBASEDREADFILTER = 'WellformedFlowBasedReadFilter' - WELLFORMEDREADFILTER = 'WellformedReadFilter' - + ALIGNMENTAGREESWITHHEADERREADFILTER = "AlignmentAgreesWithHeaderReadFilter" + ALLOWALLREADSREADFILTER = "AllowAllReadsReadFilter" + AMBIGUOUSBASEREADFILTER = "AmbiguousBaseReadFilter" + CIGARCONTAINSNONOPERATOR = "CigarContainsNoNOperator" + EXCESSIVEENDCLIPPEDREADFILTER = "ExcessiveEndClippedReadFilter" + FIRSTOFPAIRREADFILTER = "FirstOfPairReadFilter" + FLOWBASEDTPATTRIBUTESYMETRICREADFILTER = "FlowBasedTPAttributeSymetricReadFilter" + FLOWBASEDTPATTRIBUTEVALIDREADFILTER = 
"FlowBasedTPAttributeValidReadFilter" + FRAGMENTLENGTHREADFILTER = "FragmentLengthReadFilter" + GOODCIGARREADFILTER = "GoodCigarReadFilter" + HASREADGROUPREADFILTER = "HasReadGroupReadFilter" + HMERQUALITYSYMETRICREADFILTER = "HmerQualitySymetricReadFilter" + INTERVALOVERLAPREADFILTER = "IntervalOverlapReadFilter" + JEXLEXPRESSIONREADTAGVALUEFILTER = "JexlExpressionReadTagValueFilter" + LIBRARYREADFILTER = "LibraryReadFilter" + MAPPEDREADFILTER = "MappedReadFilter" + MAPPINGQUALITYAVAILABLEREADFILTER = "MappingQualityAvailableReadFilter" + MAPPINGQUALITYNOTZEROREADFILTER = "MappingQualityNotZeroReadFilter" + MAPPINGQUALITYREADFILTER = "MappingQualityReadFilter" + MATCHINGBASESANDQUALSREADFILTER = "MatchingBasesAndQualsReadFilter" + MATEDIFFERENTSTRANDREADFILTER = "MateDifferentStrandReadFilter" + MATEDISTANTREADFILTER = "MateDistantReadFilter" + MATEONSAMECONTIGORNOMAPPEDMATEREADFILTER = "MateOnSameContigOrNoMappedMateReadFilter" + MATEUNMAPPEDANDUNMAPPEDREADFILTER = "MateUnmappedAndUnmappedReadFilter" + METRICSREADFILTER = "MetricsReadFilter" + NONCHIMERICORIGINALALIGNMENTREADFILTER = "NonChimericOriginalAlignmentReadFilter" + NONZEROFRAGMENTLENGTHREADFILTER = "NonZeroFragmentLengthReadFilter" + NONZEROREFERENCELENGTHALIGNMENTREADFILTER = "NonZeroReferenceLengthAlignmentReadFilter" + NOTDUPLICATEREADFILTER = "NotDuplicateReadFilter" + NOTOPTICALDUPLICATEREADFILTER = "NotOpticalDuplicateReadFilter" + NOTPROPERLYPAIREDREADFILTER = "NotProperlyPairedReadFilter" + NOTSECONDARYALIGNMENTREADFILTER = "NotSecondaryAlignmentReadFilter" + NOTSUPPLEMENTARYALIGNMENTREADFILTER = "NotSupplementaryAlignmentReadFilter" + OVERCLIPPEDREADFILTER = "OverclippedReadFilter" + PAIREDREADFILTER = "PairedReadFilter" + PASSESVENDORQUALITYCHECKREADFILTER = "PassesVendorQualityCheckReadFilter" + PLATFORMREADFILTER = "PlatformReadFilter" + PLATFORMUNITREADFILTER = "PlatformUnitReadFilter" + PRIMARYLINEREADFILTER = "PrimaryLineReadFilter" + PROPERLYPAIREDREADFILTER = "ProperlyPairedReadFilter" + READGROUPBLACKLISTREADFILTER = "ReadGroupBlackListReadFilter" + READGROUPHASFLOWORDERREADFILTER = "ReadGroupHasFlowOrderReadFilter" + READGROUPREADFILTER = "ReadGroupReadFilter" + READLENGTHEQUALSCIGARLENGTHREADFILTER = "ReadLengthEqualsCigarLengthReadFilter" + READLENGTHREADFILTER = "ReadLengthReadFilter" + READNAMEREADFILTER = "ReadNameReadFilter" + READSTRANDFILTER = "ReadStrandFilter" + READTAGVALUEFILTER = "ReadTagValueFilter" + SAMPLEREADFILTER = "SampleReadFilter" + SECONDOFPAIRREADFILTER = "SecondOfPairReadFilter" + SEQISSTOREDREADFILTER = "SeqIsStoredReadFilter" + SOFTCLIPPEDREADFILTER = "SoftClippedReadFilter" + VALIDALIGNMENTENDREADFILTER = "ValidAlignmentEndReadFilter" + VALIDALIGNMENTSTARTREADFILTER = "ValidAlignmentStartReadFilter" + WELLFORMEDFLOWBASEDREADFILTER = "WellformedFlowBasedReadFilter" + WELLFORMEDREADFILTER = "WellformedReadFilter" class ValidationStringency(StrEnum): - LENIENT = 'LENIENT' - SILENT = 'SILENT' - STRICT = 'STRICT' - + LENIENT = "LENIENT" + SILENT = "SILENT" + STRICT = "STRICT" class LogLevel(StrEnum): - DEBUG = 'DEBUG' - ERROR = 'ERROR' - INFO = 'INFO' - WARNING = 'WARNING' - + DEBUG = "DEBUG" + ERROR = "ERROR" + INFO = "INFO" + WARNING = "WARNING" class WriterType(StrEnum): - ALL_POSSIBLE_HAPLOTYPES = 'ALL_POSSIBLE_HAPLOTYPES' - CALLED_HAPLOTYPES = 'CALLED_HAPLOTYPES' - CALLED_HAPLOTYPES_NO_READS = 'CALLED_HAPLOTYPES_NO_READS' - NO_HAPLOTYPES = 'NO_HAPLOTYPES' - + ALL_POSSIBLE_HAPLOTYPES = "ALL_POSSIBLE_HAPLOTYPES" + CALLED_HAPLOTYPES = "CALLED_HAPLOTYPES" + CALLED_HAPLOTYPES_NO_READS 
= "CALLED_HAPLOTYPES_NO_READS" + NO_HAPLOTYPES = "NO_HAPLOTYPES" class ReferenceConfidenceMode(StrEnum): - BP_RESOLUTION = 'BP_RESOLUTION' - GVCF = 'GVCF' - NONE = 'NONE' - + BP_RESOLUTION = "BP_RESOLUTION" + GVCF = "GVCF" + NONE = "NONE" class FlowMode(StrEnum): - ADVANCED = 'ADVANCED' - NONE = 'NONE' - STANDARD = 'STANDARD' - + ADVANCED = "ADVANCED" + NONE = "NONE" + STANDARD = "STANDARD" class Implementation(StrEnum): - FLOWBASED = 'FlowBased' - FLOWBASEDHMM = 'FlowBasedHMM' - PAIRHMM = 'PairHMM' - + FLOWBASED = "FlowBased" + FLOWBASEDHMM = "FlowBasedHMM" + PAIRHMM = "PairHMM" class PairHMMImplementation(StrEnum): - AVX_LOGLESS_CACHING = 'AVX_LOGLESS_CACHING' - AVX_LOGLESS_CACHING_OMP = 'AVX_LOGLESS_CACHING_OMP' - EXACT = 'EXACT' - FASTEST_AVAILABLE = 'FASTEST_AVAILABLE' - LOGLESS_CACHING = 'LOGLESS_CACHING' - ORIGINAL = 'ORIGINAL' - + AVX_LOGLESS_CACHING = "AVX_LOGLESS_CACHING" + AVX_LOGLESS_CACHING_OMP = "AVX_LOGLESS_CACHING_OMP" + EXACT = "EXACT" + FASTEST_AVAILABLE = "FASTEST_AVAILABLE" + LOGLESS_CACHING = "LOGLESS_CACHING" + ORIGINAL = "ORIGINAL" class PCRErrorModel(StrEnum): - AGGRESSIVE = 'AGGRESSIVE' - CONSERVATIVE = 'CONSERVATIVE' - HOSTILE = 'HOSTILE' - NONE = 'NONE' - + AGGRESSIVE = "AGGRESSIVE" + CONSERVATIVE = "CONSERVATIVE" + HOSTILE = "HOSTILE" + NONE = "NONE" class SmithWatermanImplementation(StrEnum): - AVX_ENABLED = 'AVX_ENABLED' - FASTEST_AVAILABLE = 'FASTEST_AVAILABLE' - JAVA = 'JAVA' - + AVX_ENABLED = "AVX_ENABLED" + FASTEST_AVAILABLE = "FASTEST_AVAILABLE" + JAVA = "JAVA" class Mutect2(SnappyModel): @@ -265,11 +249,6 @@ class Mutect2(SnappyModel): germline_resource: str | None = None # No options for class FeatureInput """Population vcf of germline sequencing containing allele fractions""" - # Arguments that must be set by derived classes (pon & calling) - - # Panel of normals arguments - - # Calling-specific arguments # panel_of_normals: str | None = None # Was class FeatureInput @@ -293,7 +272,6 @@ class Mutect2(SnappyModel): # pair_hmm_results_file: bool = False # Was class GATKPath # """Write exact pairHMM inputs/outputs to for debugging purposes""" - # Optional arguments add_output_vcf_command_line: bool = True @@ -315,7 +293,7 @@ class Mutect2(SnappyModel): callable_depth: int = 10 """Minimum depth to be considered callable for Mutect stats. Does not affect genotyping""" disable_bam_index_caching: bool = False - """If true, don't cache bam indexes, this will reduce memory requirements but may harm performance if many intervals are specified. Caching is automatically disabled if there are no intervals specified""" + """If true, don"t cache bam indexes, this will reduce memory requirements but may harm performance if many intervals are specified. Caching is automatically disabled if there are no intervals specified""" disable_read_filter: list[DisableReadFilter] = [] """Read filters to be disabled before analysis""" dont_use_dragstr_pair_hmm_scores: bool = False @@ -381,7 +359,7 @@ class Mutect2(SnappyModel): read_validation_stringency: ValidationStringency = ValidationStringency.SILENT """Validation stringency for all SAM/BAM/CRAM/SRA files read by this program. 
The default stringency value SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded""" sites_only_vcf_output: bool = False - """If true, don't emit genotype fields when writing vcf file output""" + """If true, don"t emit genotype fields when writing vcf file output""" tumor_lod_to_emit: float = 3.0 """Log 10 odds threshold to emit variant to VCF""" use_jdk_deflater: bool = False @@ -609,4 +587,3 @@ class Mutect2(SnappyModel): # """BAM sample name of normal(s), if any. May be URL-encoded as output by GetSampleName with -encode argument""" # tumor_sample: str | None = None # """BAM sample name of tumor. May be URL-encoded as output by GetSampleName with -encode argument""" - From 6dca7bdf7ec57b5e16a26070ac1ae5a232a163fd Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 13 Nov 2024 17:37:51 +0100 Subject: [PATCH 35/46] refactor: Moving & renaming the library kit definition --- .../workflows/ngs_mapping/model.py | 27 ++----------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py index 1f64a02be..b6e05bada 100644 --- a/snappy_pipeline/workflows/ngs_mapping/model.py +++ b/snappy_pipeline/workflows/ngs_mapping/model.py @@ -7,6 +7,7 @@ from pydantic import Field, field_validator, model_validator from snappy_pipeline.models import EnumField, SizeString, SnappyModel, SnappyStepModel +from snappy_pipeline.models.library_kit import LibraryKit class DnaMapper(Enum): @@ -49,30 +50,6 @@ class Tools(SnappyModel): """Required if long-read mapper used; otherwise, leave empty.""" -class TargetCoverageReportEntry(SnappyModel): - """ - Mapping from enrichment kit to target region BED file, for either computing per--target - region coverage or selecting targeted exons. - - The following will match both the stock IDT library kit and the ones - with spike-ins seen fromr Yale genomics. The path above would be - mapped to the name "default". 
- - name: IDT_xGen_V1_0 - pattern: "xGen Exome Research Panel V1\\.0*" - path: "path/to/targets.bed" - """ - - name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])] - - pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])] - - path: Annotated[str, Field(examples=["path/to/targets.bed"])] - - -class TargetCoverageReport(SnappyModel): - path_target_interval_list_mapping: list[TargetCoverageReportEntry] = [] - - class BamCollectDoc(SnappyModel): enabled: bool = False window_length: Annotated[int, Field(gt=0)] = 1000 @@ -283,7 +260,7 @@ class NgsMapping(SnappyStepModel): path_link_in: str = "" """OPTIONAL Override data set configuration search paths for FASTQ files""" - target_coverage_report: TargetCoverageReport | None = None + target_coverage_report: LibraryKit | None = None """Thresholds for targeted sequencing coverage QC.""" bam_collect_doc: BamCollectDoc = BamCollectDoc() From a31ad8d700476786a25bcaf6950d5681e1ee0a96 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 25 Nov 2024 15:20:03 +0100 Subject: [PATCH 36/46] feat: somatic cnv calling for wes & wgs [cnvkit tool only] --- snappy_pipeline/models/cnvkit.py | 245 ++- .../workflows/abstract/__init__.py | 6 +- .../workflows/somatic_cnv_calling/__init__.py | 1421 ++++++++++------- .../somatic_cnv_calling/cnvkit.rules | 202 +-- .../workflows/somatic_cnv_calling/model.py | 491 ++---- .../wrappers/cnvkit/access/wrapper.py | 6 +- .../wrappers/cnvkit/antitarget/wrapper.py | 25 +- .../wrappers/cnvkit/autobin/wrapper.py | 6 +- .../wrappers/cnvkit/bintest/environment.yaml | 1 + .../wrappers/cnvkit/bintest/wrapper.py | 32 + .../wrappers/cnvkit/call/wrapper.py | 72 +- .../wrappers/cnvkit/coverage/wrapper.py | 6 +- .../wrappers/cnvkit/fix/wrapper.py | 27 +- .../cnvkit/plot/scatter/environment.yaml | 1 + .../wrappers/cnvkit/plot/scatter/wrapper.py | 56 + .../wrappers/cnvkit/reference/wrapper.py | 27 +- .../report/genemetrics/environment.yaml | 1 + .../cnvkit/report/genemetrics/wrapper.py | 35 + .../cnvkit/report/metrics/environment.yaml | 1 + .../wrappers/cnvkit/report/metrics/wrapper.py | 29 + .../cnvkit/report/segmetrics/environment.yaml | 1 + .../cnvkit/report/segmetrics/wrapper.py | 33 + .../wrappers/cnvkit/segment/wrapper.py | 34 +- .../wrappers/cnvkit/target/wrapper.py | 26 +- tests/snappy_pipeline/workflows/conftest.py | 35 + .../test_workflow_somatic_cnv_calling.py | 580 +++++++ 26 files changed, 2172 insertions(+), 1227 deletions(-) create mode 120000 snappy_wrappers/wrappers/cnvkit/bintest/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py create mode 120000 snappy_wrappers/wrappers/cnvkit/plot/scatter/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py create mode 120000 snappy_wrappers/wrappers/cnvkit/report/genemetrics/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py create mode 120000 snappy_wrappers/wrappers/cnvkit/report/metrics/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py create mode 120000 snappy_wrappers/wrappers/cnvkit/report/segmetrics/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py create mode 100644 tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py index 402e567a6..3e9be3f55 100644 --- a/snappy_pipeline/models/cnvkit.py +++ b/snappy_pipeline/models/cnvkit.py @@ -6,6 +6,61 @@ from 
snappy_pipeline.models import SnappyModel +# Parameters for each action & those shared between actions +# param_table = { +# "shared": { +# "short_names": bool, +# "drop_low_coverage": bool, +# "male_reference": bool, +# "sample_sex": Enum, +# "zygocity_freq": float, +# "min_variant_depth": int, +# "diploid_parx_genome": str, +# "normal_id": str, +# "sample_id": str, +# "cluster": bool, +# }, +# "access": {"min_gap_size": int, "exclude": list}, +# "antitarget": {"avg_size": int, "min_size": float}, +# "autobin": { +# "method": Enum, +# "bp_per_bin": float, +# "antitarget_max_size": int, +# "antitarget_min_size": int, +# "target_max_size": int, +# "target_min_size": int, +# }, +# "bintest": {"target": bool, "alpha": float}, +# "call": { +# "center": Enum, +# "filter": Enum, +# "method": Enum, +# "center_at": float, +# "purity": float, +# "ploidy": float, +# "thresholds": list, +# }, +# "coverage": {"count": bool, "min_mapq": int}, +# "fix": {"smoothing_window_fraction": float}, +# "genemetrics": {"alpha": float, "threshold": float, "bootstrap": int}, +# "metrics": {}, +# "reference": {"min_cluster_size": int}, +# "segment": { +# "smooth_cbs": bool, +# "method": Enum, +# "threshold": float, +# "drop_outliers": int, +# }, +# "segmetrics": { +# "alpha": float, +# "threshold": float, +# "bootstrap": int, +# "min_probes": int, +# }, +# "target": {"split": bool, "avg_size": float}, +# } + + class SexOrigin(enum.StrEnum): AUTOMATIC = "auto" """Sex determined from the data""" @@ -33,51 +88,72 @@ def ensure_valid_sex_value(self): class SegmentationMethod(enum.StrEnum): - cbs = "cbs" - flasso = "flasso" - haar = "haar" - hmm = "hmm" - hmm_tumor = "hmm-tumor" - hmm_germline = "hmm-germline" - none = "none" + CBS = "cbs" + FLASSO = "flasso" + HAAR = "haar" + HMM = "hmm" + HMM_TUMOR = "hmm-tumor" + HMM_GERMLINE = "hmm-germline" + NONE = "none" class CenterMethod(enum.StrEnum): - mean = "mean" - median = "median" - mode = "mode" - biweight = "biweight" + MEAN = "mean" + MEDIAN = "median" + MODE = "mode" + BIWEIGHT = "biweight" class FilterMethod(enum.StrEnum): - ampdel = "ampdel" - cn = "cn" - ci = "ci" - sem = "sem" + AMPDEL = "ampdel" + CN = "cn" + CI = "ci" + SEM = "sem" class CallingMethod(enum.StrEnum): - threshold = "threshold" - clonal = "clonal" - none = "" + THRESHOLD = "threshold" + CLONAL = "clonal" + NONE = "none" class Access(SnappyModel): exclude: list[str] = [] - """Regions accessible to mapping""" + """Regions to exclude from the accessible genome""" - min_gap_size: int = 5000 - """Minimum gap size between accessible sequence regions. Regions separated by less than this distance will be joined together.""" + min_gap_size: int | None = None + """ + Minimum gap size between accessible sequence regions. Regions separated by less than this distance will be joined together. + + In WGS mode, the _target_ regions are set to the accessible regions in the genome. + These accessible regions can be provided by the user, or computed by the `access` + module. In the latter case, the optimal bin size is computed by the `autobin` module + unless this value is provided by the user. + `autobin` uses the `wgs` method _only_ if the list of excluded regions is empty and if + the `min_gap_size` parameter remains unassigned. If any of these conditions is not met, + or if a file of accessible regions is provided by the user, then the `amplicon` method + is used.
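+
+ For illustration only (YAML, assuming this model is exposed as the `access`
+ section of the cnvkit configuration), a WGS setup that lets `autobin` keep
+ its `wgs` method would simply leave both knobs at their defaults:
+
+ access:
+   exclude: []           # no excluded regions
+   # min_gap_size left unset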
+ It is recommended to leave the excluded regions empty and not set the `min_gap_size` + parameter for WGS data, unless the accessible regions are much reduced (for example excluding + all intergenic regions, repeats, low complexity, ...) + """ class Target(SnappyModel): - path_baits: str | None = None - """Path to baits file (Agilent Covered), unset for WGS data""" split: bool = True """Split large tiled intervals into smaller, consecutive targets.""" - avg_size: float = 800 / 3 - """Average size of split target bins (results are approximate)""" - short_names: bool = False - """Reduce multi-accession bait labels to be short and consistent""" + avg_size: float | None = None + """ + Average size of split target bins (results are approximate). + + When the parameter is left unassigned, the cnvkit default is used for WES data, + and an optimal value is computed for WGS data, if there is data for normal control(s). + """ + short_names: bool = True + """ + Reduce multi-accession bait labels to be short and consistent. + + Only valid when a gff/gtf features file is defined in the static part of the configuration. + """ class Antitarget(SnappyModel): @@ -101,9 +177,13 @@ class Fix(SnappyModel): class Segment(SnappyModel): method: SegmentationMethod = SegmentationMethod.CBS - """Segmentation method, or 'NONE' for chromosome arm-level averages as segments""" - threshold: float = 0.0001 - """Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation. For HMM methods, this is the smoothing window size.""" + """Segmentation method, or 'none' for chromosome arm-level averages as segments""" + threshold: float + """ + Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation. + + For HMM methods, this is the smoothing window size. + """ drop_outliers: int = 10 """Drop outlier bins more than this many multiples of the 95th quantile away from the average within a rolling window. Set to 0 for no outlier filtering.""" smooth_cbs: bool = False @@ -116,21 +196,25 @@ def ensure_smooth_for_cbs_only(self) -> Self: class Call(SnappyModel): - method: CallingMethod | None = None + method: CallingMethod = CallingMethod.THRESHOLD """Calling method.""" thresholds: list[float] = [-1.1, -0.25, 0.2, 0.7] - """Hard thresholds for calling each integer copy number, separated by commas""" - center: CenterMethod = CenterMethod.MEDIAN + """Hard thresholds for calling each integer copy number""" + center: CenterMethod | None = None """Re-center the log2 ratio values using this estimator of the center or average value. ('median' if no argument given.)""" center_at: float | None = None - """Subtract a constant number from all log2 ratios. For "manual" re-centering.""" + """ + Subtract a constant number from all log2 ratios. For "manual" re-centering. + + When this parameter is set, the centering method should be left empty. 
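+
+ For illustration only (YAML, assuming this model is exposed as the `call`
+ section of the cnvkit configuration), a manual re-centering could be
+ requested with:
+
+ call:
+   center_at: 0.15   # subtract 0.15 from every log2 ratio
+   # center must remain unset when center_at is given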
+ """ filter: FilterMethod | None = None """Merge segments flagged by the specified filter(s) with the adjacent segment(s).""" @model_validator(mode="after") - def avoid_center_center_at_conflict(self) -> Self: - if self.center is not None and self.center_at is not None: - raise ValueError("'call' options 'center' and 'center_at' cannot be used together") + def ensure_center_without_center_at(self) -> Self: + if self.center_at is not None and self.center is not None: + raise ValueError("'center' and 'center_at' parameters cannot be used together") return self @@ -157,9 +241,11 @@ class PlotDiagram(Plot): class PlotScatter(Plot): path_range_list: str | None = None - """File listing the chromosomal ranges to display, as BED, interval list or 'chr:start-end' text""" + """File listing the chromosomal ranges to display, as BED, interval list or 'chr:start-end' text (currently not implemented)""" + chromosome: str | None = None + """Name of the chromosome to display (whole genome if empty)""" gene: str | None = None - """Name of gene or genes (comma-separated) to display.""" + """Name of gene or genes (comma-separated) to display (currently not implemented)""" width: int = 1000000 """Width of margin to show around the selected gene(s)""" antitarget_marker: str = "o" @@ -188,6 +274,21 @@ class Report(SnappyModel): enabled: bool = True +class ReportStats(enum.StrEnum): + MEAN = "mean" + MEDIAN = "median" + MODE = "mode" + T_TEST = "t-test" + STDEV = "stdev" + SEM = "sem" + MAD = "mad" + MSE = "mse" + IQR = "iqr" + BIVAR = "bivar" + CI = "ci" + PI = "pi" + + class ReportSegmetrics(Report): alpha: float = 0.05 """Level to estimate confidence and prediction intervals; use with --ci and --pi.""" @@ -195,6 +296,20 @@ class ReportSegmetrics(Report): """Number of bootstrap iterations to estimate confidence interval; use with --ci.""" smooth_bootstrap: bool = False """Apply Gaussian noise to bootstrap samples, a.k.a. 
smoothed bootstrap, to estimate confidence interval""" + stats: list[ReportStats] = [ + ReportStats.MEAN, + ReportStats.MEDIAN, + ReportStats.MODE, + ReportStats.T_TEST, + ReportStats.STDEV, + ReportStats.SEM, + ReportStats.MAD, + ReportStats.MSE, + ReportStats.IQR, + ReportStats.BIVAR, + ReportStats.CI, + ReportStats.PI, + ] class ReportGenemetrics(Report): @@ -206,24 +321,33 @@ class Report(enum.StrEnum): """Copy number change threshold to report a gene gain/loss""" min_probes: int = 3 """Minimum number of covered probes to report a gain/loss""" - - -class Report(enum.StrEnum): - GENEMETRICS = "genemetrics" - SEGMETRICS = "segmetrics" + stats: list[ReportStats] = [ + ReportStats.MEAN, + ReportStats.MEDIAN, + ReportStats.MODE, + ReportStats.T_TEST, + ReportStats.STDEV, + ReportStats.SEM, + ReportStats.MAD, + ReportStats.MSE, + ReportStats.IQR, + ReportStats.BIVAR, + ReportStats.CI, + ReportStats.PI, + ] class CnvkitToReference(SnappyModel): # Substep-specific parameters - access: Access - target: Target - antitarget: Antitarget + access: Access = Access() + target: Target = Target() + antitarget: Antitarget = Antitarget() - coverage: Coverage + coverage: Coverage = Coverage() - metrics: Report - segmetrics: ReportSegmetrics - genemetrics: ReportGenemetrics + metrics: Report = Report() + segmetrics: ReportSegmetrics = ReportSegmetrics() + genemetrics: ReportGenemetrics = ReportGenemetrics() # Generic parameters (used in different substeps & must agree) male_reference: bool = False @@ -236,11 +360,11 @@ def ensure_males_for_reference(self): min_cluster_size: int = 4 """Minimum cluster size to keep in reference profiles.""" - gc: bool = False - """Skip GC correction.""" + gc: bool = True + """Use GC correction (set to False to skip it).""" - edge: bool = None - """Skip edge correction. Automatic selection when None (True for WGS & Panel, False for WES)""" + edge: bool | None = None + """Use edge correction. Automatic selection when None (True for WES, False for WGS & Panel)""" - rmask: bool = False - """Skip RepeatMasker correction.""" + rmask: bool = True + """Use RepeatMasker correction (set to False to skip it).""" drop_low_coverage: bool = False @@ -258,15 +382,10 @@ def ensure_males_for_reference(self): class Cnvkit(CnvkitToReference): - fix: Fix + fix: Fix = Fix() segment: Segment - call: Call - bintest: Bintest - - diagram: PlotDiagram - scatter: PlotScatter + call: Call = Call() + bintest: Bintest = Bintest() - min_variant_depth: int = 20 - """Minimum read depth for a SNV to be displayed in the b-allele frequency plot.""" - zygocity_freq: float = 0.25 - """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" + diagram: PlotDiagram = PlotDiagram() + scatter: PlotScatter = PlotScatter() diff --git a/snappy_pipeline/workflows/abstract/__init__.py b/snappy_pipeline/workflows/abstract/__init__.py index 6e1e66367..77a3bd1fd 100644 --- a/snappy_pipeline/workflows/abstract/__init__.py +++ b/snappy_pipeline/workflows/abstract/__init__.py @@ -176,7 +176,7 @@ def _get_resource(wildcards: Wildcards = None, input: InputFiles = None) -> Any: return _get_resource - def get_args(self, action: str) -> Inputs | Callable[[Wildcards], Inputs]: + def get_args(self, action: str) -> Inputs | Callable[[Wildcards, InputFiles], Inputs]: """Return args for the given action of the sub step""" raise NotImplementedError("Called abstract method.
Override me!") # pragma: no cover @@ -873,7 +873,9 @@ def register_sub_workflow( ) self.sub_workflows[sub_workflow_name] = self.workflow.globals[sub_workflow_name] - def get_args(self, sub_step: str, action: str) -> Inputs | Callable[[Wildcards], Inputs]: + def get_args( + self, sub_step: str, action: str + ) -> Inputs | Callable[[Wildcards, InputFiles], Inputs]: """Return arguments for action of substep with given wildcards Delegates to the sub step object's get_args function diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py index 70bd2c447..7397391ef 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py @@ -154,11 +154,14 @@ import os import os.path import re -import typing +from copy import deepcopy +from enum import Enum +from typing import Callable, Iterator, Iterable, NamedTuple, Any from biomedsheets.models import BioEntity, BioSample, TestSample, NGSLibrary from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background -from snakemake.io import OutputFiles, Wildcards +from biomedsheets.io_tsv.base import LIBRARY_TYPES, LIBRARY_TO_EXTRACTION, EXTRACTION_TYPE_DNA +from snakemake.io import OutputFiles, Wildcards, InputFiles from snappy_pipeline.utils import dictify from snappy_pipeline.workflows.abstract import ( @@ -169,8 +172,11 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from snappy_pipeline.models.cnvkit import SegmentationMethod as CnvkitSegmentationMethod + from .model import SomaticCnvCalling as SomaticCnvCallingConfigModel -from .model import Sex, LibraryKitDefinition, PanelOfNormalsOrigin +from .model import Cnvkit as CnvkitConfig +from .model import Sex, SexOrigin, SexValue, PanelOfNormalsOrigin, PurityOrigin, VariantOrigin __author__ = "Eric Blanc " @@ -204,18 +210,9 @@ class SomaticCnvCallingStepPart(BaseStepPart): def __init__(self, parent: "SomaticCnvCallingWorkflow"): super().__init__(parent) - def _get_sample_sex(self, library_name: str) -> Sex: - if self.config.sex == Sex.MALE or self.config.sex == Sex.FEMALE: - sample_sex = self.config.sex - elif self.config.sex == Sex.SAMPLESHEET and library_name in self.parent.sex: - sample_sex = self.parent.sex[library_name] - else: - sample_sex = Sex.UNKNOWN - return sample_sex - @staticmethod @dictify - def _get_log_file_from_prefix(prefix: str) -> typing.Iterator[typing.Dict[str, str]]: + def _get_log_file_from_prefix(prefix: str) -> Iterator[dict[str, str]]: key_ext = ( ("log", ".log"), ("sh", ".sh"), @@ -236,20 +233,19 @@ class CnvKitStepPart(SomaticCnvCallingStepPart): #: Class available actions actions = ( "access", + "autobin", "target", "antitarget", "coverage", "reference", - "flat_reference_panel", - "flat_reference_wgs", "fix", "segment", "call", "bintest", - "plot/diagram", - "plot/scatter", - "report/metrics", - "report/segmetrics", + "scatter", + "metrics", + "genemetrics", + "segmetrics", ) # Overwrite defaults @@ -258,518 +254,767 @@ class CnvKitStepPart(SomaticCnvCallingStepPart): def __init__(self, parent: SomaticCnvCallingStepPart): super().__init__(parent) - def get_input_files(self, action: str) -> typing.Callable: + self.is_wgs = ( + any([libraryKit is None for libraryKit in self.parent.tumors.keys()]) + and self.name in self.config.tools.wgs + ) + self.is_wes = ( + any([libraryKit is not None for libraryKit in self.parent.tumors.keys()]) + and self.name in self.config.tools.wes + ) + 
assert not (self.is_wgs and self.is_wes), "WES & WGS are mixed" + + if self.is_wgs or self.is_wes: + assert ( + len(self.parent.tumors) == 1 + ), "Current cnvkit tool implementation can't handle multiple library types or kits" + + self.libraryKit = list(self.parent.tumors.keys())[0] + self.tumors = {x.library.name: x for x in self.parent.tumors[self.libraryKit]} + + self.cfg: CnvkitConfig = self.config.get(self.name) + self.pon_source = ( + self.cfg.panel_of_normals.source if self.cfg.panel_of_normals.enabled else None + ) + + self._set_cnvkit_pipeline_logic() + + self.path_baits = self._get_path_baits() + + if ( + self.cfg.somatic_purity_ploidy_estimate.enabled + and self.cfg.somatic_purity_ploidy_estimate.source == PurityOrigin.SAMPLESHEET + ): + assert not any( + [x.purity is None for x in self.tumors.values()] + ), "Missing purity value from samplesheet" + + self.base_out = "work/{mapper}.cnvkit/out/cnvkit." + self.base_out_lib = ( + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}." + ) + + def _set_cnvkit_pipeline_logic(self): + """ + Creates the instance variables that choose the path through the cnvkit pipeline + + Access: regions accessible for CNV calling (unmasked) + path_access, or when missing, built from the genome reference + optional list of excluded regions + + Target: regions of good coverage + From baits (WES) or accessible regions (WGS) + estimate of target size from config or autobin step + + Antitarget: regions of low coverage + antitarget = access - target, only WES, otherwise empty + + Reference: + Flat: based on targets & antitargets only + Cohort: from panel_of_normals step + File: from another cohort or public data (reference + target + antitarget [WES only]) + Paired (panel of normals disabled): reference built from the target & antitarget coverage of one normal sample only (paired with the tumor) + + Therefore, a reference must be created for flat & paired choices (one reference per normal sample in the latter case).
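+
+ In short, the flags assigned below are:
+ - paired: no panel of normals, the reference is built from the matched normal
+ - build_ref: paired, or flat panel of normals
+ - compute_avg_target_size: paired WGS without an explicit average target size
+ - create_access: a reference is built and no accessibility file is given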
+ The logic to create the reference is (panel of normal is pon): + - access created if path_access is missing or average target size estimated + - average target size estimated if value not in config and dataset is WGS + - target created always + - antitarget created when dataset is WES + """ + self.paired = not self.cfg.panel_of_normals.enabled + self.build_ref = self.paired or self.pon_source == PanelOfNormalsOrigin.FLAT + self.compute_avg_target_size = ( + self.is_wgs and self.paired and self.cfg.target.avg_size is None + ) + self.create_access = self.build_ref and (not self.cfg.path_access) + self.plain_access = ( + not self.cfg.path_access + and len(self.cfg.access.exclude) == 0 + and self.cfg.access.min_gap_size is None + ) + + self.variants_from_cohort = ( + self.cfg.somatic_variant_calling.enabled + and self.cfg.somatic_variant_calling.source == VariantOrigin.COHORT + ) + self.variants_from_file = ( + self.cfg.somatic_variant_calling.enabled + and self.cfg.somatic_variant_calling.source == VariantOrigin.FILE + ) + + def _get_sample_sex(self, library_name: str | None) -> SexValue | None: + if self.cfg.sample_sex.source == SexOrigin.SAMPLESHEET and library_name: + sample_sex = self.tumors[library_name].sex + elif self.cfg.sample_sex.source == SexOrigin.CONFIG: + sample_sex = self.cfg.sample_sex.default + else: + sample_sex = None + return sample_sex + + def _get_path_baits(self) -> str | None: + if not self.is_wes: + return None + default = None + for item in self.cfg.path_target_interval_list_mapping: + if item.name == self.libraryKit: + return item.path + elif item.name == "__default__": + default = item.path + if default is None: + raise ValueError(f"Missing library kit definition for {self.libraryKit}") + return default + + def get_input_files(self, action: str) -> Callable: """Return input paths input function, dependent on rule""" # Validate action self._validate_action(action) return getattr(self, "_get_input_files_{}".format(action.replace("/", "_"))) - def get_params(self, action: str) -> typing.Callable: + def get_args(self, action: str) -> Callable: """Return parameters input function, dependent on rule""" # Validate action self._validate_action(action) - return getattr(self, "_get_params_{}".format(action.replace("/", "_"))) + return getattr(self, "_get_args_{}".format(action.replace("/", "_"))) - def get_output_files(self, action: str) -> typing.Callable: - """Return input paths input function, dependent on rule""" - # Validate action + @dictify + def get_output_files(self, action: str): + """ + Return output paths, dependent on rule + + It is important to take good care of wildcards, because + when a paired reference is used on WGS without setting the avg target size, + the output of autobin and target are built for the normal library. + So in this case, library_name stands for the normal library, rather than + for the tumor. + """ self._validate_action(action) - f = getattr(self, "_get_output_files_{}".format(action.replace("/", "_"))) - return f() - def get_log_file(self, action: str) -> typing.Dict[str, str]: - """Return log files, dependent on rule""" + base_report_lib = ( + "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}." 
+ ) + + output_files = {} + match action: + case "access": + output_files = {"access": self.base_out + "access.bed"} + case "autobin": + output_files = {"result": self.base_out_lib + "autobin.txt"} + case "target": + if self.compute_avg_target_size and self.paired: + output_files = {"target": self.base_out_lib + "target.bed"} + else: + output_files = {"target": self.base_out + "target.bed"} + case "antitarget": + output_files = {"antitarget": self.base_out + "antitarget.bed"} + case "coverage": + output_files = {"coverage": self.base_out_lib + "{region,(target|antitarget)}.cnn"} + case "reference": + if self.paired: + output_files = {"reference": self.base_out_lib + "reference.cnn"} + else: + output_files = {"reference": self.base_out + "reference.cnn"} + case "fix": + output_files = {"ratios": self.base_out_lib + "cnr"} + case "segment": + output_files = { + "segments": self.base_out_lib + "segments.cns", + "dataframe": self.base_out_lib + "rds", + } + case "call": + output_files = {"calls": self.base_out_lib + "cns"} + case "bintest": + output_files = {"tests": self.base_out_lib + "bintest.cns"} + case "metrics": + output_files = {"report": base_report_lib + "metrics.tsv"} + case "segmetrics": + output_files = {"report": base_report_lib + "segmetrics.tsv"} + case "genemetrics": + output_files = {"report": base_report_lib + "genemetrics.tsv"} + case "scatter": + output_files = { + "plot": "work/{mapper}.cnvkit.{library_name}/plot/{mapper}.cnvkit.{library_name}.scatter.{contig_name}.jpeg" + } + + for k, v in output_files.items(): + yield k, v + yield k + "_md5", v + ".md5" + + @dictify + def get_log_file(self, action): + """Return panel of normal files""" # Validate action self._validate_action(action) - base_name = os.path.join("work", f"{{mapper}}.{self.name}.{{library_name}}", "log") - # Access, target & antitarget steps are cohort-wide, the others are library-dependent - if action in ("access",): - prefix = f"work/{self.name}/log/{action}" - elif action in ("target", "antitarget"): - prefix = f"work/{self.name}/log/{action}" + ".{panel_name}" - elif action in ("coverage",): - prefix = os.path.join(base_name, action + ".{region}") + + base_log = "work/{mapper}.cnvkit/log/cnvkit." + base_log_lib = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}." 
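+ # For illustration (hypothetical mapper & library names): action "fix" with
+ # mapper "bwa" and library "P001-T1-DNA1-WGS1" resolves to the prefix
+ # "work/bwa.cnvkit.P001-T1-DNA1-WGS1/log/bwa.cnvkit.P001-T1-DNA1-WGS1.fix",
+ # to which the .log, .sh, .conda_list.txt and .conda_info.txt extensions are appended.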
+ + if action in ("access", "antitarget"): + tpl = base_log + action elif action in ( - "reference", + "autobin", "fix", "segment", "call", "bintest", - "report/metrics", - "report/segmetrics", + "metrics", + "segmetrics", + "genemetrics", ): - prefix = os.path.join(base_name, action.replace("/", "_")) - elif action in ("plot/diagram", "plot/scatter"): - prefix = os.path.join(base_name, action.replace("/", "_") + ".{contig_name}") - elif action == "flat_reference_panel": - prefix = f"work/{{mapper}}.{self.name}/log/reference.{{panel_name}}" - elif action == "flat_reference_wgs": - prefix = f"work/{{mapper}}.{self.name}/log/reference" - return SomaticCnvCallingStepPart._get_log_file_from_prefix(prefix) - - def get_result_files(self, library_name: str, mapper: str) -> typing.List[str]: + tpl = base_log_lib + action + elif action == "target": + if self.compute_avg_target_size and self.paired: + tpl = base_log_lib + "target" + else: + tpl = base_log + "target" + elif action == "reference": + if self.paired: + tpl = base_log_lib + "reference" + else: + tpl = base_log + "reference" + elif action == "coverage": + tpl = base_log_lib + "{region,(target|antitarget)}coverage" + elif action in ("scatter",): + tpl = base_log_lib + action + ".{contig_name}" + else: + raise ValueError(f"Logs of action '{action}' not implemented yet") + + for key, ext in ( + ("conda_list", ".conda_list.txt"), + ("conda_info", ".conda_info.txt"), + ("log", ".log"), + ("sh", ".sh"), + ): + yield key, tpl + ext + yield key + "_md5", tpl + ext + ".md5" + + def get_result_files(self, library_name: str, mapper: str) -> list[str]: """Files to symlink to output""" - base_name = f"{mapper}.{self.name}.{library_name}" - result_files = [] - # Tumor samples - if library_name in self.parent.normal_library: - # Main results - prefix = os.path.join("output", base_name, "out", base_name) - for suffix in ("cnr", "segments.cns", "cns", "bintest.cnr"): - result_files.append(prefix + "." + suffix) - # Log files - prefix = os.path.join("output", base_name, "log") - for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): - result_files.append(os.path.join(prefix, f"coverage.target.{ext}")) - result_files.append(os.path.join(prefix, f"coverage.antitarget.{ext}")) - for suffix in ("fix", "segment", "call", "bintest"): - for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): - result_files.append(prefix + "/" + suffix + "." + ext) - # Log of reference is no panel of normals - # if not self.config[self.name]["panel_of_normals"]["enabled"]: - # normal_library = self.parent.normal_library[library_name] - # prefix = os.path.join("output", f"{mapper}.{self.name}.{normal_library}", "log", f"{mapper}.{self.name}.{normal_library}.reference") - # for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): - # result_files.append(prefix + "." + ext) - # Reports - if "reports" in self.config[self.name]: - prefix = os.path.join("output", base_name, "report", base_name) - for report in ("metrics", "segmetrics"): - if report in self.config[self.name]["reports"]: - result_files.append(prefix + "." 
+ report + ".tsv") - # Plots (per chromosome) - if "plots" in self.config[self.name]: - prefix = os.path.join("output", base_name, "plot") - for plot in ("diagram", "scatter"): - if plot in self.config[self.name]["plots"]: - for contig in self.parent.contigs: - result_files.append(os.path.join(prefix, plot, contig + ".png")) - # else: # Normal samples - # prefix = os.path.join("output", base_name, "log", "reference") - # for ext in ("log", "conda_list.txt", "conda_info.txt", "sh"): - # result_files.append(prefix + "." + ext) - return result_files + base_out_lib = ( + "output/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}." + ).format(mapper=mapper, library_name=library_name) - # ----- Access -------------------------------------------------------------------------------- + base_report_lib = ( + "output/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}." + ).format(mapper=mapper, library_name=library_name) - def _get_input_files_access(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return None + base_plot_lib = ( + "output/{mapper}.cnvkit.{library_name}/plot/{mapper}.cnvkit.{library_name}." + ).format(mapper=mapper, library_name=library_name) - def _get_params_access(self, wildcards: Wildcards) -> typing.Dict[str, str]: - params = {"reference": self.w_config.static_data_config.reference.path} - params["min_gap_size"] = self.config[self.name]["access"]["min_gap_size"] - access = self.config[self.name]["access"].get("exclude", None) - if access: - params["access"] = access + result_files = [] - def _get_output_files_access(self) -> typing.Dict[str, str]: - return {"access": f"work/{self.name}/out/access.bed"} + for suffix in ("cnr", "segments.cns", "cns", "bintest.cns"): + result_files.append(base_out_lib + suffix) - # ----- Target -------------------------------------------------------------------------------- + actions_to_log = ("fix", "segment", "call", "bintest") + for action in actions_to_log: + result_files += [ + path.replace("work", "output", 1).format(mapper=mapper, library_name=library_name) + for path in filter( + lambda p: not p.endswith(".md5"), self.get_log_file(action).values() + ) + ] - def _get_input_files_target(self, wildcards: Wildcards) -> typing.Dict[str, str]: - for panel in self.config.path_target_interval_list_mapping: - if panel.name == wildcards.panel_name: - return {"region": panel.path} + # Logs of metrics not linked + for report in ("metrics", "segmetrics", "genemetrics"): + if self.cfg.get(report).get("enabled"): + result_files.append(base_report_lib + report + ".tsv") - def _get_params_target(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "split": self.config[self.name]["target"]["split"], - "avg_size": self.config[self.name]["target"]["avg_size"], - } + # Logs of plots not links + # TODO: Mouse date: only chromosomes 1 to 19 + chrs = ["all"] + list(map(str, range(1, 23))) + ["X"] + if ( + self.cfg.sample_sex.source != SexOrigin.CONFIG + or self.cfg.sample_sex.default == SexValue.FEMALE + ): + chrs.append("Y") - def _get_output_files_target(self) -> typing.Dict[str, str]: - return {"region": f"work/{self.name}/out/{{panel_name}}_target.bed"} + for plot in ("scatter",): + if self.cfg.get(plot).get("enabled"): + for chr in chrs: + result_files.append(base_plot_lib + f"{plot}.{chr}.jpeg") - # ----- Antitarget ---------------------------------------------------------------------------- + result_files += [x + ".md5" for x in result_files] + return result_files - def _get_input_files_antitarget(self, 
wildcards: Wildcards) -> typing.Dict[str, str]: - # No antitarget for WGS - return { - "target": f"work/{self.name}/out/{wildcards.panel_name}_target.bed", - "access": f"work/{self.name}/out/access.bed", - } + # ----- Access -------------------------------------------------------------------------------- - def _get_params_antitarget(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "avg_size": self.config[self.name]["antitarget"]["avg_size"], - "min_size": self.config[self.name]["antitarget"]["min_size"], + def _get_input_files_access(self, wildcards: Wildcards) -> dict[str, str]: + assert self.create_access, "Should not build access, already available" + return {} + + def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + """ + Arguments used to compute accessible regions for mapping + + When accessible regions are needed to compute average target size + (WGS without average target size set in the config) + then accessible region must cover the full genome (except masked). + Otherwise, access is built with excluded regions. + This happens when the average target size is set in the config in WGS, + or for WES. + """ + assert self.create_access, "Should not build access, already available" + return dict(input) | { + "reference": self.w_config.static_data_config.reference.path, + "min-gap-size": self.cfg.access.min_gap_size, + "exclude": self.cfg.access.exclude, } - def _get_output_files_antitarget(self) -> typing.Dict[str, str]: - return {"region": f"work/{self.name}/out/{{panel_name}}_antitarget.bed"} + # ----- Autobin (never used directly, only to compute target size in WGS settings) ------------ - # ----- Coverage ------------------------------------------------------------------------------ + def _get_input_files_autobin(self, wildcards: Wildcards) -> dict[str, str]: + """ + Input files used to get a good estimate of the average target size - def _get_input_files_coverage(self, wildcards: Wildcards) -> typing.Dict[str, str]: - # BAM/BAI file + This is only used for WGS data when the average target size isn't set in the config. 
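+ For illustration (hypothetical names): with mapper "bwa" and normal library
+ "P001-N1-DNA1-WGS1", the single BAM input resolves to
+ "output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam".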
+ The access must be computed over the whole genome (no exclude files) + """ + assert wildcards["library_name"] not in self.tumors, "Autobin always computed on normals" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) - input_files = { - "bam": ngs_mapping(base_path + ".bam"), - "bai": ngs_mapping(base_path + ".bam.bai"), - } + tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}.bam".format(**wildcards) + input_files = {"bams": [ngs_mapping(tpl)]} + if self.create_access: + if self.plain_access: + input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + else: + input_files["target"] = self.base_out.format(**wildcards) + "access.bed" + return input_files - # Region (target or antitarget) file - panel = self.parent.libraryKit.get(wildcards.library_name, None) - if panel is None: - input_files["region"] = f"work/{self.name}/out/access.bed" + def _get_args_autobin(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + assert ( + self.compute_avg_target_size + ), "Trying to estimate average target size for non-WGS samples" + args = dict(input) | {"bp-per-bin": 50000} + if self.plain_access: + args["method"] = "wgs" else: - input_files["region"] = f"work/{self.name}/out/{panel.name}_{wildcards.region}.bed" - return input_files + args["method"] = "amplicon" + if "target" not in args: + args["target"] = self.cfg.path_access + return args - def _get_params_coverage(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "fasta": self.w_config.static_data_config.reference.path, - "count": self.config[self.name]["coverage"]["count"], - "min_mapq": self.config[self.name]["coverage"]["min_mapq"], - "processes": self.default_resource_usage.threads, - } + # ----- Target -------------------------------------------------------------------------------- - def _get_output_files_coverage(self) -> typing.Dict[str, str]: - return {"coverage": f"work/{{mapper}}.{self.name}.{{library_name}}/out/{{region}}.cnn"} + def _get_input_files_target(self, wildcards: Wildcards) -> dict[str, str]: + """Input files to compute the target regions - # ----- Reference ----------------------------------------------------------------------------- + For WES, no input files, it comes from the baits (in arguments) or + the pon, a previously computed file or the baits (no reference needed) - def _get_input_files_reference(self, wildcards: Wildcards) -> typing.Dict[str, str]: - """Builds reference from the paired normal, or flat prior in absence of normal""" + For WGS, target is access, with avg size from the config, or 5000 when + no normal is available (flat prior) or autobin-computed avg size when paired. 
+ In the latter case, the access must be computed from whole genome + (no exclude, no min_avg_size) + """ + assert self.build_ref, "Should not build targets, already available" input_files = {} - normal_library = self.parent.normal_library.get(wildcards.library_name, None) - input_files["normals"] = [ - f"work/{wildcards.mapper}.{self.name}.{normal_library}/out/target.cnn", - f"work/{wildcards.mapper}.{self.name}.{normal_library}/out/antitarget.cnn", - ] + if self.is_wgs: + if self.create_access: + input_files["interval"] = self.base_out.format(**wildcards) + "access.bed" + if self.compute_avg_target_size: + input_files["avg-size"] = self.base_out_lib.format(**wildcards) + "autobin.txt" return input_files - def _get_params_reference(self, wildcards: Wildcards) -> typing.Dict[str, str]: - params = { - "fasta": self.w_config.static_data_config.reference.path, - "cluster": self.config[self.name]["reference"]["cluster"], - "min_cluster_size": self.config[self.name]["reference"]["min_cluster_size"], - "male_reference": self.config[self.name]["use_male_reference"], - "no_gc": self.config[self.name]["reference"]["no_gc"], - "no_edge": self.config[self.name]["reference"]["no_edge"], - "no_rmask": self.config[self.name]["reference"]["no_rmask"], - } - sample_sex = self._get_sample_sex(wildcards.library_name) - if sample_sex == Sex.MALE or sample_sex == Sex.FEMALE: - params["sample_sex"] = str(sample_sex) - return + def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + assert self.build_ref, "Should not build targets, already available" + if self.is_wes: + args = { + "avg-size": self.cfg.target.avg_size, + "split": self.cfg.target.split, + "interval": self.path_baits, + } + else: + assert self.is_wgs, "Panel not implemented yet" + args = dict(input) | {"split": self.cfg.target.split} + if args.get("avg-size", None) is not None: + args["avg-size"] = self._read_autobin_output(args["avg-size"]) + elif self.cfg.target.avg_size is not None: + args["avg-size"] = self.cfg.target.avg_size + else: + args["avg-size"] = 5000 + if self.w_config.static_data_config.get("features", None): + args["annotate"] = self.w_config.static_data_config.features.path + args["short-names"] = self.cfg.target.short_names + return args - def _get_output_files_reference(self) -> typing.Dict[str, str]: - """TODO: flat prior reference should be library-independent""" - return {"reference": f"work/{{mapper}}.{self.name}.{{library_name}}/out/reference.cnn"} + # ----- Antitarget ---------------------------------------------------------------------------- - def _get_input_files_flat_reference_panel(self, wildcards: Wildcards) -> typing.Dict[str, str]: - """Builds reference from the paired normal, or flat prior in absence of normal""" - input_files = {} - panel = self.parent.libraryKit.get(wildcards.library_name, None) - if panel is None: # WGS, target is access, no antitarget - input_files["target"] = f"work/{self.name}/out/access.bed" - else: # WES, both target & antitarget - input_files["target"] = f"work/{self.name}/out/{panel.name}_target.bed" - input_files["antitarget"] = f"work/{self.name}/out/{panel.name}_antitarget.bed" + def _get_input_files_antitarget(self, wildcards: Wildcards) -> dict[str, str]: + input_files = {"target": self.base_out.format(**wildcards) + "target.bed"} + if self.create_access: + input_files["access"] = self.base_out.format(**wildcards) + "access.bed" return input_files - def _get_input_files_flat_reference_panel(self, wildcards: Wildcards) -> typing.Dict[str, str]: - 
return self._get_input_files_flat_reference_panel(wildcards) + def _get_args_antitarget(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "avg-size": self.cfg.antitarget.avg_size, + "min-size": self.cfg.antitarget.min_size, + } + if "access" not in args: + args["access"] = self.cfg.path_access + return args - def _get_params_flat_reference_panel(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return self._get_params_reference(wildcards) + # ----- Coverage ------------------------------------------------------------------------------ - def _get_output_files_flat_reference_panel(self) -> typing.Dict[str, str]: - """TODO: flat prior reference should be library-independent""" - return {"reference": f"work/{{mapper}}.{self.name}/out/reference.{{panel_name}}.cnn"} + def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: + """ + Compute coverage of region (either target or antitarget) + + Except when region provided with file, the region is computed by the pipeline, + and must be inculded with the inputs (possibly from the panel_of_normals step). + For WGS paired, the target regions are sample-dependent, because the optimal + average target size is sample-dependent (via the rough normal sample coverage). + In that case, the target regions must be taken from the normal sample, to + avoid requesting to build targets from the tumor sample. + """ + # BAM/BAI file + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) + input_files = {"bam": ngs_mapping(base_path + ".bam")} + + # Region (target or antitarget) file + if self.build_ref: + if self.compute_avg_target_size: + tpl = self.base_out_lib + "{region}.bed" + if wildcards["library_name"] in self.tumors: + input_files["intervals"] = tpl.format( + mapper=wildcards["mapper"], + library_name=self.parent.matched_normal[wildcards["library_name"]], + region=wildcards["region"], + ) + else: + input_files["intervals"] = tpl.format(**wildcards) + else: + input_files["intervals"] = self.base_out.format(**wildcards) + "{region}.bed" + elif self.pon_source == PanelOfNormalsOrigin.COHORT: + panel_of_normals = self.parent.sub_workflows["panel_of_normals_cnvkit"] + base_path = "output/{mapper}.cnvkit/out/cnvkit.{region}.bed" + input_files["intervals"] = panel_of_normals(base_path) - def _get_input_files_flat_reference_wgs(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return self._get_input_files_flat_reference_panel(wildcards) + return input_files - def _get_params_flat_reference_wgs(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return self._get_params_reference(wildcards) + def _get_args_coverage(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "reference": self.w_config.static_data_config.reference.path, + "min-mapq": self.cfg.coverage.min_mapq, + "count": self.cfg.coverage.count, + } + if "intervals" not in args: + intervals = self.cfg.panel_of_normals.get("path_{region}".format(**wildcards), "") + assert intervals != "", "Missing path to {region}".format(**wildcards) + args["intervals"] = intervals + return args - def _get_output_files_flat_reference_wgs(self) -> typing.Dict[str, str]: - """TODO: flat prior reference should be library-independent""" - return {"reference": f"work/{{mapper}}.{self.name}/out/reference.cnn"} + # ----- Reference (flat or pairwise) ---------------------------------------------------------- - # ----- 
Fix ----------------------------------------------------------------------------------- + def _get_input_files_reference(self, wildcards: Wildcards) -> dict[str, str]: + """Builds reference from the paired normal, or flat prior in absence of normal""" + assert self.build_ref, "Should not build reference" + input_files = {} + if self.paired: + input_files["normals"] = [self.base_out_lib.format(**wildcards) + "target.cnn"] + if self.is_wes: + input_files["normals"].append( + self.base_out_lib.format(**wildcards) + "antitarget.cnn" + ) + elif self.pon_source == PanelOfNormalsOrigin.FLAT: + input_files["target"] = self.base_out.format(**wildcards) + "target.bed" + if self.is_wes: + input_files["antitarget"] = self.base_out.format(**wildcards) + "antitarget.bed" + return input_files - def _get_input_files_fix(self, wildcards: Wildcards) -> typing.Dict[str, str]: - # Coverage on targets - input_files = { - "target": f"work/{wildcards.mapper}.{self.name}.{wildcards.library_name}/out/target.cnn" + def _get_args_reference(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + assert self.build_ref, "Should not build reference" + args = dict(input) | { + "reference": self.w_config.static_data_config.reference.path, + "cluster": self.cfg.cluster, + "no-gc": not self.cfg.gc, + "no-rmask": not self.cfg.rmask, + "no-edge": not self.cfg.get("edge", self.is_wes), } - # Coverage on antitargets when present (absent for WGS) - panel = self.parent.libraryKit.get(wildcards.library_name, None) - if panel is not None: # WGS - no antitarget - input_files["antitarget"] = ( - f"work/{wildcards.mapper}.{self.name}.{wildcards.library_name}/out/antitarget.cnn" - ) - # Get reference from panel of normals if available, otherwise from normal or flat when no normal - if not self.config[self.name]["panel_of_normals"]["enabled"]: # Paired normal or flat - normal_library = self.parent.normal_library.get(wildcards.library_name, None) - if normal_library: - input_files["reference"] = ( - f"work/{{mapper}}.{self.name}.{normal_library}/out/reference.cnn" - ) + if self.cfg.cluster: + args["min-cluster-size"] = self.cfg.min_cluster_size + if self.cfg.diploid_parx_genome: + args["diploid-parx-genome"] = self.cfg.diploid_parx_genome + sample_sex = self._get_sample_sex(wildcards.get("library_name", None)) + if sample_sex is not None: + args["sample-sex"] = str(sample_sex) + if sample_sex == SexValue.MALE and self.paired: + args["male-reference"] = True else: - if panel: - input_files["reference"] = ( - f"work/{{mapper}}.{self.name}/out/reference.{panel.name}.cnn" - ) - else: - input_files["reference"] = f"work/{{mapper}}.{self.name}/out/reference.cnn" - elif ( - self.config[self.name]["panel_of_normals"]["origin"] - == PanelOfNormalsOrigin.PREVIOUS_STEP - ): # Panel_of_normals step - input_files["reference"] = self.parent._get_panel_of_normals_path(self.name, panel) + args["male-reference"] = self.cfg.male_reference else: - input_files["reference"] = self.config[self.name]["panel_of_normals"][ - "path_panel_of_normals" - ] + args["male-reference"] = self.cfg.male_reference + return args + + # ----- Fix ----------------------------------------------------------------------------------- + + def _get_input_files_fix(self, wildcards: Wildcards) -> dict[str, str]: + # Coverage on targets & optionally on antitargets + input_files = {"target": self.base_out_lib.format(**wildcards) + "target.cnn"} + if self.is_wes: + input_files["antitarget"] = self.base_out_lib.format(**wildcards) + "antitarget.cnn" + if self.paired: + tpl = 
"{mapper}.cnvkit.{normal_library}".format( + mapper=wildcards["mapper"], + normal_library=self.parent.matched_normal[wildcards["library_name"]], + ) + input_files["reference"] = os.path.join("work", tpl, "out", tpl + ".reference.cnn") + elif self.pon_source == PanelOfNormalsOrigin.FLAT: + input_files["reference"] = self.base_out.format(**wildcards) + "reference.cnn" + elif self.pon_source == PanelOfNormalsOrigin.COHORT: + panel_of_normals = self.parent.sub_workflows["panel_of_normals_cnvkit"] + base_path = "output/{mapper}.cnvkit/out/cnvkit.panel_of_normals.cnn" + input_files["reference"] = panel_of_normals(base_path) return input_files - def _get_params_fix(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "sample_id": wildcards.library_name, - "cluster": self.config[self.name]["fix"]["cluster"], - "no_gc": self.config[self.name]["fix"]["no_gc"], - "no_edge": self.config[self.name]["fix"]["no_edge"], - "no_rmask": self.config[self.name]["fix"]["no_rmask"], + def _get_args_fix(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "cluster": self.cfg.cluster, + "no-gc": not self.cfg.gc, + "no-rmask": not self.cfg.rmask, + "no-edge": not self.cfg.get("edge", self.is_wes), } - - def _get_output_files_fix(self) -> typing.Dict[str, str]: - base_name = f"{{mapper}}.{self.name}.{{library_name}}" - return {"coverage": os.path.join("work", base_name, "out", base_name + ".cnr")} + args["sample-id"] = wildcards.library_name + if self.cfg.diploid_parx_genome: + args["diploid-parx-genome"] = self.cfg.diploid_parx_genome + if "reference" not in args: + args["reference"] = self.cfg.panel_of_normals.path_panel_of_normals + return args + + # ----- Variant-related convenience functions ------------------------------------------------- + + def _variants_from_cohort_input(self) -> str: + variants = self.parent.sub_workflows["somatic_variant_calling_cnvkit"] + tpl = f"{{mapper}}.{self.cfg.somatic_variant_calling.tool}.{{library_name}}" + base_path = os.path.join("output", tpl, "out", tpl + ".vcf.gz") + return variants(base_path) + + def _variants_args(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "min-variant-depth": self.cfg.somatic_variant_calling.min_variant_depth, + "sample-id": wildcards.library_name, + "normal-id": self.parent.matched_normal[wildcards.library_name], + } + if self.cfg.somatic_variant_calling.zygocity_freq is not None: + args["zygicity-freq"] = self.cfg.somatic_variant_calling.zygocity_freq + return args # ----- Segment ------------------------------------------------------------------------------- - def _get_input_files_segment(self, wildcards: Wildcards) -> typing.Dict[str, str]: + def _get_input_files_segment(self, wildcards: Wildcards) -> dict[str, str]: # Coverage - base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" - input_files = {"coverage": f"work/{base_name}/out/{base_name}.cnr"} - # Segmentation using SNVs if requested and available (normal must be present) - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + input_files = {"ratios": self.base_out_lib.format(**wildcards) + "cnr"} + # Segmentation using SNVs from cohort + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) return input_files - def _get_params_segment(self, wildcards: Wildcards) -> 
typing.Dict[str, str]: + def _get_args_segment(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: # Segmentation parameters - params = { - "method": self.config[self.name]["segment"]["method"], - "threshold": self.config[self.name]["segment"]["threshold"], - "drop_low_coverage": self.config[self.name]["segment"]["drop_low_coverage"], - "drop_outliers": self.config[self.name]["segment"]["drop_outliers"], - } - if self.config[self.name]["segment"]["method"] == "cbs": - params["smooth_cbs"] = self.config[self.name]["segment"]["smooth_cbs"] - params["processes"] = self.default_resource_usage.threads - # Normal & tumor sample ids if SNVs - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - params["sample_id"] = wildcards.library_name - params["normal_id"] = self.normal_library[wildcards.library_name] - params["min_variant_depth"] = self.config[self.name]["segment"]["min_variant_depth"] - params["zygocity_freq"] = self.config[self.name]["segment"]["zygocity_freq"] - return params - - def _get_output_files_segment(self) -> typing.Dict[str, str]: - base_name = f"{{mapper}}.{self.name}.{{library_name}}" - return { - "segments": os.path.join("work", base_name, "out", base_name + ".segments.cns"), - "dataframe": os.path.join("work", base_name, "out", "dataframe.rds"), + args = dict(input) | { + "method": self.cfg.segment.method, + "threshold": self.cfg.segment.threshold, + "drop-outliers": self.cfg.segment.drop_outliers, + "drop-low-coverage": self.cfg.drop_low_coverage, } + if self.cfg.segment.method == CnvkitSegmentationMethod.CBS: + args["smooth-cbs"] = self.cfg.segment.smooth_cbs + if self.cfg.somatic_variant_calling.enabled: + args |= self._variants_args(wildcards, input) + if "variants" not in args: + args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling + return args # ----- Call ---------------------------------------------------------------------------------- - def _get_input_files_call(self, wildcards: Wildcards) -> typing.Dict[str, str]: + def _get_input_files_call(self, wildcards: Wildcards) -> dict[str, str]: # Segmentation - base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" - input_files = {"segments": f"work/{base_name}/out/{base_name}.segments.cns"} - # SNVs if requested and available (normal must be present) - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" - # Purity from the tool if requested and not from the samplesheet + input_files = {"segments": self.base_out_lib.format(**wildcards) + "segments.cns"} + # Segmentation using SNVs from cohort + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) + # Purity from the tool if ( - self.config[self.name]["purity"]["enabled"] and self.config[self.name]["purity"]["tool"] - ): # Need purity, and can use tool to obain it - if ( - self.config[self.name]["purity"]["ignore_samplesheet"] - or wildcards.library_name not in self.parent.purity - ): - # Don't use samplesheet - input_files["purity"] = ( - f"work/{base_name}/out/{wildcards.mapper}.{self.config.purity.tool}.txt" - ) + self.cfg.somatic_purity_ploidy_estimate.enabled + and self.cfg.somatic_purity_ploidy_estimate.source == PurityOrigin.COHORT + ): + purity = self.parent.sub_workflows["somatic_purity_ploidy_estimate_cnvkit"] + tpl = 
f"{{mapper}}.{self.cfg.somatic_purity_ploidy_estimate.tool}.{{library_name}}" + base_path = os.path.join("output", tpl, "out", tpl + ".txt") + input_files["purity_file"] = purity(base_path).format(**wildcards) return input_files - def _get_params_call(self, wildcards: Wildcards) -> typing.Dict[str, str]: + def _get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: # Call parameters - params = { - "method": self.config[self.name]["call"]["method"], - "thresholds": self.config[self.name]["call"]["thresholds"], - "filter": self.config[self.name]["call"]["filter"], - "drop_low_coverage": self.config[self.name]["call"]["drop_low_coverage"], - "male_reference": self.config[self.name]["use_male_reference"], + args = dict(input) | { + "method": self.cfg.call.method, + "thresholds": self.cfg.call.thresholds, + "drop-low-coverage": self.cfg.drop_low_coverage, + "male-reference": self.cfg.male_reference, } - # If center_at defined, use it, otherwise use the center method - center = self.config[self.name]["call"].get("center_at", None) - if center is not None: - params["center_at"] = center + if self.cfg.call.center_at is not None: + args["center-at"] = self.cfg.call.center_at else: - params["center"] = self.config[self.name]["call"].get("center", "None") - # Normal & tumor sample ids if SNVs - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - params["sample_id"] = wildcards.library_name - params["normal_id"] = self.normal_library[wildcards.library_name] + if self.cfg.call.center is not None: + args["center"] = self.cfg.call.center + if self.cfg.diploid_parx_genome: + args["diploid-parx-genome"] = self.cfg.diploid_parx_genome + if self.cfg.somatic_variant_calling.enabled: + args |= self._variants_args(wildcards, input) + if "variants" not in args: + args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling # Sample sex if known, otherwise guessed by the tool sample_sex = self._get_sample_sex(wildcards.library_name) - if sample_sex == Sex.MALE or sample_sex == Sex.FEMALE: - params["sample_sex"] = sample_sex - # If requested, purity from samplesheet or from default if no tool - if self.config[self.name]["purity"]["enabled"]: - purity = self.parent.purity.get( - wildcards.library_name, self.config.purity.default_purity - ) - if purity is not None and not self.config[self.name]["purity"]["ignore_samplesheet"]: - params["purity"] = purity - if self.config.default_ploidy: - params["ploidy"] = self.config.default_ploidy - return params - - def _get_output_files_call(self) -> typing.Dict[str, str]: - base_name = f"{{mapper}}.{self.name}.{{library_name}}" - return {"calls": os.path.join("work", base_name, "out", base_name + ".cns")} + if sample_sex is not None: + args["sample-sex"] = str(sample_sex) + if sample_sex == SexValue.MALE and self.paired: + args["male-reference"] = True + # If requested, purity from samplesheet or from default + if self.cfg.somatic_purity_ploidy_estimate.enabled: + if args.get("purity_file", None) is not None: + (purity, ploidy) = self._read_purity_ploidy_output(args["purity_file"]) + elif self.cfg.somatic_purity_ploidy_estimate.source == PurityOrigin.SAMPLESHEET: + purity = self.tumors[wildcards.library_name].purity + ploidy = self.tumors[wildcards.library_name].ploidy + elif self.cfg.somatic_purity_ploidy_estimate.source == PurityOrigin.CONFIG: + purity = self.cfg.purity.purity + ploidy = self.cfg.purity.ploidy + args["purity"] = purity + args["ploidy"] = ploidy 
+ return args # ----- Bintest ------------------------------------------------------------------------------- - def _get_input_files_bintest(self, wildcards: Wildcards) -> typing.Dict[str, str]: - base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + def _get_input_files_bintest(self, wildcards: Wildcards) -> dict[str, str]: return { - "coverage": f"work/{base_name}/out/{base_name}.cnr", - "segments": f"work/{base_name}/out/{base_name}.segments.cns", + "ratios": self.base_out_lib.format(**wildcards) + "cnr", + "segments": self.base_out_lib.format(**wildcards) + "segments.cns", } - def _get_params_bintest(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "alpha": self.config[self.name]["bintest"]["alpha"], - "target": self.config[self.name]["bintest"]["target"], + def _get_args_bintest(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + return dict(input) | { + "alpha": self.cfg.bintest.alpha, + "target": self.cfg.bintest.target, } - def _get_output_files_bintest(self) -> typing.Dict[str, str]: - base_name = f"{{mapper}}.{self.name}.{{library_name}}" - return {"coverage": os.path.join("work", base_name, "out", base_name + ".bintest.cnr")} - # ----- Plots -------------------------------------------------------------------------------- - def _get_input_files_plot_diagram(self, wildcards: Wildcards) -> typing.Dict[str, str]: - base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" + def _get_input_files_scatter(self, wildcards: Wildcards) -> dict[str, str]: input_files = { - "coverage": f"work/{base_name}/out/{base_name}.cnr", - "segments": f"work/{base_name}/out/{base_name}.segments.cns", + "ratios": self.base_out_lib.format(**wildcards) + "cnr", + "segments": self.base_out_lib.format(**wildcards) + "segments.cns", } - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) return input_files - def _get_params_plot_diagram(self, wildcards: Wildcards) -> typing.Dict[str, str]: - return { - "threshold": self.config[self.name]["plots"]["diagram"]["threshold"], - "min_probes": self.config[self.name]["plots"]["diagram"]["min_probes"], - "no_shift_xy": self.config[self.name]["plots"]["diagram"]["no_shift_xy"], - } - - def _get_output_files_plot_diagram(self) -> typing.Dict[str, str]: - base_name = f"{{mapper}}.{self.name}.{{library_name}}" - return {"figure": os.path.join("work", base_name, "plot", "diagram", "{contig_name}.pdf")} - - def _get_input_files_plot_scatter(self, wildcards: Wildcards) -> typing.Dict[str, str]: - base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}" - input_files = { - "coverage": f"work/{base_name}/out/{base_name}.cnr", - "segments": f"work/{base_name}/out/{base_name}.segments.cns", + def _get_args_scatter(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "antitarget-marker": self.cfg.scatter.antitarget_marker, + "by-bin": self.cfg.scatter.by_bin, + "segment-color": self.cfg.scatter.segment_color, + "trend": self.cfg.scatter.trend, + "fig-size": self.cfg.scatter.fig_size, + "width": self.cfg.scatter.width, } - variants = self.config[self.name].get("variants", None) - if variants and wildcards.library_name in self.normal_library: - input_files["variants"] = f"work/{base_name}/out/{base_name}.vcf.gz" 
-        return input_files
-
-    def _get_params_plot_scatter(self, wildcards: Wildcards) -> typing.Dict[str, str]:
-        params = {
-            "chromosome": wildcards.contig_name,
-            "antitarget_marker": self.config[self.name]["plots"]["scatter"]["antitarget_marker"],
-            "by_bin": self.config[self.name]["plots"]["scatter"]["by_bin"],
-            "segment_color": self.config[self.name]["plots"]["scatter"]["segment_color"],
-            "trend": self.config[self.name]["plots"]["scatter"]["trend"],
-            "y_max": self.config[self.name]["plots"]["scatter"]["y_max"],
-            "y_min": self.config[self.name]["plots"]["scatter"]["y_min"],
-            "fig_size": self.config[self.name]["plots"]["scatter"]["fig_size"],
-            "sample_id": wildcards.library_name,
-        }
-        variants = self.config[self.name].get("variants", None)
-        if variants and wildcards.library_name in self.normal_library:
-            params["normal_id"] = self.normal_library[wildcards.library_name]
-            params["min_variant_depth"] = self.config[self.name]["plots"]["scatter"][
-                "min_variant_depth"
-            ]
-            params["zygocity_freq"] = self.config[self.name]["plots"]["scatter"]["zygocity_freq"]
-        return params
-
-    def _get_output_files_plot_scatter(self) -> typing.Dict[str, str]:
-        base_name = f"{{mapper}}.{self.name}.{{library_name}}"
-        return {"figure": os.path.join("work", base_name, "plot", "scatter", "{contig_name}.pdf")}
+        if self.cfg.scatter.y_min is not None:
+            args["y-min"] = self.cfg.scatter.y_min
+        if self.cfg.scatter.y_max is not None:
+            args["y-max"] = self.cfg.scatter.y_max
+        if wildcards["contig_name"] != "all":
+            args["chromosome"] = wildcards["contig_name"]
+        if self.cfg.somatic_variant_calling.enabled:
+            args |= self._variants_args(wildcards, input)
+            if "variants" not in args:
+                args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling
+        args["title"] = f"{wildcards['library_name']} - {wildcards['contig_name']}"
+        return args

     # ----- Metrics (metrics & segmetrics) --------------------------------------------------------

-    def _get_input_files_report_metrics(self, wildcards: Wildcards) -> typing.Dict[str, str]:
-        base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}"
+    def _get_input_files_metrics(self, wildcards: Wildcards) -> dict[str, str]:
         return {
-            "coverage": f"work/{base_name}/out/{base_name}.cnr",
-            "segments": f"work/{base_name}/out/{base_name}.segments.cns",
+            "ratios": self.base_out_lib.format(**wildcards) + "cnr",
+            "segments": self.base_out_lib.format(**wildcards) + "segments.cns",
         }

-    def _get_params_report_metrics(self, wildcards: Wildcards) -> typing.Dict[str, str]:
-        return {"drop_low_coverage": self.config[self.name]["reports"]["drop_low_coverage"]}
+    def _get_args_metrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        return dict(input) | {"drop-low-coverage": self.cfg.drop_low_coverage}

-    def _get_output_files_report_metrics(self) -> typing.Dict[str, str]:
-        base_name = f"{{mapper}}.{self.name}.{{library_name}}"
-        return {"report": os.path.join("work", base_name, "report", base_name + ".metrics.tsv")}
-
-    def _get_input_files_report_segmetrics(self, wildcards: Wildcards) -> typing.Dict[str, str]:
-        base_name = f"{wildcards.mapper}.{self.name}.{wildcards.library_name}"
+    def _get_input_files_segmetrics(self, wildcards: Wildcards) -> dict[str, str]:
         return {
-            "coverage": f"work/{base_name}/out/{base_name}.cnr",
-            "segments": f"work/{base_name}/out/{base_name}.segments.cns",
+            "ratios": self.base_out_lib.format(**wildcards) + "cnr",
+            "segments": self.base_out_lib.format(**wildcards) + "segments.cns",
+        }
+
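# [Editor's note -- illustrative sketch only, not part of the patch.] The
# recurring pattern "dict(input) | {...}" merges the rule's input paths and
# the tool options into one flat args dict (PEP 584 dict union, Python >= 3.9);
# keys on the right-hand side win on collision:
input_files = {"ratios": "work/sample/out/sample.cnr"}  # hypothetical path
options = {"drop-low-coverage": True, "alpha": 0.05}
args = dict(input_files) | options
assert args == {
    "ratios": "work/sample/out/sample.cnr",
    "drop-low-coverage": True,
    "alpha": 0.05,
}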
+    def _get_args_segmetrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        return dict(input) | {
+            "drop-low-coverage": self.cfg.drop_low_coverage,
+            "alpha": self.cfg.segmetrics.alpha,
+            "bootstrap": self.cfg.segmetrics.bootstrap,
+            "smooth-bootstrap": self.cfg.segmetrics.smooth_bootstrap,
+            "stats": self.cfg.segmetrics.stats,
         }

-    def _get_params_report_segmetrics(self, wildcards: Wildcards) -> typing.Dict[str, str]:
+    def _get_input_files_genemetrics(self, wildcards: Wildcards) -> dict[str, str]:
         return {
-            "drop_low_coverage": self.config[self.name]["reports"]["drop_low_coverage"],
-            "stats": (
-                "mean",
-                "median",
-                "mode",
-                "t-test",
-                "stdev",
-                "sem",
-                "mad",
-                "mse",
-                "iqr",
-                "bivar",
-                "ci",
-                "pi",
-            ),
-            "alpha": self.config[self.name]["reports"]["alpha"],
-            "bootstrap": self.config[self.name]["reports"]["bootstrap"],
+            "ratios": self.base_out_lib.format(**wildcards) + "cnr",
+            "segments": self.base_out_lib.format(**wildcards) + "segments.cns",
         }

-    def _get_output_files_report_segmetrics(self) -> typing.Dict[str, str]:
-        base_name = f"{{mapper}}.{self.name}.{{library_name}}"
-        return {"report": os.path.join("work", base_name, "report", base_name + ".segmetrics.tsv")}
+    def _get_args_genemetrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        args = dict(input) | {
+            "drop-low-coverage": self.cfg.drop_low_coverage,
+            "male-reference": self.cfg.male_reference,
+            "threshold": self.cfg.genemetrics.threshold,
+            "min-probes": self.cfg.genemetrics.min_probes,
+            "alpha": self.cfg.genemetrics.alpha,
+            "bootstrap": self.cfg.genemetrics.bootstrap,
+            "stats": [x.replace("t-test", "ttest") for x in self.cfg.genemetrics.stats],
+        }
+        if self.cfg.diploid_parx_genome:
+            args["diploid-parx-genome"] = self.cfg.diploid_parx_genome
+        sample_sex = self._get_sample_sex(wildcards.library_name)
+        if sample_sex is not None:
+            args["sample-sex"] = str(sample_sex)
+            if sample_sex == SexValue.MALE and self.paired:
+                args["male-reference"] = True
+        return args
+
+    # ----- Read small files to put values in parameters
+
+    def _read_autobin_output(self, filename: str) -> int:
+        nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)"
+        pattern = re.compile("^Target:[ \t]+" + nb + "[ \t]+" + nb + "$")
+        with open(filename) as f:
+            for line in f:
+                m = pattern.match(line)
+                if m:
+                    return int(float(m.groups()[4]))
+        return -1
+
+    def _read_purity_ploidy_output(self, filename: str) -> tuple[float, float]:
+        # TODO: Tool-dependent parsing of purity/ploidy file
+        nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)"
+        pattern = re.compile("^Purity/ploidy:[ \t]+" + nb + "[ \t]+" + nb + "$")
+        with open(filename) as f:
+            for line in f:
+                m = pattern.match(line)
+                if m:
+                    return (float(m.groups()[0]), float(m.groups()[4]))
+        return (-1.0, -1.0)
+
+
+class LibraryInfo(NamedTuple):
+    library: NGSLibrary
+    donor: str
+    is_tumor: bool
+    libraryType: str
+    libraryKit: str | None
+    sex: SexValue | None
+    purity: float | None
+    ploidy: float = 2


 class SomaticCnvCallingWorkflow(BaseStep):
@@ -800,6 +1045,21 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
             config_model_class=SomaticCnvCallingConfigModel,
             previous_steps=(NgsMappingWorkflow,),
         )
+        # Collect extra information per library
+        self.valid_dna_libraries = {}
+        for sheet in self.shortcut_sheets:
+            self.valid_dna_libraries |= SomaticCnvCallingWorkflow._get_dna_libraries(sheet)
+
+        # All tumor samples, by libraryKit, with None for WGS
+        self.tumors = SomaticCnvCallingWorkflow._split_by(
+            SomaticCnvCallingWorkflow._filter_by(
+                self.valid_dna_libraries.values(), "is_tumor", lambda x: x
+            ),
+            "libraryKit",
+        )
+
+        self.matched_normal = self._match_normals()
+
         # Register sub step classes so the sub steps are available
         self.register_sub_step_classes(
             (
@@ -812,194 +1072,153 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
         )
         # Initialize sub-workflows
         self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping)
-        self.registered_pons = self._optionally_register_pon()
-
-        # Collect extra information per library
-        self.normal_library = self._get_normal_library()
-        self.libraryKit = self._get_panel_information()
-        self.sex = self._get_sex()
-        self.purity = self._get_purity()
+        for subworkflow in (
+            "panel_of_normals",
+            "somatic_variant_calling",
+            "somatic_purity_ploidy_estimate",
+        ):
+            self._optionally_register_subworkflow(subworkflow)

     def get_result_files(self) -> OutputFiles:
         fns = []
-        for seq_type, tools in self.config.tools:
-            for library in self._get_libraries():
-                if library.extra_infos.get("libraryType", "").lower() != seq_type:
-                    continue
-                test_sample = library.test_sample
-                if test_sample.extra_infos.get("extractionType", "") != "DNA":
-                    continue
-                bio_sample = test_sample.bio_sample
-                is_tumor = bio_sample.extra_infos.get("isTumor", True)
-                if is_tumor:
-                    for tool in tools:
-                        f = self.substep_getattr(tool, "get_result_files")
-                        for mapper in self.w_config.step_config["ngs_mapping"]["tools"]["dna"]:
-                            for fn in f(library.name, mapper):
-                                fns.append(fn)
+
+        for tool in self.config.tools.wgs:
+            for mapper in self.w_config.step_config["ngs_mapping"].tools.dna:
+                for library in self.tumors.get(None, []):
+                    fns += self.sub_steps.get(tool).get_result_files(library.library.name, mapper)
+
+        for tool in self.config.tools.wes:
+            for mapper in self.w_config.step_config["ngs_mapping"].tools.dna:
+                for libraryKit in self.tumors.keys():
+                    if libraryKit is None:
+                        continue
+                    for library in self.tumors.get(libraryKit):
+                        fns += self.sub_steps.get(tool).get_result_files(
+                            library.library.name, mapper
+                        )
+
         return OutputFiles(fns)

-    def _get_libraries(self) -> typing.Iterator[NGSLibrary]:
-        for sheet in self.shortcut_sheets:
-            for donor in sheet.sheet.bio_entities.values():
-                for bio_sample in donor.bio_samples.values():
-                    for test_sample in bio_sample.test_samples.values():
-                        for library in test_sample.ngs_libraries.values():
-                            yield library
-
-    def _get_normal_library(self) -> typing.Dict[str, str]:
-        normal_for_donor = {}
-        for library in self._get_libraries():
-            test_sample = library.test_sample
-            if test_sample.extra_infos.get("extractionType", "") != "DNA":
-                continue
-            bio_sample = test_sample.bio_sample
-            is_tumor = bio_sample.extra_infos.get("isTumor", None)
-            if is_tumor is None:
-                raise ValueError(f"Missing 'isTumor' value for library '{library.name}'")
-            if is_tumor:
-                continue
-            donor = bio_sample.bio_entity
-            if donor.name in normal_for_donor:
-                raise ValueError(f"Multiple normals for donor '{donor.name}'")
-            normal_for_donor[donor.name] = library.name
-
-        normal_library = {}
-        for library in self._get_libraries():
-            test_sample = library.test_sample
-            if test_sample.extra_infos.get("extractionType", "") != "DNA":
-                continue
-            bio_sample = test_sample.bio_sample
-            donor = bio_sample.bio_entity
-            if bio_sample.extra_infos.get("isTumor", True):
-                normal_library[library.name] = normal_for_donor[donor.name]
-        return normal_library
-
-    def _optionally_register_pon(self) -> typing.Dict[str, str]:
-        """
-        Register all possible combination of panel of normals:
-        - WGS PON for all configured WGS tools which require/can use it
-        - WES PON for all configured WES tools which require/can use it, one for each enrichment kit
+    def _match_normals(self):
+        normals = SomaticCnvCallingWorkflow._split_by(
+            SomaticCnvCallingWorkflow._filter_by(
+                self.valid_dna_libraries.values(), "is_tumor", lambda x: not x
+            ),
+            "libraryKit",
+        )

-        Note that there is no need to specify the genome release,
-        because the panel_of_normals step used here MUST be in the same project,
-        so it has the same configuration, and only one genome release is allowed per configuration.
-        """
-        registered_pons = list()
-        for tool in self.config.tools.wgs:
-            pon_name = f"wgs.{tool}"
-            if pon_name in registered_pons:
-                continue
-            if self.config[tool].get("panel_of_normals", None) and self.config[
-                tool
-            ].panel_of_normals.get("path_panel_of_normals_step", None):
+        # Pairing between tumor & normals (must share the same libraryKit)
+        matched_normal = {
+            sample.library.name: None for samples in self.tumors.values() for sample in samples
+        }
+        for libraryKit, samples in self.tumors.items():
+            if libraryKit in normals:
+                normals_by_donor = SomaticCnvCallingWorkflow._split_by(normals[libraryKit], "donor")
+                for sample in samples:
+                    donor = sample.donor
+                    normal = normals_by_donor.get(donor, [])
+                    assert (
+                        len(normal) < 2
+                    ), f"Multiple valid normal samples for tumor library {sample.library.name}"
+                    if normal:
+                        normal_library = normal[0].library
+                        matched_normal[sample.library.name] = normal_library.name
+        return matched_normal
+
+    def _optionally_register_subworkflow(self, subworkflow):
+        for tool in set(self.config.tools.wgs + self.config.tools.wes):
+            assert self.config.get(tool) is not None, f"Requested tool '{tool}' not configured"
+            cfg = self.config.get(tool)
+            subworkflow_config = cfg.get(subworkflow)
+            if (
+                subworkflow_config
+                and subworkflow_config.enabled
+                and str(subworkflow_config.source) == "cohort"
+            ):
                 self.register_sub_workflow(
-                    "panel_of_normals",
-                    self.config[tool].panel_of_normals.path_panel_of_normals_step,
-                    pon_name,
+                    subworkflow,
+                    subworkflow_config.get(f"path_{subworkflow}"),
+                    f"{subworkflow}_{tool}",
                 )
-                registered_pons.append(pon_name)
-        for tool in self.config.tools.wes:
-            for panel in self.config.path_target_interval_list_mapping:
-                pon_name = f"wes.{tool}.{panel.name}"
-                if pon_name in registered_pons:
-                    continue
-                if self.config[tool].get("panel_of_normals", None) and self.config[
-                    tool
-                ].panel_of_normals.get("path_panel_of_normals_step", None):
-                    self.register_sub_workflow(
-                        "panel_of_normals",
-                        self.config[tool].panel_of_normals.path_panel_of_normals_step,
-                        pon_name,
-                    )
-                    registered_pons.append(pon_name)
-        return registered_pons

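# [Editor's note -- illustrative aside, not part of the patch.] With
# hypothetical libraries, _match_normals() pairs each tumor with the unique
# normal sharing the same donor and libraryKit, and leaves None otherwise:
#
#   tumors:  P001-T1-DNA1 (donor P001, kit Agilent_V6)
#            P002-T1-DNA1 (donor P002, kit Agilent_V6)
#   normals: P001-N1-DNA1 (donor P001, kit Agilent_V6)
#
#   matched_normal == {"P001-T1-DNA1": "P001-N1-DNA1", "P002-T1-DNA1": None}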
-    def _get_panel_information(self) -> typing.Dict[str, str]:
-        # Set default panel
-        default = None
-        for panel in self.config.path_target_interval_list_mapping:
-            if panel.name == "__default__":
-                default = panel
-                break
-
-        # Extract library pattern (the "libraryKit" column in samplesheet)
-        # On output:
-        #  - the panel name and panel path if libraryKit is present & known
-        #  - the default panel path if libraryKit is undefined or not found
-        #  - None for WGS
-        #  - ValueError if libraryType is missing or unknown (not WES nor WGS)
-        libraryKit = {}
-        for library in self._get_libraries():
-            test_sample = library.test_sample
-            if test_sample.extra_infos.get("extractionType", "") != "DNA":
-                continue
-
-            libraryType = library.extra_infos.get("libraryType", None)
-            if libraryType is None:
-                raise ValueError(f"Missing library type for library '{library.name}'")
-            elif libraryType == "WES":
-                if library.extra_infos.get("libraryKit", None):
-                    for panel in self.config.path_target_interval_list_mapping:
-                        if re.match(panel.pattern, library.extra_infos.get("libraryKit")):
-                            libraryKit[library.name] = panel
-                            break
-                    if library.name not in libraryKit:
-                        libraryKit[library.name] = default
+    @staticmethod
+    def _get_dna_libraries(sheet) -> dict[str, LibraryInfo]:
+        allowed_library_types = [
+            k for k, v in LIBRARY_TO_EXTRACTION.items() if v == EXTRACTION_TYPE_DNA
+        ]
+
+        valid_dna_libraries = {}
+        for donor in sheet.sheet.bio_entities.values():
+            sex: SexValue | None = donor.extra_infos.get("sex", None)
+            for bio_sample in donor.bio_samples.values():
+                is_tumor = bio_sample.extra_infos.get("isTumor", None)
+                assert (
+                    is_tumor is not None
+                ), f"Missing 'isTumor' value for sample '{donor.name}-{bio_sample.name}'"
+                if is_tumor:
+                    purity = bio_sample.extra_infos.get("purity", None)
+                    ploidy = bio_sample.extra_infos.get("ploidy", 2)
                 else:
-                    libraryKit[library.name] = default
-                if libraryKit[library.name] is None:
-                    raise ValueError(f"Undefined panel for library '{library.name}")
-            elif libraryType == "WGS":
-                libraryKit[library.name] = None
-            else:
-                raise ValueError(
-                    f"Unknown library type '{libraryType}' for library '{library.name}'"
-                )
+                    purity = None
+                    ploidy = 2
+                for test_sample in bio_sample.test_samples.values():
+                    if (
+                        test_sample.extra_infos.get("extractionType", "").upper()
+                        != EXTRACTION_TYPE_DNA
+                    ):
+                        continue
+                    for library in test_sample.ngs_libraries.values():
+                        assert (
+                            library.name not in valid_dna_libraries
+                        ), f"Duplicate entry for library {library.name}"
+                        libraryType = library.extra_infos.get("libraryType", None)
+                        assert (
+                            libraryType is not None
+                        ), f"Missing library type for library '{library.name}'"
+                        if libraryType.upper() not in allowed_library_types:
+                            continue
+                        libraryKit = None
+                        if libraryType.upper() == "WES" or libraryType.upper() == "PANEL":
+                            libraryKit = library.extra_infos.get("libraryKit", None)
+                            assert (
+                                libraryKit is not None
+                            ), f"Missing library kit for library '{library.name}'"
+                        valid_dna_libraries[library.name] = LibraryInfo(
+                            library,
+                            donor.name,
+                            is_tumor,
+                            libraryType,
+                            libraryKit,
+                            sex,
+                            purity,
+                            ploidy,
+                        )
+
+        return valid_dna_libraries

-        return libraryKit
-
-    def _get_purity(self) -> typing.Dict[str, str]:
-        """Returns the purity value from the 'purity' library extra_infos. 
Missing otherwise""" - purity = {} - for library in self._get_libraries(): - p = library.extra_infos.get("purity", None) - if p: - try: - p = float(p) - if 0 <= p and p <= 1: - purity[library.name] = p - except: - pass - return purity - - def _get_sex(self) -> typing.Dict[str, Sex]: - sex = {} - for library in self._get_libraries(): - donor = library.test_sample.bio_sample.bio_entity - donor_sex = donor.extra_infos.get("sex", None) - if donor_sex == "male": - donor_sex = Sex.MALE - elif donor_sex == "female": - donor_sex = Sex.FEMALE - else: - donor_sex = Sex.UNKNOWN - sex[library.name] = donor_sex - return sex - - def _get_panel_of_normals_path(self, tool: str, panel: LibraryKitDefinition | None) -> str: - pon_path = None - assert self.config[tool]["panel_of_normals"][ - "enabled" - ], f"Panel of normals not enabled for '{tool}'" - assert ( - self.config[tool]["panel_of_normals"]["origin"] == PanelOfNormalsOrigin.PREVIOUS_STEP - ), f"'{tool}' panel of normals not from previous step" - if panel is None: - pon_id = f"wgs.{tool}" - else: - pon_id = f"wes.{tool}.{panel.name}" - assert pon_id in self.registered_pons, f"Requested panel '{pon_id}' not registered" - pon = self.parent.sub_workflows[pon_id] - pon_path = pon(f"output/{{mapper}}.{tool}/out/{panel.name}.ext") - return pon_path + @staticmethod + def _split_by( + valid_dna_libraries: list[LibraryInfo], i: str = "library" + ) -> dict[Any, list[LibraryInfo]]: + split = {} + for entry in valid_dna_libraries: + index = getattr(entry, i) + if isinstance(index, (int, float, complex, bool)): + index = str(index) + if index not in split: + split[index] = [] + split[index].append(entry) + return split + + @staticmethod + def _filter_by( + valid_dna_libraries: list[LibraryInfo], + i: str = "library", + f: Callable[[Any], bool] = lambda x: True, + ) -> list[LibraryInfo]: + filtered = [] + for entry in valid_dna_libraries: + index = getattr(entry, i) + if f(index): + filtered.append(entry) + return filtered diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules index 8ec5d13fe..cfaf75206 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules +++ b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules @@ -1,8 +1,8 @@ -rule somatic_targeted_seq_cnv_calling_cnvkit_access: +rule somatic_cnv_calling_cnvkit_access: output: **wf.get_output_files("cnvkit", "access"), params: - wf.get_params("cnvkit", "access"), + **{"args": wf.get_args("cnvkit", "access")}, log: **wf.get_log_file("cnvkit", "access"), threads: wf.get_resource("cnvkit", "access", "threads") @@ -15,11 +15,30 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_access: wf.wrapper_path("cnvkit/access") -rule somatic_targeted_seq_cnv_calling_cnvkit_target: +rule somatic_cnv_calling_cnvkit_autobin: + input: + unpack(wf.get_input_files("cnvkit", "autobin")), + output: + **wf.get_output_files("cnvkit", "autobin"), + params: + **{"args": wf.get_args("cnvkit", "autobin")}, + log: + **wf.get_log_file("cnvkit", "autobin"), + threads: wf.get_resource("cnvkit", "autobin", "threads") + resources: + time=wf.get_resource("cnvkit", "autobin", "time"), + memory=wf.get_resource("cnvkit", "autobin", "memory"), + partition=wf.get_resource("cnvkit", "autobin", "partition"), + tmpdir=wf.get_resource("cnvkit", "autobin", "tmpdir"), + wrapper: + wf.wrapper_path("cnvkit/autobin") + + +rule somatic_cnv_calling_cnvkit_target: input: unpack(wf.get_input_files("cnvkit", "target")), params: - wf.get_params("cnvkit", "target"), + 
**{"args": wf.get_args("cnvkit", "target")}, output: **wf.get_output_files("cnvkit", "target"), log: @@ -34,11 +53,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_target: wf.wrapper_path("cnvkit/target") -rule somatic_targeted_seq_cnv_calling_cnvkit_antitarget: +rule somatic_cnv_calling_cnvkit_antitarget: input: unpack(wf.get_input_files("cnvkit", "antitarget")), params: - wf.get_params("cnvkit", "antitarget"), + **{"args": wf.get_args("cnvkit", "antitarget")}, output: **wf.get_output_files("cnvkit", "antitarget"), log: @@ -53,11 +72,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_antitarget: wf.wrapper_path("cnvkit/antitarget") -rule somatic_targeted_seq_cnv_calling_cnvkit_coverage: +rule somatic_cnv_calling_cnvkit_coverage: input: unpack(wf.get_input_files("cnvkit", "coverage")), params: - wf.get_params("cnvkit", "coverage"), + **{"args": wf.get_args("cnvkit", "coverage")}, output: **wf.get_output_files("cnvkit", "coverage"), log: @@ -72,11 +91,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_coverage: wf.wrapper_path("cnvkit/coverage") -rule somatic_targeted_seq_cnv_calling_cnvkit_reference: +rule somatic_cnv_calling_cnvkit_reference: input: unpack(wf.get_input_files("cnvkit", "reference")), params: - wf.get_params("cnvkit", "reference"), + **{"args": wf.get_args("cnvkit", "reference")}, output: **wf.get_output_files("cnvkit", "reference"), log: @@ -91,49 +110,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_reference: wf.wrapper_path("cnvkit/reference") -# rule somatic_targeted_seq_cnv_calling_cnvkit_flat_reference_panel: -# input: -# unpack(wf.get_input_files("cnvkit", "flat_reference_panel")), -# params: -# wf.get_params("cnvkit", "reference"), -# output: -# **wf.get_output_files("cnvkit", "flat_reference_panel"), -# log: -# **wf.get_log_file("cnvkit", "reference"), -# threads: wf.get_resource("cnvkit", "reference", "threads") -# resources: -# time=wf.get_resource("cnvkit", "reference", "time"), -# memory=wf.get_resource("cnvkit", "reference", "memory"), -# partition=wf.get_resource("cnvkit", "reference", "partition"), -# tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), -# wrapper: -# wf.wrapper_path("cnvkit/reference") - - -# rule somatic_targeted_seq_cnv_calling_cnvkit_flat_reference_wgs: -# input: -# unpack(wf.get_input_files("cnvkit", "flat_reference_wgs")), -# params: -# wf.get_params("cnvkit", "reference"), -# output: -# **wf.get_output_files("cnvkit", "flat_reference_wgs"), -# log: -# **wf.get_log_file("cnvkit", "reference"), -# threads: wf.get_resource("cnvkit", "reference", "threads") -# resources: -# time=wf.get_resource("cnvkit", "reference", "time"), -# memory=wf.get_resource("cnvkit", "reference", "memory"), -# partition=wf.get_resource("cnvkit", "reference", "partition"), -# tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), -# wrapper: -# wf.wrapper_path("cnvkit/reference") - - -rule somatic_targeted_seq_cnv_calling_cnvkit_fix: +rule somatic_cnv_calling_cnvkit_fix: input: unpack(wf.get_input_files("cnvkit", "fix")), params: - wf.get_params("cnvkit", "fix"), + **{"args": wf.get_args("cnvkit", "fix")}, output: **wf.get_output_files("cnvkit", "fix"), log: @@ -148,11 +129,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_fix: wf.wrapper_path("cnvkit/fix") -rule somatic_targeted_seq_cnv_calling_cnvkit_segment: +rule somatic_cnv_calling_cnvkit_segment: input: unpack(wf.get_input_files("cnvkit", "segment")), params: - wf.get_params("cnvkit", "segment"), + **{"args": wf.get_args("cnvkit", "segment")}, output: **wf.get_output_files("cnvkit", "segment"), 
    log:
@@ -167,11 +148,11 @@
         wf.wrapper_path("cnvkit/segment")


-rule somatic_targeted_seq_cnv_calling_cnvkit_call:
+rule somatic_cnv_calling_cnvkit_call:
     input:
         unpack(wf.get_input_files("cnvkit", "call")),
     params:
-        wf.get_params("cnvkit", "call"),
+        **{"args": wf.get_args("cnvkit", "call")},
     output:
         **wf.get_output_files("cnvkit", "call"),
     log:
@@ -186,11 +167,13 @@
         wf.wrapper_path("cnvkit/call")


-rule somatic_targeted_seq_cnv_calling_cnvkit_bintest:
+rule somatic_cnv_calling_cnvkit_bintest:
+    input:
+        unpack(wf.get_input_files("cnvkit", "bintest")),
     output:
         **wf.get_output_files("cnvkit", "bintest"),
     params:
-        wf.get_params("cnvkit", "bintest"),
+        **{"args": wf.get_args("cnvkit", "bintest")},
     log:
         **wf.get_log_file("cnvkit", "bintest"),
     threads: wf.get_resource("cnvkit", "bintest", "threads")
@@ -203,77 +186,96 @@
         wf.wrapper_path("cnvkit/bintest")


-rule somatic_targeted_seq_cnv_calling_cnvkit_plot_diagram:
+# rule somatic_cnv_calling_cnvkit_plot_diagram:
+#     input:
+#         unpack(wf.get_input_files("cnvkit", "plot/diagram")),
+#     params:
+#         **{"args": wf.get_args("cnvkit", "plot/diagram")},
+#     output:
+#         **wf.get_output_files("cnvkit", "plot/diagram"),
+#     log:
+#         **wf.get_log_file("cnvkit", "plot/diagram"),
+#     threads: wf.get_resource("cnvkit", "plot/diagram", "threads")
+#     resources:
+#         time=wf.get_resource("cnvkit", "plot/diagram", "time"),
+#         memory=wf.get_resource("cnvkit", "plot/diagram", "memory"),
+#         partition=wf.get_resource("cnvkit", "plot/diagram", "partition"),
+#         tmpdir=wf.get_resource("cnvkit", "plot/diagram", "tmpdir"),
+#     wrapper:
+#         wf.wrapper_path("cnvkit/plot/diagram")
+#
+#
+rule somatic_cnv_calling_cnvkit_plot_scatter:
     input:
-        unpack(wf.get_input_files("cnvkit", "plot/diagram")),
+        unpack(wf.get_input_files("cnvkit", "scatter")),
     params:
-        wf.get_params("cnvkit", "plot/diagram"),
+        **{"args": wf.get_args("cnvkit", "scatter")},
     output:
-        **wf.get_output_files("cnvkit", "plot/diagram"),
+        **wf.get_output_files("cnvkit", "scatter"),
     log:
-        **wf.get_log_file("cnvkit", "plot/diagram"),
-    threads: wf.get_resource("cnvkit", "plot/diagram", "threads")
+        **wf.get_log_file("cnvkit", "scatter"),
+    threads: wf.get_resource("cnvkit", "scatter", "threads")
     resources:
-        time=wf.get_resource("cnvkit", "plot/diagram", "time"),
-        memory=wf.get_resource("cnvkit", "plot/diagram", "memory"),
-        partition=wf.get_resource("cnvkit", "plot/diagram", "partition"),
-        tmpdir=wf.get_resource("cnvkit", "plot/diagram", "tmpdir"),
+        time=wf.get_resource("cnvkit", "scatter", "time"),
+        memory=wf.get_resource("cnvkit", "scatter", "memory"),
+        partition=wf.get_resource("cnvkit", "scatter", "partition"),
+        tmpdir=wf.get_resource("cnvkit", "scatter", "tmpdir"),
     wrapper:
-        wf.wrapper_path("cnvkit/plot/diagram")
+        wf.wrapper_path("cnvkit/plot/scatter")


-rule somatic_targeted_seq_cnv_calling_cnvkit_plot_scatter:
+rule somatic_cnv_calling_cnvkit_report_metrics:
     input:
-        unpack(wf.get_input_files("cnvkit", "plot/scatter")),
+        unpack(wf.get_input_files("cnvkit", "metrics")),
     params:
-        wf.get_params("cnvkit", "plot/scatter"),
+        **{"args": wf.get_args("cnvkit", "metrics")},
     output:
-        **wf.get_output_files("cnvkit", "plot/scatter"),
+        **wf.get_output_files("cnvkit", "metrics"),
     log:
-        **wf.get_log_file("cnvkit", "plot/scatter"),
-    threads: wf.get_resource("cnvkit", "plot/scatter", "threads")
+        **wf.get_log_file("cnvkit", "metrics"),
+    threads: 
wf.get_resource("cnvkit", "metrics", "threads") resources: - time=wf.get_resource("cnvkit", "plot/scatter", "time"), - memory=wf.get_resource("cnvkit", "plot/scatter", "memory"), - partition=wf.get_resource("cnvkit", "plot/scatter", "partition"), - tmpdir=wf.get_resource("cnvkit", "plot/scatter", "tmpdir"), + time=wf.get_resource("cnvkit", "metrics", "time"), + memory=wf.get_resource("cnvkit", "metrics", "memory"), + partition=wf.get_resource("cnvkit", "metrics", "partition"), + tmpdir=wf.get_resource("cnvkit", "metrics", "tmpdir"), wrapper: - wf.wrapper_path("cnvkit/plot/scatter") + wf.wrapper_path("cnvkit/report/metrics") -rule somatic_targeted_seq_cnv_calling_cnvkit_report_metrics: +rule somatic_cnv_calling_cnvkit_report_segmetrics: input: - unpack(wf.get_input_files("cnvkit", "report/metrics")), + unpack(wf.get_input_files("cnvkit", "segmetrics")), params: - wf.get_params("cnvkit", "report/metrics"), + **{"args": wf.get_args("cnvkit", "segmetrics")}, output: - **wf.get_output_files("cnvkit", "report/metrics"), + **wf.get_output_files("cnvkit", "segmetrics"), log: - **wf.get_log_file("cnvkit", "report/metrics"), - threads: wf.get_resource("cnvkit", "report/metrics", "threads") + **wf.get_log_file("cnvkit", "segmetrics"), + threads: wf.get_resource("cnvkit", "segmetrics", "threads") resources: - time=wf.get_resource("cnvkit", "report/metrics", "time"), - memory=wf.get_resource("cnvkit", "report/metrics", "memory"), - partition=wf.get_resource("cnvkit", "report/metrics", "partition"), - tmpdir=wf.get_resource("cnvkit", "report/metrics", "tmpdir"), + time=wf.get_resource("cnvkit", "segmetrics", "time"), + memory=wf.get_resource("cnvkit", "segmetrics", "memory"), + partition=wf.get_resource("cnvkit", "segmetrics", "partition"), + tmpdir=wf.get_resource("cnvkit", "segmetrics", "tmpdir"), wrapper: - wf.wrapper_path("cnvkit/report/metrics") + wf.wrapper_path("cnvkit/report/segmetrics") -rule somatic_targeted_seq_cnv_calling_cnvkit_report_segmetrics: +rule somatic_cnv_calling_cnvkit_report_genemetrics: input: - unpack(wf.get_input_files("cnvkit", "report/segmetrics")), + unpack(wf.get_input_files("cnvkit", "genemetrics")), params: - wf.get_params("cnvkit", "report/segmetrics"), + **{"args": wf.get_args("cnvkit", "genemetrics")}, output: - **wf.get_output_files("cnvkit", "report/segmetrics"), + **wf.get_output_files("cnvkit", "genemetrics"), log: - **wf.get_log_file("cnvkit", "report/segmetrics"), - threads: wf.get_resource("cnvkit", "report/segmetrics", "threads") + **wf.get_log_file("cnvkit", "genemetrics"), + threads: wf.get_resource("cnvkit", "genemetrics", "threads") resources: - time=wf.get_resource("cnvkit", "report/segmetrics", "time"), - memory=wf.get_resource("cnvkit", "report/segmetrics", "memory"), - partition=wf.get_resource("cnvkit", "report/segmetrics", "partition"), - tmpdir=wf.get_resource("cnvkit", "report/segmetrics", "tmpdir"), + time=wf.get_resource("cnvkit", "genemetrics", "time"), + memory=wf.get_resource("cnvkit", "genemetrics", "memory"), + partition=wf.get_resource("cnvkit", "genemetrics", "partition"), + tmpdir=wf.get_resource("cnvkit", "genemetrics", "tmpdir"), wrapper: - wf.wrapper_path("cnvkit/report/segmetrics") + wf.wrapper_path("cnvkit/report/genemetrics") diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_cnv_calling/model.py index ef10a9983..f86472ba4 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/model.py +++ b/snappy_pipeline/workflows/somatic_cnv_calling/model.py @@ -1,10 +1,11 @@ import enum 
import typing
 from typing import Annotated
-
 from pydantic import Field, model_validator  # , validator

-from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, Parallel
+from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel
+from snappy_pipeline.models.cnvkit import Cnvkit as CnvkitGeneric
+from snappy_pipeline.models.library_kit import LibraryKitEntry


 class WgsCaller(enum.StrEnum):
@@ -19,117 +20,86 @@ class WesCaller(enum.StrEnum):


 class Tools(SnappyModel):
-    wgs: Annotated[typing.List[WgsCaller], EnumField(WgsCaller, [])]
+    wgs: Annotated[list[WgsCaller], EnumField(WgsCaller, [])]
     """WGS calling tools"""

-    wes: Annotated[typing.List[WesCaller], EnumField(WesCaller, [])]
+    wes: Annotated[list[WesCaller], EnumField(WesCaller, [])]
     """WES calling tools"""


-class Sex(enum.StrEnum):
-    SAMPLESHEET = "samplesheet"
-    """Obtain the sex from the samplesheet"""
-    DIPLOID_ONLY = "diploid_only"
-    """Compute CNV for diploid chromosomes only"""
-    AUTO = "auto"
-    """Automatic sex detection using X/Y coverage"""
-    FEMALE = "female"
-    """Assume all samples are female"""
-    MALE = "male"
-    """Assume all samples are male"""
-    UNKNOWN = "unknown"
-    """Sex is unknown"""
-
-
 class SequencingMethod(enum.StrEnum):
     WES = "hybrid"
     PANEL = "amplicon"
     WGS = "wgs"


-class LibraryKitDefinition(SnappyModel):
-    """
-    Mapping from enrichment kit to target region BED file, for either computing per--target
-    region coverage or selecting targeted exons.
-
-    The following will match both the stock IDT library kit and the ones
-    with spike-ins seen fromr Yale genomics. The path above would be
-    mapped to the name "default".
-
-    name: IDT_xGen_V1_0
-    pattern: "xGen Exome Research Panel V1\\.0*"
-    path: "path/to/targets.bed"
-    """
+class SexValue(enum.StrEnum):
+    MALE = "male"
+    FEMALE = "female"

-    name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])]
-    pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])]

+class SexOrigin(enum.StrEnum):
+    AUTOMATIC = "auto"
+    SAMPLESHEET = "samplesheet"
+    CONFIG = "config"

-    path: Annotated[str, Field(examples=["path/to/targets.bed"])]
+
+class Sex(SnappyModel):
+    source: SexOrigin = SexOrigin.AUTOMATIC
+    default: SexValue | None = None
+
+    @model_validator(mode="after")
+    def ensure_default_value(self):
+        if self.source == SexOrigin.CONFIG and not self.default:
+            raise ValueError("Undefined default sex value in configuration file")
+        return self


 class PanelOfNormalsOrigin(enum.StrEnum):
-    PREVIOUS_STEP = "previous_step"
+    COHORT = "cohort"
     """Use (& possibly create) a panel of normals from the current cohort of normals in the panel_of_normals step"""

-    STATIC = "static"
+
+    FILE = "file"
     """Use an panel of normals from another cohort or from public data"""

+    FLAT = "flat"
+    """Use a flat reference (no actual panel of normals)"""
+

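# [Editor's note -- illustrative sketch only, not part of the patch.] A minimal
# stand-alone pydantic model mirroring the Sex/SexOrigin validation above
# (SnappyModel is assumed to be a pydantic BaseModel subclass):
import enum
from pydantic import BaseModel, model_validator

class SexOriginDemo(enum.StrEnum):
    AUTOMATIC = "auto"
    SAMPLESHEET = "samplesheet"
    CONFIG = "config"

class SexDemo(BaseModel):
    source: SexOriginDemo = SexOriginDemo.AUTOMATIC
    default: str | None = None

    @model_validator(mode="after")
    def ensure_default_value(self):
        # "config" means one fixed sex for all samples, so a default is required
        if self.source == SexOriginDemo.CONFIG and not self.default:
            raise ValueError("Undefined default sex value in configuration file")
        return self

SexDemo(source="config", default="female")  # ok
# SexDemo(source="config")                  # raises pydantic.ValidationError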
 class PanelOfNormals(SnappyModel):
     enabled: bool = False
-    origin: PanelOfNormalsOrigin = PanelOfNormalsOrigin.PREVIOUS_STEP
-    path_panel_of_normals: str = "../panel_of_normals"
-    """
-    Path to panel of normals created in current project
-
-    The panel of normals can be either a file (typically from another project),
-    or from the current project's panel_of_normals step.
+    """Use panel of normals during CNV calling"""

-    In the latter case, the missing one(s) (in case there are more than one panel, or if there are WES & WGS)
-    will be created when not present.
-    The matching of genome release & exome baits is done on genome name & exome baits md5 checksum.
-    These are computed in the panel of normals step, and saved with the panel itself.
+    source: PanelOfNormalsOrigin = PanelOfNormalsOrigin.FILE
+    """Which type of panel of normals should be used"""

-    There is no such matching if a panel of normal file is provided. The panel of normals validity is left to the user.
+    path_panel_of_normals: str = ""
     """
+    Path to panel of normals.
+    The panel of normals can be either a file (typically from another project, or from the software's own data),
+    or the path to the pipeline's ```panel_of_normals``` step, depending on the choice of source.

-class Mutect2(Parallel):
-    panel_of_normals: PanelOfNormals | None = None
+    Note that there is no test that the panel of normals is suitable for that cohort.
     """
-    Panel of normals created by the PanelOfNormals program.
-    """
-
-    germline_resource: str

-    common_variants: str | None = ""
-    """Common germline variants for contamination estimation"""
+    @model_validator(mode="after")
+    def ensure_panel_of_normals_path(self):
+        if (
+            self.enabled
+            and self.source != PanelOfNormalsOrigin.FLAT
+            and not self.path_panel_of_normals
+        ):
+            raise ValueError("Undefined panel of normals path")
+        return self

-    arguments_for_purecn: bool = True
-    """
-    PureCN requires that Mutect2 be called with arguments:
-    --genotype-germline-sites true --genotype-pon-sites true
-    """
-    extra_arguments: Annotated[
-        typing.List[str],
-        # AfterValidator(argument),
-        Field(
-            examples=[
-                "--read-filter CigarContainsNoNOperator",
-                "--annotation AssemblyComplexity BaseQuality",
-            ]
-        ),
-    ] = []
-    """
-    List additional Mutect2 arguments.
-    Each additional argument must be of the form:
-    "-- "
-    For example, to filter reads prior to calling & to add annotations to the output vcf:
-    - "--read-filter CigarContainsNoNOperator"
-    - "--annotation AssemblyComplexity BaseQuality"
-    """
+class VariantOrigin(enum.StrEnum):
+    COHORT = "cohort"
+    """Call somatic variants from the current cohort of normals in the somatic_variant_calling step"""

-    window_length: int = 300000000
+    FILE = "file"
+    """Use variants from a vcf file, e.g. from another cohort or from public data"""


 class VariantTool(enum.StrEnum):
@@ -138,14 +108,33 @@ class VariantTool(enum.StrEnum):

 class Variant(SnappyModel):
     enabled: bool = False
-    tool: VariantTool | None = None
+    """Use variants (somatic &/or germline) to improve CNV calling"""

-    mutect2: Mutect2 | None = None
+    source: VariantOrigin = VariantOrigin.FILE
+    """Where the variants are obtained from"""

+    path_somatic_variant_calling: str = ""
+    """
+    Path to the variants to use for CNV calling.

-class Ascat(SnappyModel):
-    pass
-    """TODO: configure purity tools (except for PureCN)"""
+    The path can be either to the ```somatic_variant_calling``` step in the pipeline, if "cohort" is selected,
+    or to the vcf file with the variants when "file" is selected as source.
+    """
+
+    tool: VariantTool = VariantTool.MUTECT2
+    """Tool used to call somatic variants in the pipeline"""
+
+    @model_validator(mode="after")
+    def ensure_path_to_variants(self):
+        if (
+            self.enabled
+            and self.source == VariantOrigin.FILE
+            and not self.path_somatic_variant_calling
+        ):
+            raise ValueError(
+                "A path to the variant vcf file must be provided when selecting 'file' as source"
+            )
+        return self


 class Sequenza(SnappyModel):
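# [Editor's note -- illustrative aside, not part of the patch.] Hypothetical
# YAML fragments for the Variant model above: the first is accepted, the
# second is rejected by ensure_path_to_variants, because source "file"
# requires a path to the vcf.
#
#   somatic_variant_calling:
#     enabled: true
#     source: file
#     path_somatic_variant_calling: /data/cohort/variants.vcf.gz
#
#   somatic_variant_calling:
#     enabled: true
#     source: file        # ValueError: a path to the variant vcf file must be provided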
@@ -157,13 +146,24 @@ class ControlFreec(SnappyModel):

 class PureCn(SnappyModel):
-    panel_of_normals: PanelOfNormals
+    panel_of_normals: PanelOfNormals = PanelOfNormals()
     """
     Panel of normals created by the NormalDB.R script.
     This is required even if the normal/tumor paired mode won't use it.
     """

-    variants: VariantTool
+    @model_validator(mode="after")
+    def restrict_pon_mode(self) -> typing.Self:
+        if not self.panel_of_normals.enabled:
+            raise ValueError("PureCN requires a panel of normals")
+        return self
+
+    path_target_interval_list_mapping: list[LibraryKitEntry] = []
+    """
+    Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect)
+    """
+
+    somatic_variant_calling: Variant = Variant()

     mappability: str = ""
     """
@@ -202,307 +202,98 @@ class PurityTool(enum.StrEnum):
     PURECN = "purecn"


-class Purity(SnappyModel):
-    enabled: bool = False
+class PurityOrigin(enum.StrEnum):
+    AUTOMATIC = "auto"
+    """Use current tool to compute purity & ploidy (PureCN & Sequenza estimate purity & ploidy)"""

-    ignore_samplesheet: bool = False
-    """Discard purity values in samplesheet when they exist"""
-    default_value: float | None = None
-    """Purity value for all samples"""
+    COHORT = "cohort"
+    """Use an external tool from the pipeline to compute purity & ploidy"""

-    tool: PurityTool | None = None
-    """Tool used for purity estimation, if not set, try samplesheet, otherwise default_value"""
-
-    ascat: Ascat | None = None
+    SAMPLESHEET = "samplesheet"
+    """Extract purity/ploidy from sample sheet"""

+    CONFIG = "config"
+    """Extract purity/ploidy from configuration file (all samples have the same value)"""

-class CnvkitSegmentationMethod(enum.StrEnum):
-    CBS = "cbs"
-    FLASSO = "flasso"
-    HAAR = "haar"
-    HMM = "hmm"
-    HMM_TUMOR = "hmm-tumor"
-    HMM_GERMLINE = "hmm-germline"
-    NONE = "none"
+class Purity(SnappyModel):
+    enabled: bool = False
+    """Use sample purity during CNV calling"""

-class CnvkitCallingMethod(enum.StrEnum):
-    THRESHOLD = "threshold"
-    CLONAL = "clonal"
-    NONE = "none"
+    source: PurityOrigin = PurityOrigin.SAMPLESHEET

+    path_somatic_purity_ploidy_estimate: str = "../somatic_purity_ploidy_estimate"

-class CnvkitCenterMethod(enum.StrEnum):
-    MEAN = "mean"
-    MEDIAN = "median"
-    MODE = "mode"
-    BIWEIGHT = "biweight"
+    tool: PurityTool = PurityTool.PURECN
+    """Tool used for purity estimation (when the purity/ploidy source is the cohort)"""

+    purity: float | None = None
+    """Default purity estimate"""
+    ploidy: float = 2.0
+    """Default ploidy value"""

-class CnvkitFilterMethod(enum.StrEnum):
-    AMPDEL = "ampdel"
-    CN = "cn"
-    CI = "ci"
-    SEM = "sem"
+    @model_validator(mode="after")
+    def ensure_valid_params_for_source(self):
+        if self.enabled and self.source == PurityOrigin.CONFIG and self.purity is None:
+            raise ValueError("Missing default purity value")
+        return self

-class CnvkitAccess(SnappyModel):
-    exclude: Annotated[
-        str | None,
-        Field(
-            examples=[
-                "/fast/work/groups/cubi/projects/biotools/static_data/app_support/cnvkit/access-5k-mappable.grch37.bed"
-            ]
-        ),
-    ] = None
-    """Regions accessible to mapping"""
-
-    min_gap_size: int = 5000
-    """Minimum gap size between accessible sequence regions. Regions separated by less than this distance will be joined together."""
-
-
-class CnvkitTarget(SnappyModel):
-    split: bool = False
-    """Split large tiled intervals into smaller, consecutive targets."""
-    avg_size: float = 800 / 3
-    """Average size of split target bins (results are approximate)"""
-
-
+class PanelOfNormalsCnvkit(PanelOfNormals):
+    path_targets: str | None = None
+    """Path to target file (used only when pon is obtained from file, taken from pipeline step otherwise)"""
+    path_antitargets: str | None = None
+    """Path to antitarget file (used only when pon is obtained from file, taken from pipeline step otherwise)"""

-class CnvkitAntitarget(SnappyModel):
-    avg_size: float = 150000
-    """Average size of split antitarget bins (results are approximate)"""
-    min_size: float | None = None
-    """Minimum size of antitarget bins (smaller regions are dropped). 
When missing, 1/16 avg size""" - - -class CnvkitCoverage(SnappyModel): - count: bool = False - """Get read depths by counting read midpoints within each bin.""" - min_mapq: int = 0 - """Minimum mapping quality score (phred scale 0-60) to count a read for coverage depth.""" - - -class CnvkitReference(SnappyModel): - cluster: bool = False - """Calculate and store summary stats for clustered subsets of the normal samples with similar coverage profiles.""" - min_cluster_size: int = 4 - """Minimum cluster size to keep in reference profiles.""" - no_gc: bool = False - """Skip GC correction.""" - no_edge: bool = None - """Skip edge correction. Automatic selection when None (True for WGS & Panel, False for WES)""" - no_rmask: bool = False - """Skip RepeatMasker correction.""" - - -class CnvkitFix(SnappyModel): - cluster: bool = False - """Compare and use cluster-specific values present in the reference profile.""" - no_gc: bool = False - """Skip GC correction.""" - no_edge: bool = False - """Skip edge correction.""" - no_rmask: bool = False - """Skip RepeatMasker correction.""" - - -class CnvkitSegment(SnappyModel): - method: CnvkitSegmentationMethod = CnvkitSegmentationMethod.CBS - """Segmentation method, or 'NONE' for chromosome arm-level averages as segments""" - threshold: float = 0.0001 - """Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation. For HMM methods, this is the smoothing window size.""" - drop_low_coverage: bool = False - """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" - drop_outliers: float = 10 - """Drop outlier bins more than this many multiples of the 95th quantile away from the average within a rolling window. Set to 0 for no outlier filtering.""" - smooth_cbs: bool = False - - min_variant_depth: float = 20 - """Minimum read depth for a SNV to be displayed in the b-allele frequency plot.""" - zygocity_freq: float = 0.25 - """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" +class PanelOfNormalsCnvkit(PanelOfNormals): + path_targets: str | None = None + """Path to target file (used only when pon is obtained from file, taken from pipeline step otherwise)""" + path_antitargets: str | None = None + """Path to antitarget file (used only when pon is obtained from file, taken from pipeline step otherwise)""" @model_validator(mode="after") - def ensure_smooth_for_cbs_only(self) -> typing.Self: - if self.smooth_cbs and self.method != CnvkitSegmentationMethod.CBS: - raise ValueError("'smooth_cbs' option can be used only with 'CBS' segmentation method") + def ensure_paths_target_antitarget(self): + if self.enabled and self.source == PanelOfNormalsOrigin.FILE: + if self.path_targets is None or self.path_antitargets is None: + raise ValueError( + "When using a previous pon, target & antitarget files must be defined" + ) return self -class CnvkitCall(SnappyModel): - method: CnvkitCallingMethod = CnvkitCallingMethod.THRESHOLD - """Calling method.""" - thresholds: str | None = None - """Hard thresholds for calling each integer copy number, separated by commas""" - center: CnvkitCenterMethod | None = CnvkitCenterMethod.MEDIAN - """Re-center the log2 ratio values using this estimator of the center or average value. ('median' if no argument given.)""" - center_at: float | None = None - """Subtract a constant number from all log2 ratios. 
For "manual" re-centering.""" - filter: CnvkitFilterMethod | None = None - """Merge segments flagged by the specified filter(s) with the adjacent segment(s).""" - ploidy: float | None = 2 - """Ploidy of the sample cells.""" - drop_low_coverage: bool = False - """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" - - min_variant_depth: float = 20 - """Minimum read depth for a SNV to be displayed in the b-allele frequency calculation.""" - zygocity_freq: float = 0.25 - """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" - - -class CnvkitBintest(SnappyModel): - alpha: float = 0.005 - """Significance threhold.""" - target: bool = False - """Test target bins only; ignore off-target bins.""" - - -class CnvkitPlotDiagram(SnappyModel): - threshold: float = 0.5 - """Copy number change threshold to label genes.""" - min_probes: int = 3 - """Minimum number of covered probes to label a gene.""" - no_shift_xy: bool = False - - -class CnvkitPlotScatter(SnappyModel): - antitarget_marker: str | None = None - """Plot antitargets using this symbol when plotting in a selected chromosomal region.""" - by_bin: bool = False - """Plot data x-coordinates by bin indices instead of genomic coordinates.""" - segment_color: str | None = None - """Plot segment lines in this color. Value can be any string accepted by matplotlib.""" - trend: bool = False - """Draw a smoothed local trendline on the scatter plot.""" - y_max: float | None = None - """y-axis upper limit.""" - y_min: float | None = None - """y-axis lower limit.""" - fig_size: typing.Tuple[float, float] | None = None - """Width and height of the plot in inches.""" - - min_variant_depth: float = 20 - """Minimum read depth for a SNV to be displayed in the b-allele frequency calculation.""" - zygocity_freq: float = 0.25 +class VariantCnvkit(Variant): + min_variant_depth: int = 20 + """Minimum read depth for a SNV to be displayed in the b-allele frequency plot.""" + zygocity_freq: float | None = None """Ignore VCF's genotypes (GT field) and instead infer zygosity from allele frequencies.""" -class CnvkitPlot(SnappyModel): - diagram: CnvkitPlotDiagram = CnvkitPlotDiagram() - scatter: CnvkitPlotScatter = CnvkitPlotScatter() - - -class CnvkitReportMetrics(SnappyModel): - drop_low_coverage: bool = False - """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" - - -class CnvkitReportSegmetrics(SnappyModel): - drop_low_coverage: bool = False - """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" - alpha: float = 0.05 - """Level to estimate confidence and prediction intervals; use with --ci and --pi.""" - bootstrap: int = 100 - """Number of bootstrap iterations to estimate confidence interval; use with --ci.""" - - -class CnvkitReport(enum.StrEnum): - METRICS = "metrics" - SEGMETRICS = "segmetrics" - - -class Cnvkit(SnappyModel): - panel_of_normals: PanelOfNormals | None = None +class Cnvkit(CnvkitGeneric): + panel_of_normals: PanelOfNormalsCnvkit = PanelOfNormalsCnvkit() - variants: VariantTool | None = None - - purity: Purity + path_target_interval_list_mapping: list[LibraryKitEntry] = [] """ - When present, purity estimates can be used for calling segments. The requested tool must be configured. - Or the purity can be provided in the samplesheet, as an extra information attached to the library. 
- - Note that PureCN cannot be used to estimate purity for WGS samples (because PureCN is WES & Panel-only). - TODO: This should be tested by a validation method, I don't know how to do (Till help!!) - TODO: The exact name is not yet set. + Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect) """ - access: CnvkitAccess = CnvkitAccess() - target: CnvkitTarget = CnvkitTarget() - antitarget: CnvkitAntitarget = CnvkitAntitarget() - coverage: CnvkitCoverage = CnvkitCoverage() + somatic_variant_calling: VariantCnvkit = VariantCnvkit() - reference: CnvkitReference | None = None + somatic_purity_ploidy_estimate: Purity = Purity() @model_validator(mode="after") - def set_default_reference(self) -> typing.Self: - if self.reference is None and not self.panel_of_normals.enabled: - self.reference = CnvkitReference() + def ensure_purity_not_auto(self): + if self.somatic_purity_ploidy_estimate.source == PurityOrigin.AUTOMATIC: + raise ValueError("Cnvkit cannot compute purity/ploidy by itself") return self - fix: CnvkitFix = CnvkitFix() - segment: CnvkitSegment = CnvkitSegment() - call: CnvkitCall = CnvkitCall() - bintest: CnvkitBintest = CnvkitBintest() - - use_male_reference: bool = False - """Create/use a male reference. Must be identical to panel of normals creation, when using one""" - - plots: typing.List[CnvkitPlot] = [] - - reports: typing.List[CnvkitReport] = [] - metrics: CnvkitReportMetrics | None = None + sample_sex: Sex = Sex() - # @validator("metrics") - # def get_default_reference(cls, v, values) -> CnvkitReportMetrics | None: - # if v is None and "metrics" in values["reports"]: - # return CnvkitReportMetrics() - # return None - - segmetrics: CnvkitReportSegmetrics | None = None - - # @validator("segmetrics") - # def get_default_reference(cls, v, values) -> CnvkitReportSegmetrics | None: - # if v is None and "segmetrics" in values["reports"]: - # return CnvkitReportSegmetrics() - # return None + path_access: str | None = None + """Overrides access when not None""" class SomaticCnvCalling(SnappyStepModel): - path_ngs_mapping: str + path_ngs_mapping: str = "../ngs_mapping" """Path to bam files""" tools: Tools """Tools for WGS & WES data""" - path_target_interval_list_mapping: typing.List[LibraryKitDefinition] | None = None - - sex: Sex = Sex.DIPLOID_ONLY - - cnvkit: Cnvkit + cnvkit: Cnvkit | None = None purecn: PureCn | None = None sequenza: Sequenza | None = None control_freec: ControlFreec | None = None - - mutect2: Mutect2 | None = None - - default_ploidy: float | None = None - - # @model_validator(mode="after") - # def ensure_single_pon_step(self) -> typing.Self: - # """ - # I am not sure this is absolutely required. 
# I am trying to avoid registering the panel_of_normals step when initializing SomaticCnvCalling
-    #     pon_steps = set()
-    #     for tool in itertools.chain(self.tools.wgs, self.tools.wes):
-    #         tool_config = getattr(self, tool)
-    #         if (
-    #             tool_config
-    #             and getattr(tool_config, "use_panel_of_normals")
-    #             and tool_config.use_panel_of_normals == PanelOfNormalsUse.PREVIOUS_STEP
-    #         ):
-    #             pon_steps.add(str(tool_config.panel_of_normals.panel_of_normals))
-    #     if len(pon_steps) > 1:
-    #         raise ValueError("Too many panel_of_normals steps")
-    #     return self
diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
index c93c981c5..2c2d8faf8 100644
--- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
@@ -20,13 +20,13 @@
 cmd = r"""
 cnvkit.py access \
     -o {snakemake.output.access} \
-    --min-gap-size {args[min_gap_size]} \
-    {exclude} \
+    {min_gap_size} {exclude} \
     {args[reference]}
 """.format(
     snakemake=snakemake,
     args=args,
-    exclude=" ".join([f"--exclude {x}" for x in args["exclude"]]) if "exclude" in args else "",
+    min_gap_size=f"--min-gap-size {args['min-gap-size']}" if args.get("min-gap-size", None) is not None else "",
+    exclude=" ".join([f"--exclude {x}" for x in args["exclude"]]),
 )
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py
index 596626831..c7009727f 100644
--- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py
@@ -17,19 +17,16 @@
 args = snakemake.params.get("args", {})
 
-if snakemake.input.get("target", "") != "":
-    cmd = r"""
-    cnvkit.py antitarget \
-        -o {snakemake.output.antitarget} \
-        --avg-size {args['avg_size']} --min-size {args['min_size']} \
-        {access} \
-        {snakemake.input.target}
-    """.format(
-        snakemake=snakemake,
-        args=args,
-        access=f"--access {args['access']}" if "access" in args else "",
-    )
-else:
-    cmd = f"touch {snakemake.output.antitarget}"
+cmd = r"""
+cnvkit.py antitarget \
+    -o {snakemake.output.antitarget} \
+    --avg-size {args[avg-size]} {min_size} \
+    {access} \
+    {args[target]}
+""".format(
+    snakemake=snakemake,
+    args=args,
+    min_size=f"--min-size {args['min-size']}" if args.get("min-size") is not None else "",
+    access=f"--access {args['access']}" if args.get("access") is not None else "",
+)
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py
index ce913b505..9711475ed 100644
--- a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py
@@ -21,15 +21,15 @@
 cnvkit.py autobin --method {args[method]} \
     {out_target} {out_antitarget} \
     {access} {target} \
-    --bp-per-bin {args[bp_per_bin]} \
-    {snakemake.input.bams} \
+    --bp-per-bin {args[bp-per-bin]} \
+    {bams} \
     > {snakemake.output.result}
 """.format(
     snakemake=snakemake,
     args=args,
     out_target=f"--target-output-bed {snakemake.output.target}" if snakemake.output.get("target", "") != "" else "",
     out_antitarget=f"--antitarget-output-bed {snakemake.output.antitarget}" if snakemake.output.get("antitarget", "") != "" else "",
-    access=f"--access {snakemake.input.access}" if snakemake.input.get("access", None) else "",
+    access=f"--access {args['access']}" if "access" in args else "",
     target=f"--targets {args['target']}" if "target" in args else "",
+    bams=" ".join(args["bams"]),
 )
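One formatting subtlety runs through all of these wrappers: the command templates use str.format with bracket lookups such as {args[min-gap-size]}, where the text between the brackets is taken verbatim as the dictionary key, quotes included. A minimal, self-contained sketch of the behaviour (plain Python, not pipeline code):

# Inside a format field, the bracket index is used verbatim as the key,
# so hyphenated keys work unquoted, while quoted keys raise KeyError.
args = {"min-gap-size": 5000}

print("--min-gap-size {args[min-gap-size]}".format(args=args))
# --min-gap-size 5000

try:
    "--min-gap-size {args['min-gap-size']}".format(args=args)
except KeyError as exc:
    print("KeyError:", exc)  # the key tried was "'min-gap-size'", quotes included

This is why the templates below consistently use the unquoted bracket form, while quoted keys only appear inside f-strings evaluated as regular Python expressions.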
b/snappy_wrappers/wrappers/cnvkit/bintest/environment.yaml
new file mode 120000
index 000000000..2e107ac86
--- /dev/null
+++ b/snappy_wrappers/wrappers/cnvkit/bintest/environment.yaml
@@ -0,0 +1 @@
+../environment.yaml
\ No newline at end of file
diff --git a/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py b/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py
new file mode 100644
index 000000000..0ea46cf30
--- /dev/null
+++ b/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""Wrapper for cnvkit.py bintest"""
+
+import os
+import sys
+
+# The following is required for being able to import snappy_wrappers modules
+# inside wrappers. These run in an "inner" snakemake process which uses its
+# own conda environment which cannot see the snappy_pipeline installation.
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+sys.path.insert(0, base_dir)
+
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
+
+__author__ = "Eric Blanc"
+__email__ = "eric.blanc@bih-charite.de"
+
+args = snakemake.params.get("args", {})
+
+cmd = r"""
+cnvkit.py bintest \
+    -o {snakemake.output.tests} \
+    --segment {args[segments]} \
+    --alpha {args[alpha]} {target} \
+    {args[ratios]}
+""".format(
+    snakemake=snakemake,
+    args=args,
+    target="--target" if args.get("target", False) else "",
+)
+
+CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
index c77d8863b..ca0769c0b 100644
--- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
@@ -1,38 +1,32 @@
 # -*- coding: utf-8 -*-
 """Wrapper vor cnvkit.py call"""
 
+import os
 import re
+import sys
+
+# The following is required for being able to import snappy_wrappers modules
+# inside wrappers. These run in an "inner" snakemake process which uses its
+# own conda environment which cannot see the snappy_pipeline installation.
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+sys.path.insert(0, base_dir)
 
 from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
 
-class CnvkitWrapperCall(CnvkitWrapper):
-    PURITY_PATTERN = re.compile("^Purity: +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) *$")
-    PLOIDY_PATTERN = re.compile("^Ploidy: +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) *$")
-
-    def preamble(self):
-        if "purity" in self.snakemake.input:
-            with open(self.snakemake.input.purity, "rt") as f:
-                for line in f:
-                    m = CnvkitWrapperCall.PURITY_PATTERN.match(line.strip())
-                    if m:
-                        self.purity = float(m.groups()[1])
-                    else:
-                        m = CnvkitWrapperCall.PLOIDY_PATTERN.match(line.strip())
-                        if m:
-                            self.ploidy = float(m.groups()[1])
-        else:
-            self.purity = self.snakemake.params.purity if "purity" in self.snakemake.params else None
-            self.ploidy = self.snakemake.params.ploidy if "ploidy" in self.snakemake.params else None
-
-        self.cmd = self.cmd.format(purity=self.purity, ploidy=self.ploidy)
-
-if "variants" in snakemake.input:
+args = snakemake.params.get("args", {})
+
+PATTERN = re.compile("^(Purity|Ploidy): +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) *$")
+
+
+if "variants" in args:
     variants = r"""
-    ---vcf {snakemake.input.variants} \
-    {snakemake.params.sample_id} {snakemake.params.normal_id} \
-    {snakemake.params.min_variant_depth} {snakemake.params.zygocity_freq}
+    --vcf {args[variants]} \
+    --sample-id {args[sample-id]} --normal-id {args[normal-id]} \
+    --min-variant-depth {args[min-variant-depth]} {zygocity_freq}
     """.format(
         snakemake=snakemake,
+        args=args,
+        zygocity_freq=f"--zygosity-freq {args['zygocity-freq']}" if args.get("zygocity-freq", None) is not None else "",
     )
 else:
     variants = ""
@@ -40,21 +34,25 @@ def preamble(self):
 cmd = r"""
 cnvkit.py call \
     -o {snakemake.output.calls} \
-    --method {snakemake.params.method} --thresholds={snakemake.params.thresholds} \
-    --filter {snakemake.params.filter} \
-    {center} \
-    {drop_low_coverage} \
-    {sample_sex} {male_reference} \
+    --method {args[method]} {thresholds} \
+    {filter} \
+    {center} {center_at} {drop_low_coverage} {sample_sex} {male_reference} \
+    {purity} {ploidy} \
     {variants} \
-    {{purity}} {{ploidy}} \
-    {snakemake.input.segments}
+    {args[segments]}
 """.format(
     snakemake=snakemake,
-    center=f"--center-at {snakemake.params.center_at}" if "center_at" in snakemake.params else f"--center {snakemake.params.center}",
-    drop_low_coverage="--drop-low-coverage" if snakemake.params.drop_low_coverage else "",
-    sample_sex=f"--sample-sex {snakemake.params.sample_sex}" if "sample_sex" in snakemake.params else "",
-    male_reference="--male-reference" if snakemake.params.male_reference else "",
+    args=args,
     variants=variants,
+    purity=f"--purity {args['purity']}" if args.get("purity", None) is not None else "",
+    ploidy=f"--ploidy {args['ploidy']}" if args.get("ploidy", None) is not None else "",
+    thresholds="--thresholds={}".format(",".join(map(str, args["thresholds"]))) if len(args.get("thresholds", [])) > 0 else "",
+    filter=f"--filter {args['filter']}" if args.get("filter", None) is not None else "",
+    center=f"--center {args['center']}" if args.get("center", None) is not None else "",
+    center_at=f"--center-at {args['center-at']}" if args.get("center-at", None) is not None else "",
+    drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "",
+    sample_sex=f"--sample-sex {args['sample-sex']}" if args.get("sample-sex", None) is not None else "",
+    male_reference="--male-reference" if args.get("male-reference", False) else "",
 )
 
-CnvkitWrapperCall(snakemake, cmd).run()
+CnvkitWrapper(snakemake, cmd).run()
"""Wrapper for cnvkit.py fix""" +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper __author__ = "Eric Blanc" __email__ = "eric.blanc@bih-charite.de" +args = snakemake.params.get("args", {}) + cmd = r""" cnvkit.py fix \ - -o {snakemake.output.coverage} \ - {cluster} {snakemake.params.sample_id} \ + -o {snakemake.output.ratios} \ + {cluster} --sample-id {args['sample-id']} \ {no_gc} {no_edge} {no_rmask} \ - {snakemake.input.target} {antitarget} {snakemake.input.reference} + {args['target']} {antitarget} {args['reference']} """.format( snakemake=snakemake, - cluster="--cluster" if snakemake.params.cluster else "", - no_gc="--no-gc" if snakemake.params.no_gc else "", - no_edge="--no-edge" if snakemake.params.no_edge else "", - no_rmask="--no-rmask" if snakemake.params.no_rmask else "", - antitarget=f"{snakemake.input.antitarget}" if "antitarget" in snakemake.input else "", + cluster="--cluster" if args.get("cluster", False) else "", + no_gc="--no-gc" if args.get("no-gc", False) else "", + no_edge="--no-edge" if args.get("no-edge", False) else "", + no_rmask="--no-rmask" if args.get("no-rmask", False) else "", + antitarget=f"{args['antitarget']}" if "antitarget" in args else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/plot/scatter/environment.yaml b/snappy_wrappers/wrappers/cnvkit/plot/scatter/environment.yaml new file mode 120000 index 000000000..8dc548583 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/plot/scatter/environment.yaml @@ -0,0 +1 @@ +../../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py new file mode 100644 index 000000000..a386a2dec --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py scatter""" + +import os +import re +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. 
diff --git a/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py
new file mode 100644
index 000000000..a386a2dec
--- /dev/null
+++ b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+"""Wrapper for cnvkit.py scatter"""
+
+import os
+import sys
+
+# The following is required for being able to import snappy_wrappers modules
+# inside wrappers. These run in an "inner" snakemake process which uses its
+# own conda environment which cannot see the snappy_pipeline installation.
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", ".."))
+sys.path.insert(0, base_dir)
+
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
+
+args = snakemake.params.get("args", {})
+
+if "variants" in args:
+    variants = r"""
+    --vcf {args[variants]} \
+    --sample-id {args[sample-id]} --normal-id {args[normal-id]} \
+    --min-variant-depth {args[min-variant-depth]} {zygocity_freq}
+    """.format(
+        snakemake=snakemake,
+        args=args,
+        zygocity_freq=f"--zygosity-freq {args['zygocity-freq']}" if args.get("zygocity-freq", None) is not None else "",
+    )
+else:
+    variants = ""
+
+cmd = r"""
+cnvkit.py scatter \
+    -o {snakemake.output.plot} \
+    --segment {args[segments]} \
+    {chromosome} {gene} {range_list} \
+    --width {args[width]} \
+    --antitarget-marker {args[antitarget-marker]} --segment-color {args[segment-color]} \
+    {by_bin} {trend} --title '{args[title]}' \
+    {y_min} {y_max} {fig_size} \
+    {variants} \
+    {args[ratios]}
+""".format(
+    snakemake=snakemake,
+    args=args,
+    variants=variants,
+    chromosome=f"--chromosome {args['chromosome']}" if args.get("chromosome", None) is not None else "",
+    gene=f"--gene {args['gene']}" if args.get("gene", None) is not None else "",
+    range_list=f"--range-list {args['range-list']}" if args.get("range-list", None) is not None else "",
+    by_bin="--by-bin" if args.get("by-bin", False) else "",
+    trend="--trend" if args.get("trend", False) else "",
+    y_min=f"--y-min {args['y-min']}" if args.get("y-min", None) is not None else "",
+    y_max=f"--y-max {args['y-max']}" if args.get("y-max", None) is not None else "",
+    fig_size="--fig-size {}".format(" ".join(map(str, args["fig-size"]))) if args.get("fig-size", None) is not None else "",
+)
+
+CnvkitWrapper(snakemake, cmd).run()
len(args.get("normals", [])) > 0 else "", + cluster="--cluster" if args.get("cluster", False) else "", + male_reference="--male-reference" if args.get("male-reference", False) else "", + no_gc="--no-gc" if args.get("no-gc", False) else "", + no_edge="--no-edge" if args.get("no-edge", False) else "", + no_rmask="--no-rmask" if args.get("no-rmask", False) else "", + min_cluster_size=f"--min-cluster-size {args['min-cluster-size']}" if "min-cluster-size" in args else "", + sample_sex=f"--sample-sex {args['sample-sex']}" if "sample-sex" in args else "", + diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if "diploid-parx-genome" in args else "" ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/environment.yaml b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/environment.yaml new file mode 120000 index 000000000..8dc548583 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/environment.yaml @@ -0,0 +1 @@ +../../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py new file mode 100644 index 000000000..24a3f5a7e --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py genemetrics""" + +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +args = snakemake.params.get("args", {}) + +cmd = r""" +cnvkit.py genemetrics \ + -o {snakemake.output.report} \ + --segment {args['segments']} \ + --threshold {args['threshold']} --min-probes {args['min-probes']} \ + {drop_low_coverage} {male_reference} {sample_sex} {diploid_parx_genome} \ + {stats} \ + {args[ratios]} +""".format( + snakemake=snakemake, + args=args, + drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", + male_reference="--male-reference" if args.get("male-reference", False) else "", + sample_sex=f"--sample-sex {args['sample-sex']}" if "sample-sex" in args else "", + diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if "diploid-parx-genome" in args else "", + stats=" ".join([f"--{stat}" for stat in args["stats"]]), +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/report/metrics/environment.yaml b/snappy_wrappers/wrappers/cnvkit/report/metrics/environment.yaml new file mode 120000 index 000000000..8dc548583 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/metrics/environment.yaml @@ -0,0 +1 @@ +../../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py new file mode 100644 index 000000000..fe421a339 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py genemetrics""" + +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. 
These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +args = snakemake.params.get("args", {}) + +cmd = r""" +cnvkit.py metrics \ + -o {snakemake.output.report} \ + --segment {args['segments']} \ + {drop_low_coverage} \ + {args[ratios]} +""".format( + snakemake=snakemake, + args=args, + drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/environment.yaml b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/environment.yaml new file mode 120000 index 000000000..8dc548583 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/environment.yaml @@ -0,0 +1 @@ +../../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py new file mode 100644 index 000000000..d569588c2 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py segmetrics""" + +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +args = snakemake.params.get("args", {}) + +cmd = r""" +cnvkit.py segmetrics \ + -o {snakemake.output.report} \ + --segment {args['segments']} \ + --alpha {args['alpha']} --bootstrap {args['bootstrap']} {smooth_bootstrap} \ + {drop_low_coverage} \ + {stats} \ + {args[ratios]} +""".format( + snakemake=snakemake, + args=args, + drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", + smooth_bootstrap="--smooth-bootstrap" if args.get("smooth-bootstrap", False) else "", + stats=" ".join([f"--{stat}" for stat in args["stats"]]), +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py index 648e14a2a..2c02aa03c 100644 --- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py @@ -1,31 +1,45 @@ # -*- coding: utf-8 -*- """Wrapper vor cnvkit.py segment""" +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. 
diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
index 648e14a2a..2c02aa03c 100644
--- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
@@ -1,31 +1,45 @@
 # -*- coding: utf-8 -*-
 """Wrapper vor cnvkit.py segment"""
 
+import os
+import sys
+
+# The following is required for being able to import snappy_wrappers modules
+# inside wrappers. These run in an "inner" snakemake process which uses its
+# own conda environment which cannot see the snappy_pipeline installation.
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+sys.path.insert(0, base_dir)
+
 from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
 
-if "variants" in snakemake.input:
+args = snakemake.params.get("args", {})
+
+if "variants" in args:
     variants = r"""
-    ---vcf {snakemake.input.variants} \
-    {snakemake.params.sample_id} {snakemake.params.normal_id} \
-    {snakemake.params.min_variant_depth} {snakemake.params.zygocity_freq}
+    --vcf {args[variants]} \
+    --sample-id {args[sample-id]} --normal-id {args[normal-id]} \
+    --min-variant-depth {args[min-variant-depth]} {zygocity_freq}
     """.format(
         snakemake=snakemake,
+        args=args,
+        zygocity_freq=f"--zygosity-freq {args['zygocity-freq']}" if args.get("zygocity-freq", None) is not None else "",
     )
 else:
     variants = ""
 
 cmd = r"""
-cnvkit.py segment --processes {snakemake.params.proceses} \
+cnvkit.py segment --processes {snakemake.resources._cores} \
     -o {snakemake.output.segments} --dataframe {snakemake.output.dataframe} \
-    --method {snakemake.params.method} --threshold {snakemake.params.threshold} {smooth_cbs} \
-    {drop_low_coverage} --drop-outliers {snakemake.params.drop_outliers} \
+    --method {args[method]} --threshold {args[threshold]} {smooth_cbs} \
+    {drop_low_coverage} --drop-outliers {args[drop-outliers]} \
     {variants} \
-    {snakemake.input.coverage}
+    {args[ratios]}
 """.format(
     snakemake=snakemake,
-    smooth_cbs="--smooth-cbs" if snakemake.params.smooth_cbs else "",
-    drop_low_coverage="--drop-low-coverage" if snakemake.params.drop_low_coverage else "",
+    args=args,
     variants=variants,
+    smooth_cbs="--smooth-cbs" if args.get("smooth-cbs", False) else "",
+    drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "",
 )
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py
index fe08248ff..a3f72f3b7 100644
--- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py
@@ -18,34 +18,18 @@
 args = snakemake.params.get("args", {})
 
-# WGS: targets are all accessible regions, WES: targets are baits
-interval = snakemake.input.access if snakemake.input.get("access", None) else args["target"]
-
-if snakemake.input.get("avg_size", "") != "":
-    pattern = re.compile("^Target:[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)[ \t]+([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)$")
-    with open(snakemake.input.avg_size) as f:
-        for line in f:
-            m = pattern.match(line)
-            if m:
-                avg_size = int(float(m.groups()[4]))
-                break
-elif "avg_size" in args:
-    avg_size = args["avg_size"]
-else:
-    avg_size = None
-
 cmd = r"""
 cnvkit.py target \
     -o {snakemake.output.target} \
-    {avg_size} {split} {annotate} \
-    {interval}
+    {avg_size} {split} {annotate} {short_names} \
+    {args[interval]}
 """.format(
     snakemake=snakemake,
     args=args,
-    interval=interval,
-    avg_size=f"--avg-size {avg_size}" if avg_size is not None else "",
-    split=f"--split" if "split" in args and args["split"] else "",
+    avg_size=f"--avg-size {args['avg-size']}" if args.get("avg-size") is not None else "",
+    split="--split" if args.get("split", False) else "",
     annotate=f"--annotate {args['annotate']}" if "annotate" in args else "",
+    short_names="--short-names" if args.get("short-names", False) else "",
 )
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/tests/snappy_pipeline/workflows/conftest.py b/tests/snappy_pipeline/workflows/conftest.py
index 1a99fed2a..4e7d9de5c 100644
---
a/tests/snappy_pipeline/workflows/conftest.py +++ b/tests/snappy_pipeline/workflows/conftest.py @@ -906,6 +906,41 @@ def cancer_sheet_fake_fs_path_link_in(fake_fs, cancer_sheet_tsv): return fake_fs +@pytest.fixture +def autobin_result_fake_fs(fake_fs, cancer_sheet_tsv): + """Return fake file autobin.txt""" + # Create work directory + fake_fs.fs.makedirs("/work", exist_ok=True) + # Create autobin result for the samples + tpl = "/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.autobin.txt" + for line in cancer_sheet_tsv.splitlines()[8:]: + (donor, sample, isTumor, assay, folder, libraryKit, extract) = line.split("\t") + if isTumor == "N": + library_name = f"{donor}-{sample}-{extract}1-{assay}1" + fake_fs.fs.create_file( + tpl.format(mapper="bwa", library_name=library_name), create_missing_dirs=True + ) + return fake_fs + + +@pytest.fixture +def purity_result_fake_fs(fake_fs, cancer_sheet_tsv): + """Return fake file purity.txt""" + # Create work directory + fake_fs.fs.makedirs("/SOMATIC_PURITY_PLOIDY_ESTIMATE/output", exist_ok=True) + # Create autobin result for the samples + tpl = "/{mapper}.{purity_tool}.{library_name}/out/{mapper}.{purity_tool}.{library_name}.txt" + for line in cancer_sheet_tsv.splitlines()[8:]: + (donor, sample, isTumor, assay, folder, libraryKit, extract) = line.split("\t") + if isTumor == "Y": + library_name = f"{donor}-{sample}-{extract}1-{assay}1" + fake_fs.fs.create_file( + tpl.format(mapper="bwa", purity_tool="ascat", library_name=library_name), + create_missing_dirs=True, + ) + return fake_fs + + @pytest.fixture def aligner_indices_fake_fs(fake_fs): """Return fake file system setup with files for aligner indices""" diff --git a/tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py new file mode 100644 index 000000000..de45fb4e1 --- /dev/null +++ b/tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py @@ -0,0 +1,580 @@ +# -*- coding: utf-8 -*- +"""Tests for the panel_of_normals workflow module code""" + +import textwrap + +import pytest +import ruamel.yaml as ruamel_yaml +from snakemake.io import Wildcards + +from snappy_pipeline.workflows.somatic_cnv_calling import SomaticCnvCallingWorkflow + +from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict +from .conftest import patch_module_fs + + +@pytest.fixture(scope="module") # otherwise: performance issues +def minimal_config(): + """Return YAML parsing result for (cancer) configuration""" + yaml = ruamel_yaml.YAML() + return yaml.load( + textwrap.dedent( + r""" + static_data_config: + reference: + path: /path/to/ref.fa + cosmic: + path: /path/to/cosmic.vcf.gz + dbsnp: + path: /path/to/dbsnp.vcf.gz + features: + path: /path/to/annotations.gtf + + step_config: + ngs_mapping: + tools: + dna: ['bwa'] + bwa: + path_index: /path/to/bwa/index.fa + + somatic_variant_calling: + tools: ['mutect2'] + path_ngs_mapping: ../ngs_mapping + mutect2: + common_variants: /path/to/common/variants + + somatic_purity_ploidy_estimate: + tools: ['ascat'] + path_ngs_mapping: ../ngs_mapping + ascat: + b_af_loci: /path/to/locii.bed + + somatic_cnv_calling: + tools: + wgs: ['cnvkit'] + path_ngs_mapping: ../ngs_mapping + cnvkit: + diploid_parx_genome: GRCh38 + panel_of_normals: + enabled: False + somatic_variant_calling: + enabled: True + source: cohort + tool: mutect2 + path_somatic_variant_calling: ../somatic_variant_calling + somatic_purity_ploidy_estimate: + enabled: True + source: cohort + tool: 
ascat + segment: + threshold: 0.0001 + scatter: + enabled: true + + data_sets: + first_batch: + file: sheet.tsv + search_patterns: + - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} + search_paths: ['/path'] + type: matched_cancer + naming_scheme: only_secondary_id + """ + ).lstrip() + ) + + +@pytest.fixture +def somatic_cnv_calling_workflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + work_dir, + config_paths, + cancer_sheet_fake_fs, + autobin_result_fake_fs, + purity_result_fake_fs, + aligner_indices_fake_fs, + mocker, +): + """Return PanelOfNormalsWorkflow object pre-configured with germline sheet""" + # Patch out file-system to enable reading autobin output + autobin_result_fake_fs.fs.create_file( + file_path="work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.autobin.txt", + contents="Target: -1 2000\n", + create_missing_dirs=True, + ) + # Patch out file-system to enable reading autobin output + purity_result_fake_fs.fs.create_file( + file_path="SOMATIC_PURITY_PLOIDY_ESTIMATE/output/bwa.ascat.P001-T1-DNA1-WGS1/out/bwa.ascat.P001-T1-DNA1-WGS1.txt", + contents="Purity/ploidy:\t0.35\t2.2\n", + create_missing_dirs=True, + ) + # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.somatic_cnv_calling", autobin_result_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) + # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we + # can obtain paths from the function as if we really had a NGSMappingPipelineStep there + dummy_workflow.globals = { + "ngs_mapping": lambda x: "NGS_MAPPING/" + x, + "somatic_variant_calling_cnvkit": lambda x: "SOMATIC_VARIANT_CALLING/" + x, + "panel_of_normals_cnvkit": lambda x: "SOMATIC_VARIANT_CALLING/" + x, + "somatic_purity_ploidy_estimate_cnvkit": lambda x: "SOMATIC_PURITY_PLOIDY_ESTIMATE/" + x, + } + # Construct the workflow object + return SomaticCnvCallingWorkflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + config_paths, + work_dir, + ) + + +# Tests for CnvkitStepPart ------------------------------------------------------------------------ + + +def test_cnvkit_step_part_get_args_access(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_access()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "access")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "access")(wildcards), + ) + expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": []} + assert actual == expected + + +def test_cnvkit_step_part_get_args_autobin(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_access()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", + } + ) + expected = { + "bams": ["NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam"], + "access": "work/bwa.cnvkit/out/cnvkit.access.bed", + "method": "wgs", + "bp-per-bin": 50000, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "autobin")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "autobin")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_target(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_target()""" 
+ wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", + } + ) + expected = { + "interval": "work/bwa.cnvkit/out/cnvkit.access.bed", + "avg-size": 2000, + "split": True, + "annotate": "/path/to/annotations.gtf", + "short-names": True, + + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "target")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "target")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_coverage(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_coverage()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", + "region": "target", + } + ) + expected = { + "intervals": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.bed", + "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "reference": "/path/to/ref.fa", + "min-mapq": 0, + "count": False, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "coverage")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "coverage")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_reference(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_reference()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", + "region": "target", + } + ) + expected = { + "normals": ["work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.cnn"], + "reference": "/path/to/ref.fa", + "cluster": False, + "no-gc": False, + "no-edge": True, + "no-rmask": False, + "male-reference": False, + "diploid-parx-genome": "GRCh38", + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "reference")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "reference")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_fix(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_fix()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "target": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.target.cnn", + "reference": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.reference.cnn", + "cluster": False, + "no-gc": False, + "no-edge": True, + "no-rmask": False, + "diploid-parx-genome": "GRCh38", + "sample-id": "P001-T1-DNA1-WGS1", + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "fix")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "fix")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_segment(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_segment()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "method": "cbs", + "threshold": 0.0001, + "smooth-cbs": False, + "drop-low-coverage": False, + "drop-outliers": 10, + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + "sample-id": "P001-T1-DNA1-WGS1", + "normal-id": "P001-N1-DNA1-WGS1", + "min-variant-depth": 20, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segment")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "segment")(wildcards), + 
) + assert actual == expected + + +def test_cnvkit_step_part_get_args_call(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_call()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "method": "threshold", + "thresholds": [-1.1, -0.25, 0.2, 0.7], + "drop-low-coverage": False, + "male-reference": False, + "diploid-parx-genome": "GRCh38", + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + "sample-id": "P001-T1-DNA1-WGS1", + "normal-id": "P001-N1-DNA1-WGS1", + "min-variant-depth": 20, + "purity_file": "SOMATIC_PURITY_PLOIDY_ESTIMATE/output/bwa.ascat.P001-T1-DNA1-WGS1/out/bwa.ascat.P001-T1-DNA1-WGS1.txt", + "purity": 0.35, + "ploidy": 2.2, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "call")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "call")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_bintest(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_bintest()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "alpha": 0.005, + "target": False, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "bintest")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "bintest")(wildcards), + ) + assert actual == expected + assert actual == expected + + +def test_cnvkit_step_part_get_args_metrics(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_metrics()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "drop-low-coverage": False, + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "metrics")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "metrics")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_segmetrics(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_segmetrics()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "drop-low-coverage": False, + "alpha": 0.05, + "bootstrap": 100, + "smooth-bootstrap": False, + "stats": ["mean", "median", "mode", "t-test", "stdev", "sem", "mad", "mse", "iqr", "bivar", "ci", "pi"] + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segmetrics")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "segmetrics")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_genemetric(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_genemetrics()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + 
"segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "threshold": 0.2, + "min-probes": 3, + "drop-low-coverage": False, + "male-reference": False, + "diploid-parx-genome": "GRCh38", + "alpha": 0.05, + "bootstrap": 100, + "stats": ["mean", "median", "mode", "ttest", "stdev", "sem", "mad", "mse", "iqr", "bivar", "ci", "pi"] + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "genemetrics")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "genemetrics")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_scatter(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_scatter()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + "contig_name": "1", + } + ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "chromosome": "1", + "width": 1000000, + "antitarget-marker": "o", + "by-bin": False, + "trend": False, + "segment-color": "darkorange", + "title": "P001-T1-DNA1-WGS1 - 1", + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + "sample-id": "P001-T1-DNA1-WGS1", + "normal-id": "P001-N1-DNA1-WGS1", + "min-variant-depth": 20, + "fig-size": (6.4, 4.8), + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "scatter")( + wildcards, + somatic_cnv_calling_workflow.get_input_files("cnvkit", "scatter")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_parts_get_output_files(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_output_files() for all actions""" + actions = { + "access": {"access": "work/{mapper}.cnvkit/out/cnvkit.access.bed"}, + "autobin": {"result": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.autobin.txt"}, + "target": {"target": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.target.bed"}, + "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed"}, + "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}.cnn"}, + "reference": {"reference": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.reference.cnn"}, + "fix": {"ratios": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr"}, + "segment": { + "segments": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segments.cns", + "dataframe": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.rds", + }, + "call": {"calls": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns"}, + "bintest": {"tests": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.bintest.cns"}, + "metrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.metrics.tsv"}, + "genemetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.genemetrics.tsv"}, + "segmetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.segmetrics.tsv"}, + "scatter": {"plot": "work/{mapper}.cnvkit.{library_name}/plot/{mapper}.cnvkit.{library_name}.scatter.{contig_name}.jpeg"}, + } + for action, result in 
actions.items(): + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_output_files("cnvkit", action) + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_log_file() for all actions""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + actions = ("autobin", "target", "reference", "fix", "segment", "call", "bintest", "metrics", "segmetrics", "genemetrics") + base_log = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}" + for action in actions: + result = {k: base_log + f".{action}.{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", action) + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file_access(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_log_file() for access""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + base_log = "work/{mapper}.cnvkit/log/cnvkit.access" + result = {k: base_log + f".{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", "access") + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file_coverage(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_log_file() for coverage""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + base_log = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}coverage" + result = {k: base_log + f".{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", "coverage") + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file_scatter(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_log_file() for coverage""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + base_log = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}.scatter.{contig_name}" + result = {k: base_log + f".{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", "scatter") + assert actual == expected + + +# SomaticCnvCallingWorkflow -------------------------------------------------------------------------- + + +def test_somatic_cnv_calling_workflow(somatic_cnv_calling_workflow): + """Test simple functionality of the workflow""" + # Check created sub steps + expected = ["cnvkit", "link_out"] + actual = list(sorted(somatic_cnv_calling_workflow.sub_steps.keys())) + assert actual == expected + + tumor_libraries = ("P001-T1-DNA1-WGS1", "P002-T1-DNA1-WGS1", "P002-T2-DNA1-WGS1") + + expected = [] + + # cnvkit output files + tpl = "output/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" + expected += [ + tpl.format(mapper=mapper, library_name=library_name, ext=ext) + for ext in ("cnr", "segments.cns", "cns", "bintest.cns") + for library_name in tumor_libraries + for mapper in ("bwa",) + ] + + # cnvkit log files + tpl = 
"output/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}.{step}.{ext}" + expected += [ + tpl.format(mapper=mapper, library_name=library_name, step=step, ext=ext) + for ext in ("conda_info.txt", "conda_list.txt", "log", "sh") + for step in ("fix", "segment", "call", "bintest") + for library_name in tumor_libraries + for mapper in ("bwa",) + ] + + # cnvkit report files + tpl = "output/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.{step}.{ext}" + expected += [ + tpl.format(mapper=mapper, library_name=library_name, step=step, ext=ext) + for ext in ("tsv",) + for step in ("metrics", "genemetrics", "segmetrics") + for library_name in tumor_libraries + for mapper in ("bwa",) + ] + + # cnvkit plot files + tpl = "output/{mapper}.cnvkit.{library_name}/plot/{mapper}.cnvkit.{library_name}.{step}.{contig_name}.{ext}" + expected += [ + tpl.format(mapper=mapper, library_name=library_name, step=step, contig_name=contig_name, ext=ext) + for ext in ("jpeg",) + for contig_name in ["all"] + list(map(str, range(1, 23))) + ["X", "Y"] + for step in ("scatter",) + for library_name in tumor_libraries + for mapper in ("bwa",) + ] + + # Add md5 + expected += [x + ".md5" for x in expected] + expected = list(sorted(expected)) + actual = list(sorted(somatic_cnv_calling_workflow.get_result_files())) + assert actual == expected From af902d739d3e1c737f2724e9d82ef73bb40774d5 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 27 Nov 2024 14:18:01 +0100 Subject: [PATCH 37/46] feat: somatic cnv calling for cnvkit - complete - partially tested --- snappy_pipeline/models/cnvkit.py | 48 +-- .../models/{library_kit.py => common.py} | 26 +- snappy_pipeline/models/purecn.py | 293 ++++++++++++++++++ .../workflows/somatic_cnv_calling/__init__.py | 259 ++++++++-------- .../workflows/somatic_cnv_calling/model.py | 111 +++---- .../wrappers/cnvkit/call/wrapper.py | 8 +- .../wrappers/cnvkit/coverage/wrapper.py | 6 +- .../wrappers/cnvkit/fix/wrapper.py | 14 +- .../wrappers/cnvkit/plot/scatter/wrapper.py | 16 +- .../cnvkit/report/genemetrics/wrapper.py | 6 +- .../wrappers/cnvkit/report/metrics/wrapper.py | 8 +- .../cnvkit/report/segmetrics/wrapper.py | 6 +- .../wrappers/cnvkit/segment/wrapper.py | 12 +- ... 
=> test_workflows_somatic_cnv_calling.py} | 6 +- 14 files changed, 537 insertions(+), 282 deletions(-) rename snappy_pipeline/models/{library_kit.py => common.py} (62%) create mode 100644 snappy_pipeline/models/purecn.py rename tests/snappy_pipeline/workflows/{test_workflow_somatic_cnv_calling.py => test_workflows_somatic_cnv_calling.py} (99%) diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py index 3e9be3f55..837f8b984 100644 --- a/snappy_pipeline/models/cnvkit.py +++ b/snappy_pipeline/models/cnvkit.py @@ -61,32 +61,6 @@ # } -class SexOrigin(enum.StrEnum): - AUTOMATIC = "auto" - """Sex determined from the data""" - SAMPLESHEET = "samplesheet" - """Donor sex obtained from sample sheet""" - CONFIG = "config" - """Donor sex obtained from the configuration (all donors have the same sex)""" - - -class SexValue(enum.StrEnum): - MALE = "male" - FEMALE = "female" - - -class Sex(SnappyModel): - source: SexOrigin = SexOrigin.AUTOMATIC - - sample_sex: SexValue | None = None - - @model_validator(mode="after") - def ensure_valid_sex_value(self): - if self.source == SexOrigin.CONFIG and self.sample_sex is None: - raise ValueError("No definition of donors' sex from the configuration") - return self - - class SegmentationMethod(enum.StrEnum): CBS = "cbs" FLASSO = "flasso" @@ -118,24 +92,24 @@ class CallingMethod(enum.StrEnum): class Access(SnappyModel): - exclude: list[str] = [] - """Regions accessible to mapping""" - min_gap_size: int | None = None """ - Minimum gap size between accessible sequence regions. Regions separated by less than this distance will be joined together. - In WGS mode, the _target_ regions are set to the accessible regions in the genome. These accessible regions can be provided by the user, or computed by the `access` module. In the latter case, the optimal bin size is computed by the `autobin` module unless this value is provided by the user. `autobin` uses the `wgs` method _only_ if the list of excluded region is empty and if the `min_gap_size` parameter remains unassigned. If any of these conditions is not met, - or if a files of accessible regions is provided by the user, then then `amplicon` method + or if a files of accessible regions is provided by the user, then the `amplicon` method is used. It is recommended to leave the excluded regions empty and not set the `min_gap_size` parameter for WGS data, unless the accessible regions are much reduced (for example excluding all intergenic regions, repeats, low complexity, ...) """ + + exclude: list[str] = [] + """Regions accessible to mapping""" + min_gap_size: int | None = None + """Minimum gap size between accessible sequence regions. 
Regions separated by less than this distance will be joined together.""" class Target(SnappyModel): @@ -370,16 +344,6 @@ class CnvkitToReference(SnappyModel): drop_low_coverage: bool = False """Drop very-low-coverage bins before segmentation to avoid false-positive deletions in poor-quality tumor samples.""" - @model_validator(mode="after") - def ensure_males_for_reference(self): - if ( - self.male_reference - and self.sex.source == SexOrigin.CONFIG - and self.sex.sample_sex == SexValue.FEMALE - ): - raise ValueError("Male reference requested for female cohort") - return self - class Cnvkit(CnvkitToReference): fix: Fix = Fix() diff --git a/snappy_pipeline/models/library_kit.py b/snappy_pipeline/models/common.py similarity index 62% rename from snappy_pipeline/models/library_kit.py rename to snappy_pipeline/models/common.py index bd861aa72..714135b65 100644 --- a/snappy_pipeline/models/library_kit.py +++ b/snappy_pipeline/models/common.py @@ -1,5 +1,7 @@ +import enum + from typing import Annotated -from pydantic import Field +from pydantic import Field, model_validator from snappy_pipeline.models import SnappyModel @@ -27,3 +29,25 @@ class LibraryKitEntry(SnappyModel): class LibraryKit(SnappyModel): path_target_interval_list_mapping: list[LibraryKitEntry] = [] """Connects sample-based library kit in sample sheets with corresponding bed files""" + + +class SexValue(enum.StrEnum): + MALE = "male" + FEMALE = "female" + + +class SexOrigin(enum.StrEnum): + AUTOMATIC = "auto" + SAMPLESHEET = "samplesheet" + CONFIG = "config" + + +class Sex(SnappyModel): + source: SexOrigin = SexOrigin.AUTOMATIC + default: SexValue | None = None + + @model_validator(mode="after") + def ensure_default_value(self): + if self.source == SexOrigin.CONFIG and not self.default: + raise ValueError("Undefined default sex value in configuration file") + return self diff --git a/snappy_pipeline/models/purecn.py b/snappy_pipeline/models/purecn.py new file mode 100644 index 000000000..878083413 --- /dev/null +++ b/snappy_pipeline/models/purecn.py @@ -0,0 +1,293 @@ +import enum +from typing import Annotated + +from pydantic import Field, model_validator + +from snappy_pipeline.models import SnappyModel + + +# Parameters for each action & those shared between actions +# param_table = { +# "shared": { +# "genome": Enum, +# "seed": int, +# }, +# "IntervalFile": { +# "off_target": bool, +# "average_target_width": float, +# "min_target_width": float, +# "small_targets": Enum, +# "off_target_seqlevels": Enum, +# "min_mappability": list, +# "average_reptiming_width": float, +# }, +# "Coverage": { +# "keep_duplicates": bool, +# "remove_mapq0": bool, +# "skip_gc_norm": bool, +# }, +# "NormalDB": { +# "genomicsdb_af_field": str, +# "min_normals_position_specific_fit": float, +# }, +# "PureCN": { +# "sex": Enum, +# "min_af": float, +# "error": float, +# "base_quality_offset": int, +# "min_supporting_reads": int, +# "db_info_flag": str, +# "popaf_info_field": str, +# "cosmic_cnt_info_field": str, +# "min_cosmic_cnt": int, +# "interval_padding": int, +# "min_total_counts": int, +# "min_fraction_offtarget": float, +# "fun_segmentation": Enum, +# "alpha": float, +# "undo_sd": str, +# "changpoints_penalty": int, +# "additional_cmd_args": str, +# "max_segments": int, +# "min_logr_sdev": float, +# "min_purity": float, +# "max_purity": float, +# "min_ploidy": float, +# "max_ploidy": float, +# "max_copy_number": int, +# "post_optimize": bool, +# "bootstrap_n": int, +# "speedup_heuristics": int, +# "model_homozygous": bool, +# "model": Enum, +# 
"max_non_clonal": float, +# "max_homozygous_loss": list, +# }, +# } + + +class Genome(enum.StrEnum): + HG18 = "hg18" + HG19 = "hg19" + HG38 = "hg38" + MM9 = "mm9" + MM10 = "mm10" + RN4 = "rn4" + RN5 = "rn5" + RN6 = "rn6" + CANFAM3 = "canFam3" + + +class SmallTargets(enum.StrEnum): + RESIZE = "resize" + DROP = "drop" + + +class OffTargetSeqLevels(enum.StrEnum): + TARGETED = "targeted" + ALL = "all" + + +class FilterMethod(enum.StrEnum): + AMPDEL = "ampdel" + CN = "cn" + CI = "ci" + SEM = "sem" + + +class CallingMethod(enum.StrEnum): + THRESHOLD = "threshold" + CLONAL = "clonal" + NONE = "none" + + +class IntervalFile(SnappyModel): + off_target: bool = False + """Include off-target regions""" + average_target_width: int = 400 + """Split large targets to approximately that size""" + min_target_width: int = 100 + """Either resize or drop targets smaller than specified""" + small_targets: SmallTargets = SmallTargets.RESIZE + """Either 'resize' or 'drop' small targets""" + average_off_target_width: int = 200000 + """Bin off-target regions to approximately that size""" + off_target_seqlevels: OffTargetSeqLevels = OffTargetSeqLevels.TARGETED + """Controls how to deal with chromosomes/contigs not found in baits""" + mappability: Annotated[ + str, + Field( + examples=[ + "/data/cephfs-1/work/groups/cubi/projects/biotools/PureCN/hg38/GCA_000001405.15_GRCh38_no_alt_analysis_set_100.bw", + "/data/cephfs-1/work/groups/cubi/projects/biotools/PureCN/hg19/wgEncodeCrgMapabilityAlign75mer.bigWig", + ] + ), + ] = "" + """``rtracklayer``-parsable file with mappability scores in 1st metadata column""" + min_mappability: tuple[float, float, float] = (0.6, 0.1, 0.7) + """Minimum mappability for on-target, off-target and chrY regions""" + reptiming: Annotated[ + str, + Field( + examples=[ + "/data/cephfs-1/work/groups/cubi/projects/biotools/PureCN/hg19/wgEncodeUwRepliSeqK562WaveSignalRep1.bigWig", + "", + ] + ), + ] = "" + """``rtracklayer``-parsable file with replication timing scores in 1st metadata column""" + average_reptiming_width: int = 100000 + """Average the replication timing data into bins of the specified size""" + exclude: str | None = None + """File parsable by rtracklayer specifying baits that should be excluded from baits file""" + + +class Coverage(SnappyModel): + keep_duplicates: bool = False + """SCount reads marked as duplicates""" + remove_mapq0: bool = False + """Not count reads marked with mapping quality 0""" + skip_gc_norm: bool = False + """Skips GC-normalization""" + + +class NormalDB(SnappyModel): + genomicsdb_af_field: str = "AF" + """Info field name where the allelic fraction is stored""" + min_normals_position_specific_fit: float = 10.0 + """Only change if you know what you are doing""" + + +class PureCNBase(SnappyModel): + genome: Genome + """Genome version. One of hg18, hg19, hg38, mm9, mm10, rn4, rn5, rn6, canFam3""" + seed: int | None = None + """Seed for random number generator""" + + +class PureCNPon(PureCNBase): + intervals: IntervalFile = IntervalFile() + normaldb: NormalDB = NormalDB() + coverage: Coverage = Coverage() + + +class Variant(SnappyModel): + min_af: float = 0.03 + """minimum allelic fraction""" + snp_blacklist: str | None = None + """File parsable by rtracklayer that defines blacklisted regions""" + error: float = 0.001 + """Estimated default sequencing error rate for artifact filtering. 
+
+
+class Variant(SnappyModel):
+    min_af: float = 0.03
+    """Minimum allelic fraction"""
+    snp_blacklist: str | None = None
+    """File parsable by rtracklayer that defines blacklisted regions"""
+    error: float = 0.001
+    """Estimated default sequencing error rate for artifact filtering. Can be overridden by base quality scores"""
+    base_quality_offset: int = 1
+    """Subtracts the specified value from the base quality score"""
+    min_supporting_reads: int | None = None
+    """Instead of calculating the min. number of supporting reads, use the specified one"""
+    db_info_flag: str = "DB"
+    """VCF INFO flag indicating presence in common germline databases"""
+    popaf_info_field: str = "POP_AF"
+    """VCF INFO field providing population allele frequency"""
+    cosmic_cnt_info_field: str = "Cosmic.CNT"
+    """VCF INFO field providing counts in the Cosmic database"""
+    cosmic_vcf_file: str | None = None
+    """Adds a Cosmic.CNT INFO annotation using a Cosmic VCF. Added for convenience; we recommend adding annotations upstream"""
+    min_cosmic_cnt: int = 6
+    """Min number of COSMIC hits"""
+    interval_padding: int = 50
+    """Keep variants in the flanking region of the specified size"""
+
+
+class IntervalFilter(SnappyModel):
+    min_total_counts: int = 100
+    """Keep only intervals with at least that many counts in both tumor and (tangent) normal"""
+    min_fraction_offtarget: float = 0.05
+    """Ignore off-target intervals when only the specified fraction of all intervals are off-target intervals"""
+
+
+class SegmentationMethod(enum.StrEnum):
+    CBS = "CBS"
+    PSCBS = "PSCBS"
+    GATK4 = "GATK4"
+    HCLUST = "Hclust"
+
+
+class Segmentation(SnappyModel):
+    enabled: bool = True
+    method: SegmentationMethod = SegmentationMethod.CBS
+    alpha: float = 0.005
+    """Significance of breakpoints"""
+    undo_sd: str | None = None
+    """DNAcopy undo.SD argument. If None, tries to find a sensible default"""
+    changepoints_penalty: float | None = None
+    """GATK4 ModelSegments --number-of-changepoints-penalty-factor argument. If None, tries to find a sensible default"""
+    additional_cmd_args: str = ""
+    """Used in GATK4 segmentation function to add additional ModelSegments arguments"""
+    max_segments: int = 300
+    """Flag noisy samples with many segments"""
+    min_logr_sdev: float = 0.15
+    """Set minimum log-ratio standard deviation to this value. Useful when uncorrected biases exceed the log-ratio noise"""
+
+    seg_file: str | None = None
+    """External segmentation file (from cnvkit, for example)"""
+
+    @model_validator(mode="after")
+    def ensure_args_gatk4(self):
+        if self.changepoints_penalty or self.additional_cmd_args:
+            if self.method != SegmentationMethod.GATK4:
+                raise ValueError(
+                    "Segmentation method 'GATK4' must be selected when parameters 'changepoints_penalty' or 'additional_cmd_args' are set"
+                )
+        return self
+
+    @model_validator(mode="after")
+    def ensure_segmentation(self):
+        if self.enabled and self.seg_file is not None:
+            raise ValueError("Segmentation cannot be enabled when a segmentation file is provided")
+        if not self.enabled and not self.seg_file:
+            raise ValueError("Segmentation must be either enabled or provided using 'seg_file'")
+        return self
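
The two validators above make ``enabled`` and ``seg_file`` mutually exclusive: the segmentation is either
computed by PureCN or supplied externally, never both and never neither. A sketch of the resulting
behaviour (assuming pydantic v2 wraps the ``ValueError`` raised in a model validator into a
``ValidationError``)::

    from pydantic import ValidationError

    Segmentation()  # valid: segmentation computed by PureCN
    Segmentation(enabled=False, seg_file="/path/to/tumor.seg")  # valid: external segmentation
    for bad in ({"seg_file": "/path/to/tumor.seg"}, {"enabled": False}):
        try:
            Segmentation(**bad)
        except ValidationError as err:
            print(err)  # both combinations are rejected by the validators above
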
+
+
+class Model(enum.StrEnum):
+    BETA = "beta"
+    BETABIN = "betabin"
+
+
+class PureCN(PureCNBase):
+    min_mapq: int = 0
+    """Minimum mapping quality score (phred scale 0-60) to count a read for coverage depth."""
+    min_purity: float = 0.15
+    """Minimum considered purity"""
+    max_purity: float = 0.95
+    """Maximum considered purity"""
+    min_ploidy: float = 1.4
+    """Minimum considered ploidy"""
+    max_ploidy: float = 6.0
+    """Maximum considered ploidy"""
+    max_copy_number: int = 7
+    """Maximum allele-specific integer copy number"""
+    post_optimize: bool = False
+    """Post-optimization"""
+    bootstrap_n: int = 0
+    """Number of bootstrap replicates"""
+    speedup_heuristics: float = 2.0
+    """Tries to avoid spending computation time on unlikely local optima"""
+    homozygous_model: bool = False
+    """Model homozygous variants in very pure samples. Should be 'model_homozygous', but model_* doesn't play well with pytest"""
+    fit_model: Model = Model.BETA
+    """Model used to fit variants. Either beta or betabin. Should be 'model', but model_* doesn't play well with pytest"""
+    log_ratio_calibration: float = 0.1
+    """Parameter defining the extent to which log-ratios might be miscalibrated"""
+    max_non_clonal: float = 0.2
+    """Maximum genomic fraction assigned to a subclonal copy number state"""
+    max_homozygous_loss: tuple[float, float] = (0.05, 10000000.0)
+    """Maximum genomic fraction assigned to a complete loss and maximum size of a loss in bp"""
+
+    log_ratio_file: str | None = None
+    """External log2 copy number ratio file"""
+
+    # TODO: allow PureCN to merge all tumors from the same donor
+    additional_tumors: list[str] = []
+    """Tumor coverages from additional biopsies from the SAME patient, GC-normalized"""
+
+    interval_filter: IntervalFilter = IntervalFilter()
+    segmentation: Segmentation = Segmentation()
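
Taken together, this defines the complete PureCN parameter block, with nested sub-models for interval
filtering and segmentation. A hypothetical round-trip through the defaults (assuming standard pydantic
v2 ``model_dump`` behaviour; only ``genome`` has no default and must be given)::

    cfg = PureCN(genome="hg38")
    defaults = cfg.model_dump(exclude_none=True)
    assert defaults["min_purity"] == 0.15 and defaults["max_ploidy"] == 6.0
    # Nested blocks come out as plain dicts, ready for YAML serialization
    assert defaults["segmentation"]["method"] == "CBS"
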
diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
index 7397391ef..84eba37a8 100644
--- a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
+++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
@@ -2,13 +2,12 @@
 """Implementation of the ``somatic_cnv_calling`` step
 
 This step allows for the detection of CNV events for cancer samples from targeted sequencing (e.g.,
-exomes or large panels) or whole genome sequencing.
+exomes or large panels) or whole genome sequencing. Panel sequencing is not implemented yet; it may be added in a later release.
 
 The wrapped tools start from the aligned reads (thus off ``ngs_mapping``) and generate CNV
 calls for somatic variants.
 
 The wrapped tools implement different strategies. Some work "reference free" and just use the somatic BAM files for their input, some work in "matched cancer normal mode" and need the cancer
-and normal BAM files, others again cancer BAM files, and additionally a
-set of non-cancer BAM files for their background (the panel of normals).
+and normal BAM files, and finally others use a set of non-cancer BAM files for their background (the panel of normals).
 Some tools may also use germline & somatic variants to estimate allele-specific copy number changes,
 and resolve loss-of-heterozygosity.
 In this case, the small variants need to be computed separately from the ``somatic_variant_calling`` step.
@@ -25,45 +24,53 @@
 
 Tools that use panel of normals can obtain their input in two different ways:
 
-- A static file, from another cohort or from public datasets.
+- A static file or files, from another cohort or from public datasets.
   In this case, the user is responsible for making sure that the data & methods used to create the panel
   are compatible with the cohort's.
 
 - The ``panel_of_normals`` step.
-  The panel will be created if necessary, using the same conditions that for the cohort (genome release, exome kit assignment, ...)
+  The panel will be created if necessary, using data from normal samples from the cohort.
 
-When requested, the optional germline and somatic small variant calls are created using a modified version of the ``somatic_variant_calling`` step.
-The ``somatic__cnv_calling`` step generates the small variants (TODO: how exactly) and stores them (TODO: where exactly).
+When requested, the optional germline and somatic small variant calls are created in the ``somatic_variant_calling`` step.
+Once again, it is the responsibility of the user to make sure that variants created in that way are suitable for CNV calling.
 
 Likewise, purity estimations can be automatically computed by the ``somatic_cnv_calling`` step,
-to supplement or replace the estimations that may be provided in the samplesheet.
+when estimations are not provided in the samplesheet or configuration file.
 
 ===========
 Step Output
 ===========
 
 TODO: The whole section of output needs revision. Main question is: what is the best format to encode CNAs?
+``vcf`` is a possibility; the benefit is a (more or less) well-defined format, but the major drawback is
+that (as far as I know), most CNV analysis tools (from ``R`` in particular) don't recognize this format (for CNAs).
 
-There is no widely used standard to report copy number alterations.
-In absence of a better solution, all CNV tools implemented in somatic pipeline output the segmentation table loosely following the `DNAcopy format `_.`
-The copy number call may or may not be present, and the chromosome number is replaced by its name.
-The segmentation output is in file ``output/../out/.._dnacopy.seg``.
+Currently, the only implemented tool is ``cnvkit``. Therefore, the ``cnvkit`` output is left as produced by the software.
+
+------
+cnvkit
+------
+
+The structure of the output is:
 
 ::
 
     output/
     +-- bwa.cnvkit.P001-N1-DNA1-WES1
     |   |-- out
-    |   |   |-- bwa.cnvkitP001-N1-DNA1-WES1_dnacopy.seg
+    |   |   |-- bwa.cnvkit.P001-N1-DNA1-WES1.
    [...]
 
-Note that tool ``cnvetti`` doesn't follow the snappy convention above:
-the tool name is followed by an underscore & the action, where the action is one of ``coverage``, ``segment`` and ``postprocess``.
-For example, the output directory would contain a directory named ``bwa.cnvetti_coverage.P002-T1-DNA1-WES1``.
+There are 4 main outputs (a loading sketch follows the list):
 
-.. note:: Tool-Specific Output
 
+- The ratios file (extension ``.cnr``) contains the ratio between the tumor coverage and the expected (reference)
+  coverage in each bin, on a logarithmic scale.
+  This can be used to examine the data or experiment with different segmentation algorithms.
+- The segments file (extension ``.segments.cns``) contains the output of the segmentation. A single log2 ratio value is
+  attributed to each segment.
+  The segmentation covers most of the genome accessible to mapping.
+- The calls file (extension ``.calls.cns``) contains only the non-diploid segments, called after thresholding.
+- The results of differential coverage tests by bins (extension ``.bintest.cns``). Only significant tests are listed.
 
-    Each tool produces its own set of outputs, generally not in standard format.
-    Some of these files are linked from ``work`` to ``output``, but not necessarily all of them.
-    Some tools (for example ``cnvkit``) also produces a report, with tables and figures.
+Reports & plots are also available at the user's request, found in ``report`` and ``plot`` sub-directories.
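
The ``.cnr`` & ``.cns`` files are plain tab-separated tables, so they are easy to inspect outside of
``cnvkit``. A minimal sketch (the file name is hypothetical; the column names ``chromosome``, ``start``,
``end``, ``gene`` & ``log2`` follow the usual ``cnvkit`` conventions)::

    import pandas as pd

    # Load the bin-level log2 ratios produced by `cnvkit.py fix`
    cnr = pd.read_csv("bwa.cnvkit.P001-T1-DNA1-WES1.cnr", sep="\t")
    # Bins whose log2 ratio suggests a single-copy gain in a diploid genome
    gains = cnr[cnr["log2"] > 0.3]
    print(gains[["chromosome", "start", "end", "gene", "log2"]].head())
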
 
 
 =====================
@@ -72,27 +79,24 @@
 
 The default configuration is as follows.
 
-.. include:: DEFAULT_CONFIG_somatic_targeted_seq_cnv_calling.rst
+.. include:: DEFAULT_CONFIG_somatic_cnv_calling.rst
 
 =====================================
 Available Somatic Targeted CNV Caller
 =====================================
 
 - ``cnvkit`` (for both WGS & WES)
-- ``sequenza`` (only WES)
-- ``purecn`` (only WES)
-- ``Control-FREEC`` (only WGS - this tools might not be supported)
 
-================================
-Logic of the step for ``cnvkit``
-================================
+===========================================
+Description of the step for ``cnvkit`` tool
+===========================================
 
 --------
 Overview
 --------
 
 ``cnvkit`` was designed to call CNV on whole exome data. It has the concept of _targets_ (the regions enriched by the exome kit),
-and the _antitargets_ (those regions outside of enrichment).
+and the _antitargets_ (those regions accessible for CNV calling, but outside of enrichment).
 The coverage of _targets_ and _antitargets_ is expected to be very different,
 but there is still information to be gained in the _antitarget_ regions,
 albeit at a much lower resolution than for _target_ regions.
@@ -100,67 +104,93 @@
 ``cnvkit`` was later used with some success on whole genome data.
 WGS data was defined as _target_ regions covering the whole genome, with empty _antitarget_ regions.
 
------------------------
-Sample-independent files
------------------------
-
-``cvnkit`` allows the user to define _accessible_ regions (_via_ the ``access`` bed file).
-This excludes repeats, low complexity or PAR regions, that cannot be properly mapped, and therefore used for CNV calling.
-
-For exome data, the _target_ regions are supposed to be well curated, so they are not affected by the _access_ regions.
-The _antitarget_ regions, however, are only defined within _accessible_ regions.
-For WGS data, the _antitarget_ regions are empty, and the _target_ regions are set to the _accessible_ regions, when present.
-Even in the absence of user-defined _accessible_ regions, the _target_ and _antitarget_ regions will not contain long ``N`` sequences.
-
-Finally, the pipeline builds separates ``bed`` files for _target_ and _antitarget_ regions, for each exome kit present in the cohort,
-and for WGS data if there is any.
-
----------
-Reference
----------
-
-The ``cnvkit`` authors recommend to use a panel of normals to normalize the coverage over bins.
-This is usually created by running the ``panel_of_normals`` step.
-The ``somatic_cnv_calling`` step will create a reference (panel of normals) if requested.
-Otherwise, it is possible to use references created for different cohorts, but the user
-must ensure that the data & methods used for the current cohort and to create the reference are compatible.
-In particular, the exome enrichment kit must be identical, and the sex of the donors should be
-similar (not to use a female-only reference for a male cohort, for example).
-
-If there are not enough normal samples to create such a reference, the corresponding normal sample
-can be used, in a normal/tumor pair setting similar to the somatic small variant calling situation.
+.. tip:: For Agilent kits, the ``cnvkit`` authors recommend to use baits as targets. Baits are in the ``*_Covered.bed`` file.
+
+---------------------------------
+Regions accessible to CNV calling
+---------------------------------
+
+``cnvkit`` needs to know about the regions of the genome accessible to CNV calling.
+Typically, regions masked with ``N`` are excluded, but a user may also want to exclude
+repeats, segmental duplications, or low complexity regions.
+
+There are multiple ways to get this information:
+
+1. The user can provide a ``bed`` file detailing accessible regions, using the ``path_access`` option in the ``access`` part of the configuration.
+2. The user can specifically exclude regions using the ``exclude`` option in the ``access`` part of the configuration.
+   In this case, the pipeline will create an accessible regions file from the whole genome and the excluded parts.
+3. Otherwise, the pipeline creates the accessible regions file from the reference genome sequence, by removing
+   only parts masked with ``N``.
+
+-----------------------
+The reference coverage
+-----------------------
+
+``cnvkit`` builds a reference coverage to compensate for locus-specific effects when assessing coverage changes in tumor samples.
+Because this reference is best constructed from multiple normal samples, the pipeline implements it under the
+``panel_of_normals`` section.
+The pipeline offers 4 different ways to build this reference (a configuration sketch follows the list):
+
+1. ``cohort``: the reference is taken from the pipeline's ``panel_of_normals`` step.
+   If it isn't there, it will be created, according to its configuration (which must be present).
+   The ``cnvkit`` authors suggest that `10 to 20 normal samples `_
+   are sufficient to build a good reference. When there are more samples, the selected ones should be taken from those with average file size.
+2. ``file``: the reference is taken from another panel of normals, possibly from another cohort or from public data.
+   Besides the reference coverage itself, the target and antitarget bed files must also be provided.
+   Note that it is the user's responsibility to make sure that the panel of normals is suitable for the cohort.
+3. ``paired``: the reference is built only from one normal sample, paired with the tumor.
+   It is _not_ recommended by the ``cnvkit`` authors, but can be beneficial in some circumstances
+   (for example experimental designs with treated cell lines).
+4. ``flat``: a _flat_ reference is computed, which discards locus-specific effects.
+   It should only be used when there are no normals or suitable panel, or for benchmarking purposes.
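
A hypothetical configuration sketch for the four sources, expressed with the ``PanelOfNormalsCnvkit``
model defined later in this patch (field names as in ``model.py``; all paths are illustrative only,
and validator details follow the definitions in ``model.py``)::

    # cohort: use (and possibly build) the pipeline's own panel of normals
    PanelOfNormalsCnvkit(source="cohort", path_panel_of_normals="../panel_of_normals")
    # file: a previously built panel; targets & antitargets must accompany the reference
    PanelOfNormalsCnvkit(
        source="file",
        path_panel_of_normals="/public/pon/reference.cnn",
        path_targets="/public/pon/targets.bed",
        path_antitargets="/public/pon/antitargets.bed",
    )
    # paired: the matched normal serves as the reference
    PanelOfNormalsCnvkit(source="paired")
    # flat: no normals at all, locus-specific effects are discarded
    PanelOfNormalsCnvkit(source="flat")
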
+
+------------------
+WGS-specific notes
+------------------
+
+In WGS mode, the _antitarget_ regions (defined as accessible regions without target regions) are empty.
+
+But _target_ regions need to be carefully selected: the size of the bins used to identify CNAs must be
+chosen so that discovery of focal events is possible.
+When the reference is taken from the current cohort, or from another panel of normals, the target regions
+are taken from that panel of normals, and no other consideration is necessary.
+Otherwise, the bin size must be provided by the user (configuration option ``avg_size`` in the ``target`` section),
+or computed from the available data.
+
+In the latter case, the algorithm is as follows:
+
+- When no normal sample data is available, then ``cnvkit``'s default value is used.
+- Otherwise, the value is computed by ``cnvkit``'s ``autobin`` module.
+  It is run in ``wgs`` mode if the access regions cover the complete genome
+  (no access file provided, no exclude files, and the ``min_gap_size`` parameter not set),
+  else it is run in ``amplicon`` mode.
+  Note that the latter should only be used when user-defined accessible regions are quite restricted
+  (limited to protein-coding exons devoid of repeats or low complexity regions, for example).
+
+-----------------------------
+Other notes on implementation
+-----------------------------
+
+.. note:: CNA calling on panel data is not implemented yet, even though ``cnvkit`` allows it in principle.
+
+.. note:: The current pipeline tries to replicate the behaviour of the ``batch`` module of ``cnvkit``,
+    while keeping the flexibility to diverge from it.
+    In particular, the possibility of obtaining the reference coverage from the paired normal is implemented.
+
+.. note:: The current implementation doesn't allow mixing multiple exome enrichment kits.
+    Future versions will hopefully lift this restriction. However, mixing WES, WGS & possibly panel data is
+    more challenging, and is not on the roadmap for future improvements.
 
-In case no normals are available at all, a flat prior can be used.
-
-------------
-Calling CNVs
-------------
-
-The _target_ and _antitarget_ ``bed`` files created in the earlier sub-steps are used as input,
-based on the exome kit (or WGS status).
-
-The coverage is computed for the tumor sample, and normalised using the reference.
-As seen previously, the reference can be either exome kit-based, or sample-specific.
-
-The normalised coverage is the segmented, and copy numbers are called, optionally using
-small variants and/or purity estimates.
-
-If B-allele fractions are used, the pipeline will create the small variants, only for samples
-with a corresponding normal.
-If purity is used, the user can choose to override the values in the sample sheet (when present)
-with the output of the tool of her choice.
""" import os import os.path import re -from copy import deepcopy -from enum import Enum -from typing import Callable, Iterator, Iterable, NamedTuple, Any +from typing import Callable, NamedTuple, Any -from biomedsheets.models import BioEntity, BioSample, TestSample, NGSLibrary -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background -from biomedsheets.io_tsv.base import LIBRARY_TYPES, LIBRARY_TO_EXTRACTION, EXTRACTION_TYPE_DNA +from biomedsheets.models import NGSLibrary +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions +from biomedsheets.io_tsv.base import LIBRARY_TO_EXTRACTION, EXTRACTION_TYPE_DNA from snakemake.io import OutputFiles, Wildcards, InputFiles from snappy_pipeline.utils import dictify @@ -172,37 +202,18 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from snappy_pipeline.models.common import Sex, SexOrigin, SexValue from snappy_pipeline.models.cnvkit import SegmentationMethod as CnvkitSegmentationMethod from .model import SomaticCnvCalling as SomaticCnvCallingConfigModel from .model import Cnvkit as CnvkitConfig -from .model import Sex, SexOrigin, SexValue, PanelOfNormalsOrigin, PurityOrigin, VariantOrigin +from .model import PanelOfNormalsOrigin, PurityOrigin, VariantOrigin __author__ = "Eric Blanc " #: Default configuration for the somatic_targeted_seq_cnv_calling step DEFAULT_CONFIG = SomaticCnvCallingConfigModel.default_config_yaml_string() -#: JSON key for "isCancer" -KEY_IS_CANCER = "isCancer" - -#: Value for "libraryType" is whole exome sequencing -VALUE_WES = "WES" - -#: Value for "libraryType" is panel sequencing -VALUE_PANEL = "Panel-seq" - -#: Values for targeted sequencing -VALUES_TARGETED_SEQ = (VALUE_WES, VALUE_PANEL) - -#: Standard key/extension values for BCF files -BCF_KEY_EXTS = ( - ("bcf", ".bcf"), - ("bcf_md5", ".bcf.md5"), - ("bcf_csi", ".bcf.csi"), - ("bcf_csi_md5", ".bcf.csi.md5"), -) - class SomaticCnvCallingStepPart(BaseStepPart): """Shared code for all caller classes in somatic_targeted_seq_cnv_calling""" @@ -210,19 +221,6 @@ class SomaticCnvCallingStepPart(BaseStepPart): def __init__(self, parent: "SomaticCnvCallingWorkflow"): super().__init__(parent) - @staticmethod - @dictify - def _get_log_file_from_prefix(prefix: str) -> Iterator[dict[str, str]]: - key_ext = ( - ("log", ".log"), - ("sh", ".sh"), - ("conda_info", ".conda_info.txt"), - ("conda_list", ".conda_list.txt"), - ) - for key, ext in key_ext: - yield key, prefix + ext - yield key + "_md5", prefix + ext + ".md5" - class CnvKitStepPart(SomaticCnvCallingStepPart): """Perform somatic targeted CNV calling using cnvkit""" @@ -273,9 +271,7 @@ def __init__(self, parent: SomaticCnvCallingStepPart): self.tumors = {x.library.name: x for x in self.parent.tumors[self.libraryKit]} self.cfg: CnvkitConfig = self.config.get(self.name) - self.pon_source = ( - self.cfg.panel_of_normals.source if self.cfg.panel_of_normals.enabled else None - ) + self.pon_source = self.cfg.panel_of_normals.source self._set_cnvkit_pipeline_logic() @@ -311,16 +307,16 @@ def _set_cnvkit_pipeline_logic(self): Flat: based on targets & antitargets only Cohort: from panel_of_normals step File: from another cohort or public data (reference + target + antitarget [WES only]) - Paired (panel of normal disabled): reference built from the target & antitarget coverage of one normal sample only (paired with the tumor) + Paired: reference built from the target & antitarget coverage of one normal sample only (paired with the tumor) Therefore, a 
reference must be created for flat & paired choices (one reference per normal sample in the latter case). The logic to create the reference is (panel of normal is pon): - - access created if path_access is missing or average target size estimated + - access created if path_access is missing or average target size must be estimated - average target size estimated if value not in config and dataset is WGS - target created always - antitarget created when dataset is WES """ - self.paired = not self.cfg.panel_of_normals.enabled + self.paired = self.pon_source == PanelOfNormalsOrigin.PAIRED self.build_ref = self.paired or self.pon_source == PanelOfNormalsOrigin.FLAT self.compute_avg_target_size = ( self.is_wgs and self.paired and self.cfg.target.avg_size is None @@ -420,7 +416,7 @@ def get_output_files(self, action: str): "dataframe": self.base_out_lib + "rds", } case "call": - output_files = {"calls": self.base_out_lib + "cns"} + output_files = {"calls": self.base_out_lib + "calls.cns"} case "bintest": output_files = {"tests": self.base_out_lib + "bintest.cns"} case "metrics": @@ -502,7 +498,7 @@ def get_result_files(self, library_name: str, mapper: str) -> list[str]: result_files = [] - for suffix in ("cnr", "segments.cns", "cns", "bintest.cns"): + for suffix in ("cnr", "segments.cns", "calls.cns", "bintest.cns"): result_files.append(base_out_lib + suffix) actions_to_log = ("fix", "segment", "call", "bintest") @@ -1058,7 +1054,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) "libraryKit", ) - self.matched_normal = self._match_normals() + self.matched_normal = self._match_normals(self.valid_dna_libraries) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -1099,10 +1095,10 @@ def get_result_files(self) -> OutputFiles: return OutputFiles(fns) - def _match_normals(self): + def _match_normals(self, valid_dna_libraries: list[LibraryInfo]) -> dict[str, str]: normals = SomaticCnvCallingWorkflow._split_by( SomaticCnvCallingWorkflow._filter_by( - self.valid_dna_libraries.values(), "is_tumor", lambda x: not x + valid_dna_libraries.values(), "is_tumor", lambda x: not x ), "libraryKit", ) @@ -1121,11 +1117,14 @@ def _match_normals(self): len(normal) < 2 ), f"Muliple valid donor samples for tumor library {sample.library.name}" if normal: + assert ( + normal[0].sex == sample.sex + ), f"Normal & tumor samples {normal[0].library.name} & {sample.library.name} from donor {donor} have different sex" normal_library = normal[0].library matched_normal[sample.library.name] = normal_library.name return matched_normal - def _optionally_register_subworkflow(self, subworkflow): + def _optionally_register_subworkflow(self, subworkflow: str): for tool in set(self.config.tools.wgs + self.config.tools.wes): assert self.config.get(tool) is not None, f"Requested tool '{tool}' not configured" cfg = self.config.get(tool) @@ -1159,7 +1158,7 @@ def _get_dna_libraries(sheet) -> dict[str, LibraryInfo]: purity = bio_sample.extra_infos.get("purity", None) ploidy = bio_sample.extra_infos.get("ploidy", 2) else: - purity = None + purity = 0 ploidy = 2 for test_sample in bio_sample.test_samples.values(): if ( @@ -1189,9 +1188,9 @@ def _get_dna_libraries(sheet) -> dict[str, LibraryInfo]: is_tumor, libraryType, libraryKit, + sex, purity, ploidy, - sex, ) return valid_dna_libraries diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_cnv_calling/model.py index f86472ba4..2d3aef2d9 100644 --- 
a/snappy_pipeline/workflows/somatic_cnv_calling/model.py +++ b/snappy_pipeline/workflows/somatic_cnv_calling/model.py @@ -1,11 +1,15 @@ import enum import typing from typing import Annotated -from pydantic import Field, model_validator # , validator +from pydantic import Field, model_validator from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel from snappy_pipeline.models.cnvkit import Cnvkit as CnvkitGeneric -from snappy_pipeline.models.library_kit import LibraryKitEntry +from snappy_pipeline.models.purecn import IntervalFilter +from snappy_pipeline.models.purecn import Segmentation as PureCNSegmentation +from snappy_pipeline.models.purecn import PureCN as PureCNBase +from snappy_pipeline.models.purecn import Variant as PureCNVariantParams +from snappy_pipeline.models.common import LibraryKitEntry, Sex class WgsCaller(enum.StrEnum): @@ -33,28 +37,6 @@ class SequencingMethod(enum.StrEnum): WGS = "wgs" -class SexValue(enum.StrEnum): - MALE = "male" - FEMALE = "female" - - -class SexOrigin(enum.StrEnum): - AUTOMATIC = "auto" - SAMPLESHEET = "samplesheet" - CONFIG = "config" - - -class Sex(SnappyModel): - source: SexOrigin = SexOrigin.AUTOMATIC - default: SexValue | None = None - - @model_validator(mode="after") - def ensure_default_value(self): - if self.source == SexOrigin.CONFIG and not self.default: - raise ValueError("Undefined default sex value in configuration file") - return self - - class PanelOfNormalsOrigin(enum.StrEnum): COHORT = "cohort" """Use (& possibly create) a panel of normals from the current cohort of normals in the panel_of_normals step""" @@ -65,23 +47,19 @@ class PanelOfNormalsOrigin(enum.StrEnum): FLAT = "flat" """Use a flat panel of normal (no panel of normals, actually)""" + PAIRED = "paired" + """Use the paired normal as reference (no panel of normal, actually)""" + class PanelOfNormals(SnappyModel): enabled: bool = False """Use panel of normals during CNV calling""" - source: PanelOfNormalsOrigin = PanelOfNormalsOrigin.FILE - """Which type of panel of normals should be used""" - - path_panel_of_normals: str = "" - """ - Path to panel of normals. - - The panel of normals can be either a file (typically from another project, or from the software's own data), - or the path to the pipeline's ```panel_of _normals``` step, depending on the choice of source. + source: PanelOfNormalsOrigin = PanelOfNormalsOrigin.COHORT + """Which type of panel of normals should be used, cohort is generally recommended""" - Note that there is no test that the panel of normals is suitable for that cohort. - """ + path_panel_of_normals: str = "../panel_of_normals" + """Path to panel of normals (used for cohort & file sources)""" @model_validator(mode="after") def ensure_panel_of_normals_path(self): @@ -111,13 +89,22 @@ class Variant(SnappyModel): """Use variants (somatic &/or germline) to improve CNV calling""" source: VariantOrigin = VariantOrigin.FILE - """Where are the variants obrained from""" + """Where are the variants obtained from""" - path_somatic_variant_calling: str = "" + path_somatic_variant_calling: Annotated[ + str, + Field( + examples=[ + "../somatic_variant_calling", + "../somatic_variant_calling_for_CNV", + "/public_data/common_variants.vcf.gz", + ] + ), + ] = "" """ Path to the variants to use for CNV calling. 
- The path can be either to the ```somatic_variant_calling``` step in the pipeline, if "cohort" is selected, + The path can be either to the ``somatic_variant_calling`` step in the pipeline, if "cohort" is selected, or to the vcf file with the variants when "file" is selected as source. """ @@ -145,7 +132,11 @@ class ControlFreec(SnappyModel): pass -class PureCn(SnappyModel): +class VariantPureCN(Variant, PureCNVariantParams): + pass + + +class PureCN(PureCNBase): panel_of_normals: PanelOfNormals = PanelOfNormals() """ Panel of normals created by the NormalDB.R script. @@ -159,42 +150,16 @@ def restrict_pon_mode(self) -> typing.Self: return self path_target_interval_list_mapping: list[LibraryKitEntry] = [] - """ - Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect) - """ + """Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect)""" - somatic_variant_calling: Variant = Variant() - - mappability: str = "" - """ - GRCh38: - /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw - """ - - reptiming: str = "" - """Nothing for GRCh38""" + sample_sex: Sex = Sex() - seed: int = 1234567 - extra_commands: typing.Dict[str, typing.Any] = { - "model": "betabin", - "fun-segmentation": "PSCBS", - "post-optimize": "", - } - """Recommended extra arguments for PureCN, extra_commands: {} to clear them all""" + somatic_variant_calling: VariantPureCN = VariantPureCN() path_container: Annotated[ str, Field(examples=["../panel_of_normals/work/containers/out/purecn.simg"]) - ] - """Conda installation not working well, container is required""" - - path_intervals: Annotated[ - str, - Field( - examples=[ - "../panel_of_normals/output/purecn/out/_.list" - ] - ), - ] + ] = "" + """Conda installation not working well, container is required. 
When missing the container is downloaded""" class PurityTool(enum.StrEnum): @@ -240,6 +205,8 @@ def ensure_valid_params_for_source(self): class PanelOfNormalsCnvkit(PanelOfNormals): + enabled: bool = True + """Reset enabled value, cnvkit always needs a panel of normal (even flat or paired)""" path_targets: str | None = None """Path to target file (used only when pon is obtained from file, taken from pipeline step otherwise)""" path_antitargets: str | None = None @@ -247,7 +214,7 @@ class PanelOfNormalsCnvkit(PanelOfNormals): @model_validator(mode="after") def ensure_paths_target_antitarget(self): - if self.enabled and self.source == PanelOfNormalsOrigin.FILE: + if self.source == PanelOfNormalsOrigin.FILE: if self.path_targets is None or self.path_antitargets is None: raise ValueError( "When using a previous pon, target & antitarget files must be defined" @@ -294,6 +261,6 @@ class SomaticCnvCalling(SnappyStepModel): """Tools for WGS & WES data""" cnvkit: Cnvkit | None = None - purecn: PureCn | None = None + purecn: PureCN | None = None sequenza: Sequenza | None = None control_freec: ControlFreec | None = None diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py index ca0769c0b..e6bfbd2ae 100644 --- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py @@ -21,8 +21,8 @@ if "variants" in args: variants = r""" ---vcf {args[variants]} \ - --sample-id {args['sample-id']} --normal-id {args['normal-id']} \ - --min-variant-depth {args['min-variant-depth']} {zygocity_freq} + --sample-id {args[sample-id]} --normal-id {args[normal-id]} \ + --min-variant-depth {args[min-variant-depth]} {zygocity_freq} """.format( snakemake=snakemake, args=args, @@ -34,12 +34,12 @@ cmd = r""" cnvkit.py call \ -o {snakemake.output.calls} \ - --method {args['method']} {thresholds} \ + --method {args[method]} {thresholds} \ {filter} \ {center} {center_at} {drop_low_coverage} {sample_sex} {male_reference} \ {purity} {ploidy} \ {variants} \ - {args['segments']} + {args[segments]} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py index cef6277bd..303127160 100644 --- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py @@ -20,9 +20,9 @@ cmd = r""" cnvkit.py coverage --processes {snakemake.resources._cores} \ -o {snakemake.output.coverage} \ - --fasta {args['reference']} \ - --min-mapq {args[min_mapq]} {count} \ - {args['bam']} {args['intervals']} + --fasta {args[reference]} \ + --min-mapq {args[min-mapq]} {count} \ + {args[bam]} {args[intervals]} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py index 554a87222..a5b56c8e4 100644 --- a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py @@ -17,19 +17,25 @@ args = snakemake.params.get("args", {}) +# Fix requires empty antitarget file in WGS & Panel modes +create_dummy_antitarget = "" +if args.get("antitarget", "") == "": + args["antitarget"] = "$TMPDIR/antitarget.bed" + create_dummy_antitarget = f"touch {args['antitarget']} ; " + cmd = r""" cnvkit.py fix \ -o {snakemake.output.ratios} \ - {cluster} --sample-id {args['sample-id']} \ + {cluster} --sample-id {args[sample-id]} \ {no_gc} {no_edge} {no_rmask} \ - {args['target']} {antitarget} 
{args['reference']} + {args[target]} {args[antitarget]} {args[reference]} """.format( snakemake=snakemake, + args=args, cluster="--cluster" if args.get("cluster", False) else "", no_gc="--no-gc" if args.get("no-gc", False) else "", no_edge="--no-edge" if args.get("no-edge", False) else "", no_rmask="--no-rmask" if args.get("no-rmask", False) else "", - antitarget=f"{args['antitarget']}" if "antitarget" in args else "", ) -CnvkitWrapper(snakemake, cmd).run() +CnvkitWrapper(snakemake, create_dummy_antitarget + cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py index a386a2dec..c4b475f1b 100644 --- a/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py @@ -8,7 +8,7 @@ # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its # own conda environment which cannot see the snappy_pipeline installation. -base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")) sys.path.insert(0, base_dir) from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper @@ -18,8 +18,8 @@ if "variants" in args: variants = r""" ---vcf {args[variants]} \ - --sample-id {args['sample-id']} --normal-id {args['normal-id']} \ - --min-variant-depth {args['min-variant-depth']} {zygocity_freq} + --sample-id {args[sample-id]} --normal-id {args[normal-id]} \ + --min-variant-depth {args[min-variant-depth]} {zygocity_freq} """.format( snakemake=snakemake, args=args, @@ -31,14 +31,14 @@ cmd = r""" cnvkit.py scatter \ -o {snakemake.output.plot} \ - --segment {args['segments']} \ + --segment {args[segments]} \ {chromosome} {gene} {range_list} \ - --width {args['width']} \ - --antitarget-marker {args['antitarget-marker']} --segment-color {args['segment-color']} \ - {by_bin} {trend} --title {args['title']} \ + --width {args[width]} \ + --antitarget-marker {args[antitarget-marker]} --segment-color {args[segment-color]} \ + {by_bin} {trend} --title "{args[title]}" \ {y_min} {y_max} {fig_size} \ {variants} \ - {args['ratios']} + {args[ratios]} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py index 24a3f5a7e..a50a32e7e 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py @@ -7,7 +7,7 @@ # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its # own conda environment which cannot see the snappy_pipeline installation. 
-base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")) sys.path.insert(0, base_dir) from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper @@ -17,8 +17,8 @@ cmd = r""" cnvkit.py genemetrics \ -o {snakemake.output.report} \ - --segment {args['segments']} \ - --threshold {args['threshold']} --min-probes {args['min-probes']} \ + --segment {args[segments]} \ + --threshold {args[threshold]} --min-probes {args[min-probes]} \ {drop_low_coverage} {male_reference} {sample_sex} {diploid_parx_genome} \ {stats} \ {args[ratios]} diff --git a/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py index fe421a339..d4a2fdc92 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py @@ -7,23 +7,25 @@ # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its # own conda environment which cannot see the snappy_pipeline installation. -base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")) sys.path.insert(0, base_dir) from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper args = snakemake.params.get("args", {}) +# As segment files can appear multiple times, the `--segment` argument must be the last one cmd = r""" cnvkit.py metrics \ -o {snakemake.output.report} \ - --segment {args['segments']} \ {drop_low_coverage} \ - {args[ratios]} + {args[ratios]} \ + --segment {segments} """.format( snakemake=snakemake, args=args, drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", + segments=" ".join(args["segments"]), ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py index d569588c2..76c6831e0 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py @@ -7,7 +7,7 @@ # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its # own conda environment which cannot see the snappy_pipeline installation. 
-base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..")) sys.path.insert(0, base_dir) from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper @@ -17,8 +17,8 @@ cmd = r""" cnvkit.py segmetrics \ -o {snakemake.output.report} \ - --segment {args['segments']} \ - --alpha {args['alpha']} --bootstrap {args['bootstrap']} {smooth_bootstrap} \ + --segment {args[segments]} \ + --alpha {args[alpha]} --bootstrap {args[bootstrap]} {smooth_bootstrap} \ {drop_low_coverage} \ {stats} \ {args[ratios]} diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py index 2c02aa03c..240cdfa28 100644 --- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py @@ -16,9 +16,9 @@ if "variants" in args: variants = r""" - ---vcf {args['variants']} \ - --sample-id {args['sample-id']} --normal-id {args['normal-id']} \ - {args['min-variant-depth']} {zygocity_freq} + ---vcf {args[variants]} \ + --sample-id {args[sample-id]} --normal-id {args[normal-id]} \ + {args[min-variant-depth]} {zygocity_freq} """.format( snakemake=snakemake, args=args, @@ -30,10 +30,10 @@ cmd = r""" cnvkit.py segment --processes {snakemake.resources._cores} \ -o {snakemake.output.segments} --dataframe {snakemake.output.dataframe} \ - --method {args['method']} --threshold {args['threshold']} {smooth_cbs} \ - {drop_low_coverage} --drop-outliers {args['drop-outliers']} \ + --method {args[method]} --threshold {args[threshold]} {smooth_cbs} \ + {drop_low_coverage} --drop-outliers {args[drop-outliers]} \ {variants} \ - {args[coverage]} + {args[ratios]} """.format( snakemake=snakemake, args=args, diff --git a/tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py similarity index 99% rename from tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py rename to tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py index de45fb4e1..fe954b333 100644 --- a/tests/snappy_pipeline/workflows/test_workflow_somatic_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py @@ -56,7 +56,7 @@ def minimal_config(): cnvkit: diploid_parx_genome: GRCh38 panel_of_normals: - enabled: False + source: paired somatic_variant_calling: enabled: True source: cohort @@ -464,7 +464,7 @@ def test_cnvkit_step_parts_get_output_files(somatic_cnv_calling_workflow): "segments": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segments.cns", "dataframe": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.rds", }, - "call": {"calls": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns"}, + "call": {"calls": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.calls.cns"}, "bintest": {"tests": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.bintest.cns"}, "metrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.metrics.tsv"}, "genemetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.genemetrics.tsv"}, @@ -537,7 +537,7 @@ def test_somatic_cnv_calling_workflow(somatic_cnv_calling_workflow): tpl = "output/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" expected += [ 
tpl.format(mapper=mapper, library_name=library_name, ext=ext) - for ext in ("cnr", "segments.cns", "cns", "bintest.cns") + for ext in ("cnr", "segments.cns", "calls.cns", "bintest.cns") for library_name in tumor_libraries for mapper in ("bwa",) ] From 1e434e1d6bd8df41e29212cd35bbcda9c765f4dd Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Thu, 28 Nov 2024 18:27:20 +0100 Subject: [PATCH 38/46] refactor: common functions to ignore contigs and for library kits --- snappy_pipeline/models/cnvkit.py | 8 +-- .../workflows/ngs_mapping/model.py | 2 +- snappy_wrappers/tools/genome_windows.py | 62 +++++++++++++++++++ tests/snappy_pipeline/workflows/conftest.py | 13 +++- 4 files changed, 79 insertions(+), 6 deletions(-) diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py index 837f8b984..8f9de72df 100644 --- a/snappy_pipeline/models/cnvkit.py +++ b/snappy_pipeline/models/cnvkit.py @@ -105,7 +105,7 @@ class Access(SnappyModel): parameter for WGS data, unless the accessible regions are much reduced (for example excluding all intergenic regions, repeats, low complexity, ...) """ - + exclude: list[str] = [] """Regions accessible to mapping""" min_gap_size: int | None = None @@ -115,7 +115,7 @@ class Access(SnappyModel): class Target(SnappyModel): split: bool = True """Split large tiled intervals into smaller, consecutive targets.""" - avg_size: float | None = None + avg_size: int | None = None """ Average size of split target bins (results are approximate). @@ -131,9 +131,9 @@ class Target(SnappyModel): class Antitarget(SnappyModel): - avg_size: float = 150000 + avg_size: int = 150000 """Average size of split antitarget bins (results are approximate)""" - min_size: float | None = None + min_size: int | None = None """Minimum size of antitarget bins (smaller regions are dropped). When missing, 1/16 avg size""" diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py index b6e05bada..a2641993b 100644 --- a/snappy_pipeline/workflows/ngs_mapping/model.py +++ b/snappy_pipeline/workflows/ngs_mapping/model.py @@ -7,7 +7,7 @@ from pydantic import Field, field_validator, model_validator from snappy_pipeline.models import EnumField, SizeString, SnappyModel, SnappyStepModel -from snappy_pipeline.models.library_kit import LibraryKit +from snappy_pipeline.models.common import LibraryKit class DnaMapper(Enum): diff --git a/snappy_wrappers/tools/genome_windows.py b/snappy_wrappers/tools/genome_windows.py index 906d32dc7..2236d6c42 100644 --- a/snappy_wrappers/tools/genome_windows.py +++ b/snappy_wrappers/tools/genome_windows.py @@ -12,8 +12,12 @@ import csv import fnmatch import os +import re import sys +from pathlib import Path + + # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its # own conda environment which cannot see the snappy_pipeline installation. 
@@ -32,6 +36,12 @@ #: Allowed values for ``--format`` CHOICES_FORMAT = ("regions", "bed") +#: Regular expression patterns to parse *.fai, *.genome, *.dict & fastq files +PATTERN_FAI = re.compile(r"^([^\s]+)\t([0-9]+)\t([0-9]+)\t([0-9]+)\t([0-9]+)\s*$") +PATTERN_GENOME = re.compile(r"^([^\s]+)\t([0-9]+)\s*$") +PATTERN_DICT = re.compile(r"^@SQ\tSN:([^\s]+)\tLN:([0-9]+).*$") +PATTERN_FASTA = re.compile(r"^\s*>\s*([^\s]+).*$") + def matches_any(query, patterns): for pattern in patterns: @@ -78,6 +88,58 @@ def yield_regions(fai_file, window_size, subtract_end=0, ignore_chroms=None, pad begin = end +def ignore_chroms(path_ref: str, ignored: set[str] = [], return_ignored: bool = False): + path_ref = Path(path_ref).resolve() + if Path(str(path_ref) + ".fai").exists(): + contigs = _parse_index(Path(str(path_ref) + ".fai"), PATTERN_FAI) + elif Path(str(path_ref) + ".genome").exists(): + contigs = _parse_index(Path(str(path_ref) + ".genome"), PATTERN_GENOME) + elif path_ref.with_suffix("dict").exists(): + contigs = _parse_index(path_ref.with_suffix("dict"), PATTERN_DICT, True) + else: + contigs = _read_fasta(path_ref) + for contig_name, contig_length in contigs: + m = matches_any(contig_name, ignored) + if (m and return_ignored) or (not m and not return_ignored): + yield contig_name, contig_length + + +def _parse_index(filename: Path, pattern: re.Pattern, allow_mismatch: bool = False): + with open(filename, "rt") as f: + for line in f: + line = line.strip() + if len(line) == 0 or line.startswith("#"): + continue + m = pattern.match(line) + if m: + groups = m.groups() + yield groups[0], int(groups[1]) + else: + if not allow_mismatch: + raise ValueError(f"Unexpected record '{line}' in reference file '{filename}'") + + +def _read_fasta(filename: Path): + contig_name = None + contig_length = None + with open(filename, "rt") as f: + for line in f: + line = line.strip() + if len(line) == 0 or line.startswith("#"): + continue + m = PATTERN_FASTA.match(line) + if m: + if contig_name: + yield contig_name, contig_length + groups = m.groups() + contig_name = groups[0] + contig_length = 0 + else: + contig_length += len(line) + assert contig_name is not None, f"No contig found in reference file {filename}" + yield contig_name, contig_length + + def run(args): """Main entry point after parsing command line arguments""" yielded = 0 diff --git a/tests/snappy_pipeline/workflows/conftest.py b/tests/snappy_pipeline/workflows/conftest.py index 4e7d9de5c..dabd97704 100644 --- a/tests/snappy_pipeline/workflows/conftest.py +++ b/tests/snappy_pipeline/workflows/conftest.py @@ -907,7 +907,7 @@ def cancer_sheet_fake_fs_path_link_in(fake_fs, cancer_sheet_tsv): @pytest.fixture -def autobin_result_fake_fs(fake_fs, cancer_sheet_tsv): +def autobin_result_calling_fake_fs(fake_fs, cancer_sheet_tsv): """Return fake file autobin.txt""" # Create work directory fake_fs.fs.makedirs("/work", exist_ok=True) @@ -923,6 +923,17 @@ def autobin_result_fake_fs(fake_fs, cancer_sheet_tsv): return fake_fs +@pytest.fixture +def autobin_result_pon_fake_fs(fake_fs, cancer_sheet_tsv): + """Return fake file autobin.txt""" + # Create work directory + fake_fs.fs.makedirs("/work", exist_ok=True) + # Create autobin result for the samples + tpl = "/{mapper}.cnvkit/out/{mapper}.cnvkit.autobin.txt" + fake_fs.fs.create_file(tpl.format(mapper="bwa"), create_missing_dirs=True) + return fake_fs + + @pytest.fixture def purity_result_fake_fs(fake_fs, cancer_sheet_tsv): """Return fake file purity.txt""" From b2be0a11d2c1a4f1c8f8794d74e62d1a71dde688 Mon Sep 17 
00:00:00 2001 From: Eric Blanc Date: Thu, 28 Nov 2024 18:30:31 +0100 Subject: [PATCH 39/46] feat: improved logic for cnvkit panel of normals & somatic cnv calling, with updated wrappers --- .../workflows/panel_of_normals/Snakefile | 52 +- .../workflows/panel_of_normals/__init__.py | 669 +++++++++--------- .../workflows/panel_of_normals/model.py | 133 +--- .../workflows/somatic_cnv_calling/__init__.py | 52 +- .../workflows/somatic_cnv_calling/model.py | 8 +- .../wrappers/cnvkit/access/wrapper.py | 15 +- .../wrappers/cnvkit/antitarget/wrapper.py | 4 +- .../wrappers/cnvkit/autobin/wrapper.py | 4 +- .../wrappers/cnvkit/call/wrapper.py | 3 +- .../wrappers/cnvkit/reference/wrapper.py | 6 +- .../cnvkit/report/genemetrics/wrapper.py | 4 +- .../wrappers/cnvkit/sex/environment.yaml | 1 + .../wrappers/cnvkit/sex/wrapper.py | 33 + .../wrappers/cnvkit/target/wrapper.py | 4 +- .../test_workflows_panel_of_normals.py | 575 +++++---------- .../test_workflows_somatic_cnv_calling.py | 23 +- 16 files changed, 670 insertions(+), 916 deletions(-) create mode 120000 snappy_wrappers/wrappers/cnvkit/sex/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/sex/wrapper.py diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index 923431644..e311589c3 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -101,30 +101,10 @@ rule panel_of_normals_mutect2_create_panel: # Panel of normals (cnvkit) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Write out access file (if required, must be run prior to the cnvkit panel of normals) - - -rule panel_of_normals_access_run: - output: - **wf.get_output_files("access", "run"), - resources: - time=wf.get_resource("access", "run", "time"), - memory=wf.get_resource("access", "run", "memory"), - partition=wf.get_resource("access", "run", "partition"), - log: - **wf.get_log_file("access", "run"), - params: - **wf.get_args("access", "run"), - wrapper: - wf.wrapper_path("cnvkit/access") - - # Write out the normals-only results for the normals -------------------------- rule panel_of_normals_cnvkit_access: - input: - unpack(wf.get_input_files("cnvkit", "access")), output: **wf.get_output_files("cnvkit", "access"), threads: wf.get_resource("cnvkit", "access", "threads") @@ -233,22 +213,22 @@ rule panel_of_normals_cnvkit_create_panel: wf.wrapper_path("cnvkit/reference") -# rule panel_of_normals_cnvkit_report: -# input: -# unpack(wf.get_input_files("cnvkit", "report")), -# output: -# **wf.get_output_files("cnvkit", "report"), -# threads: wf.get_resource("cnvkit", "report", "threads") -# resources: -# time=wf.get_resource("cnvkit", "report", "time"), -# memory=wf.get_resource("cnvkit", "report", "memory"), -# partition=wf.get_resource("cnvkit", "report", "partition"), -# log: -# **wf.get_log_file("cnvkit", "report"), -# params: -# **{"args": wf.get_args("cnvkit", "report")}, -# wrapper: -# wf.wrapper_path("cnvkit/report") +rule panel_of_normals_cnvkit_sex: + input: + unpack(wf.get_input_files("cnvkit", "sex")), + output: + **wf.get_output_files("cnvkit", "sex"), + threads: wf.get_resource("cnvkit", "sex", "threads") + resources: + time=wf.get_resource("cnvkit", "sex", "time"), + memory=wf.get_resource("cnvkit", "sex", "memory"), + partition=wf.get_resource("cnvkit", "sex", "partition"), + log: + **wf.get_log_file("cnvkit", "sex"), + params: + **{"args": wf.get_args("cnvkit", "sex")}, + wrapper: + 
wf.wrapper_path("cnvkit/sex")
 
 
 # Panel of normals (PureCN) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py
index 240643323..b919fd480 100644
--- a/snappy_pipeline/workflows/panel_of_normals/__init__.py
+++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py
@@ -169,9 +169,12 @@
 """
 
+import re
+
 from enum import StrEnum
 
 from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions
+from snakemake.io import Wildcards, InputFiles
 
 from snappy_pipeline.utils import dictify, listify
 from snappy_pipeline.workflows.abstract import (
@@ -182,12 +185,16 @@
 )
 from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow
 
+from snappy_pipeline.models.common import SexOrigin, SexValue
+
 from .model import PanelOfNormals as PanelOfNormalsConfigModel
+from .model import PureCn as PureCnConfig
+from .model import CnvKit as CnvkitConfig
 
 __author__ = "Manuel Holtgrewe <manuel.holtgrewe@bih-charite.de>"
 
 #: Names of the tools that might use panel of normals
-TOOLS = ("mutect2", "cnvkit", "access", "purecn")
+TOOLS = ("mutect2", "cnvkit", "purecn")
 
 #: Default configuration for the somatic_variant_calling schema
 DEFAULT_CONFIG = PanelOfNormalsConfigModel.default_config_yaml_string()
@@ -214,18 +221,26 @@ def __init__(self, parent):
         super().__init__(parent)
         # Build shortcut from cancer bio sample name to matched cancer sample
         known_libraries = self._get_normal_libraries()
-        self.normal_libraries = list(known_libraries.keys())
+        self.normal_libraries = known_libraries
         if self.name and (cfg := self.config.get(self.name)):
             if path := cfg.get("path_normals_list"):
-                self.normal_libraries = []
+                self.normal_libraries = {}
                 with open(path, "rt") as f:
                     for line in f:
                         if line.startswith("#"):
                             continue
-                        self.normal_libraries.append(line.strip())
-        self.libraryType, self.libraryKit = self._validate_normal_libraries(known_libraries)
-
-    def _get_normal_libraries(self):
+                        library_name = line.strip()
+                        assert (
+                            library_name in known_libraries.keys()
+                        ), f"Unknown requested library {library_name}"
+                        self.normal_libraries[library_name] = known_libraries[library_name]
+        self.libraryType, self.libraryKit = self._validate_normal_libraries()
+
+        self.ignored = []
+        if len(self.config.get("ignore_chroms", [])) > 0:
+            self.ignored += self.config.ignore_chroms
+
+    def _get_normal_libraries(self) -> dict[str, dict[str, str]]:
         normal_libraries = {}
         for sheet in self.parent.shortcut_sheets:
             for donor in sheet.donors:
@@ -239,34 +254,36 @@ def _get_normal_libraries(self):
                     normal_libraries[library.name] = self._get_extra_info(library)
         return normal_libraries
 
-    def _validate_normal_libraries(self, known_libraries):
+    def _validate_normal_libraries(self) -> tuple[str, str]:
+        libraries = self.normal_libraries
         libraryType = None
         libraryKit = None
-        for library in self.normal_libraries:
-            assert (
-                library in known_libraries
-            ), f"Unknown normal library {library} requested to build pon"
+        for library in libraries:
             assert (
-                libraryType is None or libraryType == known_libraries[library]["libraryType"]
+                libraryType is None or libraryType == libraries[library]["libraryType"]
             ), "Panel of normal cannot be built from multiple library types"
-            libraryType = known_libraries[library]["libraryType"]
+            libraryType = libraries[library]["libraryType"]
             if libraryType == LibraryType.WES:
                 assert (
-                    libraryKit is None or libraryKit == known_libraries[library]["libraryKit"]
+                    libraryKit is None or libraryKit == 
libraries[library]["libraryKit"] ), "Panel of normal cannot be built from multiple library kits" - libraryKit = known_libraries[library]["libraryKit"] + libraryKit = libraries[library]["libraryKit"] return (libraryType, libraryKit) @staticmethod - def _get_extra_info(library): + def _get_extra_info(library) -> dict[str, str]: extra_info = {} assert "libraryType" in library.extra_infos, f"Undefined type of library {library.name}" - extra_info["libraryType"] = library.extra_infos.get("libraryType", "Illumina") + assert ( + library.extra_infos.get("libraryType") in LibraryType + ), f"Unknown library type {library.extra_infos.get('libraryType')}" + extra_info["libraryType"] = library.extra_infos.get("libraryType") if extra_info["libraryType"] == LibraryType.WES: assert ( "libraryKit" in library.extra_infos ), f"Undefined exome kit for library {library.name}" extra_info["libraryKit"] = library.extra_infos.get("libraryKit", "__default__") + extra_info["sex"] = library.parent.parent.parent.extra_infos.get("sex", None) return extra_info @staticmethod @@ -292,26 +309,10 @@ class PureCnStepPart(PanelOfNormalsStepPart): #: Resources resource_usage = { - "install": ResourceUsage( - threads=1, - time="01:00:00", - memory="24G", - ), - "prepare": ResourceUsage( - threads=1, - time="04:00:00", # 4 hours - memory="24G", - ), - "coverage": ResourceUsage( - threads=1, - time="04:00:00", # 4 hours - memory="24G", - ), - "create_panel": ResourceUsage( - threads=1, - time="12:00:00", # 12 hours - memory="32G", - ), + "install": ResourceUsage(threads=1, time="01:00:00", memory="24G"), + "prepare": ResourceUsage(threads=1, time="04:00:00", memory="24G"), + "coverage": ResourceUsage(threads=1, time="04:00:00", memory="24G"), + "create_panel": ResourceUsage(threads=1, time="12:00:00", memory="32G"), } def get_input_files(self, action): @@ -330,7 +331,7 @@ def _get_input_files_coverage(self, wildcards): yield ( "intervals", "work/purecn/out/{}_{}.list".format( - self.config.purecn.enrichment_kit_name, + self.config.purecn.path_target_interval_list_mapping[0].name, self.config.purecn.genome_name, ), ) @@ -345,7 +346,7 @@ def _get_input_files_create(self, wildcards): "normals", [ tpl.format(mapper=wildcards.mapper, library_name=lib) - for lib in self.normal_libraries + for lib in self.normal_libraries.keys() ], ) @@ -358,7 +359,7 @@ def get_output_files(self, action): return {"container": "work/containers/out/purecn.simg"} if action == "prepare": base_out = "{}_{}".format( - self.config.purecn.enrichment_kit_name, + self.config.purecn.path_target_interval_list_mapping[0].name, self.config.purecn.genome_name, ) return { @@ -391,7 +392,7 @@ def get_log_file(self, action): tpls = { "install": "work/containers/log/purecn", "prepare": "work/purecn/log/{}_{}".format( - self.config.purecn.enrichment_kit_name, + self.config.purecn.path_target_interval_list_mapping[0].name, self.config.purecn.genome_name, ), "coverage": "work/{mapper}.purecn/log/{mapper}.purecn.{library_name,.+-DNA[0-9]+-WES[0-9]+}", @@ -412,16 +413,8 @@ class Mutect2StepPart(PanelOfNormalsStepPart): #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). 
resource_usage = { - "prepare_panel": ResourceUsage( - threads=2, - time="3-00:00:00", # 3 days - memory="8G", - ), - "create_panel": ResourceUsage( - threads=2, - time="48:00:00", # 48 hours - memory="30G", - ), + "prepare_panel": ResourceUsage(threads=2, time="3-00:00:00", memory="8G"), + "create_panel": ResourceUsage(threads=2, time="48:00:00", memory="30G"), } def check_config(self): @@ -454,7 +447,7 @@ def _get_input_files_create_panel(self, wildcards): """Helper wrapper function for merging individual results & panel creation""" paths = [] tpl = "work/{mapper}.{tool}/out/{mapper}.{tool}.{normal_library}.prepare.vcf.gz" - for normal in self.normal_libraries: + for normal in self.normal_libraries.keys(): paths.append(tpl.format(normal_library=normal, tool=self.name, **wildcards)) return {"normals": paths} @@ -503,318 +496,347 @@ class CnvkitStepPart(PanelOfNormalsStepPart): "antitarget", "coverage", "create_panel", - "report", + "sex", ) - #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). - resource_usage = { - "target": ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="8G", - ), - "antitarget": ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="8G", - ), - "coverage": ResourceUsage( - threads=8, - time="02:00:00", # 2 hours - memory="16G", - ), - "create_panel": ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="16G", - ), - "report": ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="16G", - ), - } + # Overwrite defaults + default_resource_usage = ResourceUsage(threads=1, time="03:59:59", memory="7680M") # 4h + resource_usage = {"coverage": ResourceUsage(threads=8, time="11:59:59", memory="7680M")} def __init__(self, parent): super().__init__(parent) - def check_config(self): - if self.name not in self.config.tools: - return # cnvkit not enabled, skip - self.parent.ensure_w_config( - ("static_data_config", "reference", "path"), - "Path to reference FASTA not configured but required for %s" % (self.name,), + if self.name in self.config.tools: + self.is_wgs = self.libraryType == LibraryType.WGS + self.is_wes = self.libraryType == LibraryType.WES + + self.cfg: CnvkitConfig = self.config.get(self.name) + + self.ignored += self.cfg.ignore_chroms + self.ignored = set(self.ignored) + + self._set_cnvkit_pipeline_logic() + + self.path_baits = self._get_path_baits() + + self.base_out = "work/{mapper}.cnvkit/out/{mapper}.cnvkit." + self.base_out_lib = ( + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}." 
+        )
+
+    def _set_cnvkit_pipeline_logic(self):
+        """
+        Creates instance variables to choose the path through the cnvkit pipeline
+
+        Access: regions accessible for CNV calling (unmasked)
+            path_access or, when missing, built from the genome reference + optional list of excluded regions
+
+        Target: regions of good coverage
+            From baits (WES) or accessible regions (WGS) + estimate of target size from config or autobin step
+
+        Antitarget: regions of low coverage
+            antitarget = access - target, only WES, otherwise empty
+
+        Reference:
+            Flat: based on targets & antitargets only
+            Cohort: from panel_of_normals step
+            File: from another cohort or public data (reference + target + antitarget [WES only])
+            Paired: reference built from the target & antitarget coverage of one normal sample only (paired with the tumor)
+        """
+        self.compute_avg_target_size = self.is_wgs and self.cfg.target.avg_size is None
+        self.create_access = not self.cfg.path_access
+        self.plain_access = (
+            not self.cfg.path_access
+            and len(self.cfg.access.exclude) == 0
+            and self.cfg.access.min_gap_size is None
         )
 
+    def _get_cohort_sex(self) -> SexValue | None:
+        match self.cfg.sample_sex.source:
+            case SexOrigin.CONFIG:
+                return self.cfg.sample_sex.default
+            case SexOrigin.AUTOMATIC:
+                return None
+            case SexOrigin.SAMPLESHEET:
+                sex = None
+                for library, extra_info in self.normal_libraries.items():
+                    library_sex = extra_info.get("sex", None)
+                    assert library_sex is not None, f"Sex of library {library} not defined in samplesheet"
+                    if sex is None:
+                        sex = SexValue(library_sex)
+                    else:
+                        assert sex == SexValue(
+                            library_sex
+                        ), "Multiple sex in the cohort, use 'auto' in sex source"
+                return sex
+
+    def _get_path_baits(self) -> str | None:
+        if not self.is_wes:
+            return None
+        default = None
+        for item in self.cfg.path_target_interval_list_mapping:
+            if item.name == self.libraryKit:
+                return item.path
+            elif item.name == "__default__":
+                default = item.path
+        if default is None:
+            raise ValueError(f"Missing library kit definition for {self.libraryKit}")
+        return default
+
     def get_input_files(self, action):
         """Return input files for cnvkit panel of normals creation"""
         # Validate action
         self._validate_action(action)
-        mapping = {
-            "access": self._get_input_files_access,
-            "autobin": self._get_input_files_autobin,
-            "target": self._get_input_files_target,
-            "antitarget": self._get_input_files_antitarget,
-            "coverage": self._get_input_files_coverage,
-            "create_panel": self._get_input_files_create_panel,
-        }
-        return mapping[action]
+        return getattr(self, "_get_input_files_{}".format(action.replace("/", "_")))
 
     def get_args(self, action):
-        """Return panel of normal files"""
-        if action == "access":
-            return self._get_args_access
-        elif action == "autobin":
-            return self._get_args_autobin
-        elif action == "target":
-            return self._get_args_target
-        elif action == "antitarget":
-            return self._get_args_antitarget
-        elif action == "coverage":
-            return self._get_args_coverage
-        elif action == "create_panel":
-            return self._get_args_create_panel
-        else:
-            self._validate_action(action)
+        """Return parameters input function, dependent on rule"""
+        # Validate action
+        self._validate_action(action)
+        return getattr(self, "_get_args_{}".format(action.replace("/", "_")))
 
+    @dictify
     def get_output_files(self, action):
-        """Return panel of normal files"""
-        output_files = None
-        if action == "access":
-            output_files = self._get_output_files_access()
-        elif action == "autobin":
-            output_files = self._get_output_files_autobin()
-        elif action == "target":
- output_files = self._get_output_files_target() - elif action == "antitarget": - output_files = self._get_output_files_antitarget() - elif action == "coverage": - output_files = self._get_output_files_coverage() - elif action == "create_panel": - output_files = self._get_output_files_create_panel() + """Return panel of normal output files""" + self._validate_action(action) + output_files = {} + match action: + case "access": + output_files = {"access": self.base_out + "access.bed"} + case "autobin": + output_files = {"result": self.base_out + "autobin.txt"} + case "target": + output_files = {"target": self.base_out + "target.bed"} + case "antitarget": + output_files = {"antitarget": self.base_out + "antitarget.bed"} + case "coverage": + output_files = {"coverage": self.base_out_lib + "{region,(target|antitarget)}.cnn"} + case "create_panel": + output_files = {"reference": self.base_out + "panel_of_normals.cnn"} + case "sex": + output_files = {"sex": self.base_out + "sex.tsv"} + + for k, v in output_files.items(): + yield k, v + yield k + "_md5", v + ".md5" + + @dictify + def get_log_file(self, action): + """Return panel of normal log files""" + # Validate action + self._validate_action(action) + + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit." + base_log_lib = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}." + if action in ("access", "autobin", "target", "antitarget", "create_panel", "sex"): + tpl = base_log + action + elif action in ("coverage",): + tpl = base_log_lib + "{region,(target|antitarget)}" else: - self._validate_action(action) - return dict( - zip( - list(output_files.keys()) + [k + "_md5" for k in output_files.keys()], - list(output_files.values()) + [v + ".md5" for v in output_files.values()], - ) - ) + raise ValueError(f"Logs of action '{action}' not implemented yet") - @classmethod - def get_log_file(cls, action): - """Return panel of normal files""" - tpls = { - "access": "work/{mapper}.cnvkit/log/cnvkit.access", - "autobin": "work/{mapper}.cnvkit/log/cnvkit.autobin", - "target": "work/{mapper}.cnvkit/log/cnvkit.target", - "antitarget": "work/{mapper}.cnvkit/log/cnvkit.antitarget", - "coverage": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage", - "create_panel": "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals", - } - assert action in cls.actions - return cls._get_log_file(tpls[action], has_sh=True) + for key, ext in ( + ("conda_list", ".conda_list.txt"), + ("conda_info", ".conda_info.txt"), + ("log", ".log"), + ("sh", ".sh"), + ): + yield key, tpl + ext + yield key + "_md5", tpl + ext + ".md5" + + @listify + def get_result_files(self) -> list[str]: + if self.name not in self.config.tools: + return [] + + result_files = [] + + result_files += list(self.get_output_files("create_panel").values()) + result_files += list(self.get_log_file("create_panel").values()) + + result_files += list(self.get_output_files("target").values()) + result_files += list(self.get_log_file("target").values()) + + if self.libraryType == LibraryType.WES: + result_files += list(self.get_output_files("antitarget").values()) + result_files += list(self.get_log_file("antitarget").values()) + + result_files += list(self.get_output_files("sex").values()) + result_files += list(self.get_log_file("sex").values()) - def _get_input_files_access(self, wildcards): + return filter(lambda x: not x.endswith(".md5"), result_files) + + def _get_input_files_access(self, wildcards: Wildcards) -> dict[str, str]: + assert self.create_access, 
"Access shouldn't be created, already available" return {} - def _get_args_access(self, wildcards): - return { + def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + assert self.create_access, "Access shouldn't be created, already available" + return dict(input) | { "reference": self.w_config.static_data_config.reference.path, - "min_gap_size": self.config.cnvkit.min_gap_size, + "min-gap-size": self.cfg.access.min_gap_size, + "exclude": self.cfg.access.exclude, + "ignore_chroms": list(self.ignored), } - def _get_output_files_access(self): - return {"access": "work/{mapper}.cnvkit/out/cnvkit.access.bed"} - - def _get_input_files_autobin(self, wildcards): + def _get_input_files_autobin(self, wildcards: Wildcards) -> dict[str, str]: assert ( - self.libraryType == LibraryType.WGS + self.compute_avg_target_size ), "Trying to estimate average target size for non-WGS samples" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" - return { + input_files = { "bams": [ ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) - for x in self.normal_libraries - ], - "access": "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards), + for x in self.normal_libraries.keys() + ] } + if self.create_access: + if self.plain_access: + input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + else: + input_files["target"] = self.base_out.format(**wildcards) + "access.bed" + return input_files - def _get_args_autobin(self, wildcards): + def _get_args_autobin(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: assert ( - self.libraryType == LibraryType.WGS + self.compute_avg_target_size ), "Trying to estimate average target size for non-WGS samples" - return {"method": "wgs", "bp_per_bin": 50000} - - def _get_output_files_autobin(self): - return {"result": "work/{mapper}.cnvkit/out/cnvkit.autobin.txt"} + args = dict(input) | {"bp-per-bin": 50000} + if self.plain_access: + args["method"] = "wgs" + else: + args["method"] = "amplicon" + if "target" not in args: + args["target"] = self.cfg.path_access + return args - def _get_input_files_target(self, wildcards): + def _get_input_files_target(self, wildcards: Wildcards) -> dict[str, str]: """Helper wrapper function to estimate target average size in wgs mode""" input_files = {} - if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") == "": - input_files["access"] = "work/{mapper}.cnvkit/out/cnvkit.access.bed".format(**wildcards) - if self.config.cnvkit.get("target_avg_size", None) is None: - input_files["avg_size"] = "work/{mapper}.cnvkit/out/cnvkit.autobin.txt".format( - **wildcards - ) + if self.is_wgs: + if self.create_access: + input_files["interval"] = self.base_out.format(**wildcards) + "access.bed" + if self.compute_avg_target_size: + input_files["avg-size"] = self.base_out.format(**wildcards) + "autobin.txt" return input_files - def _get_args_target(self, wildcards): - params = {} - if self.name in self.config.tools: - if self.libraryType == LibraryType.WES: - params["target"] = self.config.cnvkit.path_target_regions - if self.libraryType == LibraryType.WGS and self.config.cnvkit.get("access", "") != "": - params["target"] = self.config.cnvkit.get("access") - if self.w_config.static_data_config.get("features", None): - params["annotate"] = self.w_config.static_data_config.features.path - if self.config.cnvkit.get("split", True): - params["split"] = True - if 
self.config.cnvkit.get("target_avg_size", None): - params["avg_size"] = self.config.cnvkit.get("target_avg_size") - return params - - def _get_output_files_target(self): - return {"target": "work/{mapper}.cnvkit/out/cnvkit.target.bed"} - - def _get_input_files_antitarget(self, wildcards): - """Helper wrapper function for computing antitarget locations""" - if self.libraryType == LibraryType.WGS: - return {} - return { - "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed".format(**wildcards), - } - - def _get_args_antitarget(self, wildcards): - params = {} - if self.name in self.config.tools: - params = { - "avg_size": self.config.cnvkit.antitarget_avg_size, - "min_size": self.config.cnvkit.min_size, + def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + if self.libraryType == LibraryType.WES: + args = { + "avg-size": self.cfg.target.avg_size, + "split": self.cfg.target.split, + "interval": self.path_baits, } - if self.config.cnvkit.get("access", "") != "": - params["access"] = self.config.cnvkit.get("access") - return params + else: + assert self.is_wgs, "Panel not implemented yet" + args = dict(input) | {"split": self.cfg.target.split} + if args.get("avg-size", None) is not None: + args["avg-size"] = self._read_autobin_output(args["avg-size"]) + elif self.cfg.target.avg_size is not None: + args["avg-size"] = self.cfg.target.avg_size + else: + args["avg-size"] = 5000 + if self.w_config.static_data_config.get("features", None): + args["annotate"] = self.w_config.static_data_config.features.path + args["short-names"] = self.cfg.target.short_names + return args + + def _get_input_files_antitarget(self, wildcards: Wildcards) -> dict[str, str]: + input_files = {"target": self.base_out.format(**wildcards) + "target.bed"} + if self.create_access: + input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + return input_files - def _get_output_files_antitarget(self): - return {"antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed"} + def _get_args_antitarget(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "avg-size": self.cfg.antitarget.avg_size, + "min-size": self.cfg.antitarget.min_size, + } + if "access" not in args: + args["access"] = self.cfg.path_access + return args - def _get_input_files_coverage(self, wildcards): + def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: """Helper wrapper function for computing coverage""" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" + tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}.bam" bam = ngs_mapping(tpl.format(**wildcards)) return { - "intervals": "work/{mapper}.cnvkit/out/cnvkit.{interval}.bed".format(**wildcards), + "intervals": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{region}.bed".format( + **wildcards + ), "bam": bam, "bai": bam + ".bai", } - def _get_args_coverage(self, wildcards): - params = {} - if self.name in self.config.tools: - params = { - "reference": self.w_config.static_data_config.reference.path, - "min_mapq": self.config.cnvkit.min_mapq, - } - if self.config.cnvkit.get("count", False): - params["count"] = True - return params - - def _get_output_files_coverage(self): - return { - "coverage": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn", + def _get_args_coverage(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + return dict(input) | { + "reference": 
self.w_config.static_data_config.reference.path, + "min-mapq": self.cfg.coverage.min_mapq, + "count": self.cfg.coverage.count, } - def _get_input_files_create_panel(self, wildcards): - tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn" + def _get_input_files_create_panel(self, wildcards: Wildcards) -> dict[str, str]: + tpl = self.base_out_lib + "target.cnn" targets = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries + tpl.format(mapper=wildcards["mapper"], library_name=x) + for x in self.normal_libraries.keys() ] if self.libraryType == LibraryType.WES: - tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" + tpl = self.base_out_lib + "antitarget.cnn" antitargets = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x) - for x in self.normal_libraries + tpl.format(mapper=wildcards["mapper"], library_name=x) + for x in self.normal_libraries.keys() ] else: antitargets = [] return {"normals": targets + antitargets} - def _get_args_create_panel(self, wildcards): - params = {} - if self.name in self.config.tools: - params = { - "reference": self.w_config.static_data_config.reference.path, - } - if self.config.cnvkit.get("cluster", False): - params["cluster"] = True - params["min_cluster_size"] = self.config.cnvkit.min_cluster_size - if self.config.cnvkit.get("sample_sex"): - params["sample_sex"] = self.config.cnvkit.sample_sex - if self.config.cnvkit.get("male_reference", False): - params["male_reference"] = True - if self.config.cnvkit.get("diploid_parx_genome", None): - params["diploid_parx_genome"] = self.config.cnvkit.get("diploid_parx_genome") - if not self.config.cnvkit.get("gc_correction", True): - params["no_gc"] = True - if not self.config.cnvkit.get("rmask_correction", True): - params["no_rmask"] = True - if self.config.cnvkit.get("edge_correction", None) is None: - if self.libraryType != LibraryType.WES: - params["no_edge"] = True - elif not self.config.cnvkit.get("edge_correction"): - params["no_edge"] = True - return params - - def _get_output_files_create_panel(self): - return {"panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn"} - - -class AccessStepPart(PanelOfNormalsStepPart): - """Utility to create access file for cnvkit""" - - name = "access" - actions = ("run",) - - def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: - # Validate action - self._validate_action(action) - return ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="8G", - ) - - def get_input_files(self, action): - # Validate action - self._validate_action(action) - return None - - def get_output_files(self, action): - # Validate action - self._validate_action(action) - tpl = "work/access/out/access.bed" - return {"access": tpl, "access_md5": tpl + ".md5"} + def _get_args_create_panel(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + args = dict(input) | { + "reference": self.w_config.static_data_config.reference.path, + "cluster": self.cfg.cluster, + "no-gc": not self.cfg.gc, + "no-rmask": not self.cfg.rmask, + "no-edge": not self.cfg.get("edge", self.is_wes), + "diploid-parx-genome": self.cfg.diploid_parx_genome, + } + if self.cfg.cluster: + args["min-cluster-size"] = self.cfg.min_cluster_size + sample_sex = self._get_cohort_sex() + if sample_sex is not None: + args["sample-sex"] = str(sample_sex) + args["male-reference"] = self.cfg.male_reference + return args + + def _get_input_files_sex(self, wildcards: Wildcards) -> 
dict[str, str]: + tpl = self.base_out_lib + "target.cnn" + coverages = [ + tpl.format(mapper=wildcards["mapper"], library_name=x) + for x in self.normal_libraries.keys() + ] + if self.is_wes: + tpl = self.base_out_lib + "antitarget.cnn" + coverages += [ + tpl.format(mapper=wildcards["mapper"], library_name=x) + for x in self.normal_libraries.keys() + ] + return {"coverages": coverages} - def get_args(self, action): - # Validate action - self._validate_action(action) - if self.name in self.config.tools: - return { - "reference": self.w_config.static_data_config.reference.path, - "min_gap_size": self.config.access.min_gap_size, - "exclude": self.config.access.exclude, - } - return {} + def _get_args_sex(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + return dict(input) | {"diploid-parx-genome": self.cfg.diploid_parx_genome} - @classmethod - def get_log_file(cls, action): - """Return log files""" - assert action in cls.actions - return cls._get_log_file("work/access/log/access", has_sh=True) + def _read_autobin_output(self, filename: str) -> int: + nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)" + pattern = re.compile("^Target:[ \t]+" + nb + "[ \t]+" + nb + "$") + with open(filename) as f: + for line in f: + m = pattern.match(line) + if m: + return int(float(m.groups()[4])) + return -1 class PanelOfNormalsWorkflow(BaseStep): @@ -852,7 +874,6 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ( Mutect2StepPart, CnvkitStepPart, - AccessStepPart, PureCnStepPart, LinkOutStepPart, ) @@ -879,39 +900,10 @@ def get_result_files(self): result_files.extend(self._expand_result_files(tpl, log_ext_list)) if "cnvkit" in set(self.config.tools) & set(TOOLS): - tpls = [ - ("output/{mapper}.cnvkit/out/cnvkit.target.{ext}", ("bed",)), - ("output/{mapper}.cnvkit/out/cnvkit.antitarget.{ext}", ("bed",)), - ( - "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}", - ("cnn",), - ), - # ( - # "output/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.{ext}", - # ("tsv", "tsv.md5"), - # ), - # ( - # "output/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.{ext}", - # ("tsv", "tsv.md5"), - # ), - ] - for tpl, ext_list in tpls: - result_files.extend(self._expand_result_files(tpl, ext_list)) - tpls = [ - "output/{mapper}.cnvkit/log/cnvkit.target.{ext}", - "output/{mapper}.cnvkit/log/cnvkit.antitarget.{ext}", - "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.{ext}", - ] - for tpl in tpls: - result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh"])) - # tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{ext}" - # result_files.extend(self._expand_result_files(tpl, ("", ".md5"))) - - if "access" in set(self.config.tools) & set(TOOLS): - tpl = "output/access/out/access.bed" - result_files.extend([tpl + md5 for md5 in ("", ".md5")]) - tpl = "output/access/log/access.{ext}" - result_files.extend(self._expand_result_files(tpl, log_ext_list + ["sh"])) + cnvkit_files = self.sub_steps["cnvkit"].get_result_files() + for work in cnvkit_files: + output = work.replace("work/", "output/", 1) + result_files.extend(self._expand_result_files(output, ("",))) if "purecn" in set(self.config.tools) & set(TOOLS): tpl = "output/{mapper}.purecn/out/{mapper}.purecn.panel_of_normals.{ext}" @@ -923,13 +915,14 @@ def get_result_files(self): tpl = "output/{mapper}.purecn/log/{mapper}.purecn.panel_of_normals.{ext}" result_files.extend(self._expand_result_files(tpl, log_ext_list)) tpl = "output/purecn/out/{}_{}.{{ext}}".format( - 
self.config.purecn.enrichment_kit_name, + # TODO: select enrichment kit + self.config.purecn.path_target_interval_list_mapping[0].name, self.config.purecn.genome_name, ) ext_list = ("list", "bed.gz", "bed.gz.tbi") result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/purecn/log/{}_{}.{{ext}}".format( - self.config.purecn.enrichment_kit_name, + self.config.purecn.path_target_interval_list_mapping[0].name, self.config.purecn.genome_name, ) result_files.extend(self._expand_result_files(tpl, log_ext_list)) diff --git a/snappy_pipeline/workflows/panel_of_normals/model.py b/snappy_pipeline/workflows/panel_of_normals/model.py index 802b5e417..59684d40d 100644 --- a/snappy_pipeline/workflows/panel_of_normals/model.py +++ b/snappy_pipeline/workflows/panel_of_normals/model.py @@ -3,7 +3,10 @@ from pydantic import Field -from snappy_pipeline.models import EnumField, KeepTmpdir, SnappyModel, SnappyStepModel, validators +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, Parallel, validators +from snappy_pipeline.models.common import LibraryKitEntry, Sex +from snappy_pipeline.models.cnvkit import CnvkitToReference as CnvkitGeneric +from snappy_pipeline.models.mutect2 import Mutect2 as Mutect2Generic class Tool(enum.StrEnum): @@ -13,122 +16,32 @@ class Tool(enum.StrEnum): access = "access" -class Mutect2(SnappyModel): +class Mutect2(Parallel, Mutect2Generic): path_normals_list: str = "" - - germline_resource: str + """Optional file listing libraries to include in panel""" java_options: str = " -Xmx16g" + """Optional java run-time options""" - num_cores: int = 2 - """number of cores to use locally""" - - window_length: int = 100000000 - """split input into windows of this size, each triggers a job""" - - num_jobs: int = 500 - """number of windows to process in parallel""" - - use_profile: bool = True - """use Snakemake profile for parallel processing""" - - restart_times: int = 5 - """number of times to re-launch jobs in case of failure""" - - max_jobs_per_second: int = 2 - """throttling of job creation""" - - max_status_checks_per_second: int = 10 - """throttling of status checks""" - - debug_trunc_tokens: int = 0 - """truncation to first N tokens (0 for none)""" - - keep_tmpdir: KeepTmpdir = KeepTmpdir.never - """keep temporary directory, {always, never, onerror}""" - job_mult_memory: float = 1 - """memory multiplier""" - - job_mult_time: float = 1 - """running time multiplier""" - - merge_mult_memory: float = 1 - """memory multiplier for merging""" - - merge_mult_time: float = 1 - """running time multiplier for merging""" - - -class CnvkitSex(enum.StrEnum): - MALE = "male" - FEMALE = "female" - - -class CnvKit(SnappyModel): - path_normals_list: str = "" +class CnvKit(CnvkitGeneric): + path_normals_list: str | None = None """Optional file listing libraries to include in panel""" - path_target_regions: str = "" - """Bed files of targetted regions (Missing when creating a panel of normals for WGS data)""" - - access: str = "" - """Access bed file (output/cnvkit.access/out/cnvkit.access.bed when create_cnvkit_acces was run)""" - - min_gap_size: int = 5000 - """[access] Minimum gap size between accessible regions""" - - target_avg_size: int | None = None - """[target] Average size of split target bins (None: use default value, or use autobin for wgs)""" - - split: bool = True - """[target] Split large intervals into smaller ones""" - - bp_per_bin: int = 50000 - """[autobin] Expected base per bin""" - - antitarget_avg_size: int = 0 - """[antitarget] Average 
size of antitarget bins (0: use default value)""" - - min_size: int = 0 - """[antitarget] Min size of antitarget bins (0: use default value)""" - - min_mapq: int = 0 - """[coverage] Mininum mapping quality score to count a read for coverage depth""" - - count: bool = False - """[coverage] Alternative couting algorithm""" - - min_cluster_size: int = 0 - """[reference] Minimum cluster size to keep in reference profiles. 0 for no clustering""" - - sample_sex: CnvkitSex | None = None - """[reference] Specify the chromosomal sex of all given samples as male or female. Guess when missing""" - - male_reference: bool = False - """[reference & sex] Create male reference""" - - gc_correction: bool = True - """[reference] Use GC correction""" - - edge_correction: bool | None = None - """[reference] Use edge correction (automatic when None, edge correction for WES only)""" - - rmask_correction: bool = True - """[reference] Use rmask correction""" - - drop_low_coverage: bool = False - """[metrics] Drop very-low-coverage bins before calculations""" - + path_target_interval_list_mapping: list[LibraryKitEntry] = [] + """ + Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect), + recommended by PureCN author + """ -class Access(SnappyModel): - """Creates access file for cnvkit, based on genomic sequence & excluded regions (optionally)""" + sample_sex: Sex = Sex() + """Sets the sex of all normals used in the panel""" - exclude: list[str] = [] - """[access] Bed file of regions to exclude (mappability, blacklisted, ...)""" + path_access: str | None = None + """Overrides access when not None""" - min_gap_size: int = 0 - """[access] Minimum gap size between accessible sequence regions (0: use default value)""" + ignore_chroms: list[str] = [] + """Additional contigs to ignore""" class GenomeName(enum.StrEnum): @@ -147,7 +60,7 @@ class PureCn(SnappyModel): path_normals_list: str = "" """Optional file listing libraries to include in panel""" - path_bait_regions: str + # targets_definition: list[LibraryKitEntry] = [] """ Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect), recommended by PureCN author @@ -161,7 +74,7 @@ class PureCn(SnappyModel): EnumField(GenomeName, json_schema_extra={"options": {"unknown"}}), ] = "unknown" - enrichment_kit_name: str = "unknown" + path_target_interval_list_mapping: list[LibraryKitEntry] = [] """For filename only...""" mappability: str = "" @@ -212,6 +125,4 @@ class PanelOfNormals(SnappyStepModel, validators.ToolsMixin): cnvkit: CnvKit | None = None - access: Access | None = None - purecn: PureCn | None = None diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py index 84eba37a8..1e5a4d097 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py @@ -221,6 +221,10 @@ class SomaticCnvCallingStepPart(BaseStepPart): def __init__(self, parent: "SomaticCnvCallingWorkflow"): super().__init__(parent) + self.ignored = [] + if len(self.config.get("ignore_chroms", [])) > 0: + self.ignored += self.config.ignore_chroms + class CnvKitStepPart(SomaticCnvCallingStepPart): """Perform somatic targeted CNV calling using cnvkit""" @@ -248,6 +252,10 @@ class CnvKitStepPart(SomaticCnvCallingStepPart): # Overwrite defaults default_resource_usage = ResourceUsage(threads=1, time="03:59:59", memory="7680M") # 4h + resource_usage = { + "coverage": ResourceUsage(threads=8, time="11:59:59", memory="7680M"), + 
"segment": ResourceUsage(threads=8, time="11:59:59", memory="7680M"), + } def __init__(self, parent: SomaticCnvCallingStepPart): super().__init__(parent) @@ -273,6 +281,9 @@ def __init__(self, parent: SomaticCnvCallingStepPart): self.cfg: CnvkitConfig = self.config.get(self.name) self.pon_source = self.cfg.panel_of_normals.source + self.ignored += self.cfg.ignore_chroms + self.ignored = set(self.ignored) + self._set_cnvkit_pipeline_logic() self.path_baits = self._get_path_baits() @@ -285,7 +296,7 @@ def __init__(self, parent: SomaticCnvCallingStepPart): [x.purity is None for x in self.tumors.values()] ), "Missing purity value from samplesheet" - self.base_out = "work/{mapper}.cnvkit/out/cnvkit." + self.base_out = "work/{mapper}.cnvkit/out/{mapper}.cnvkit." self.base_out_lib = ( "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}." ) @@ -440,7 +451,7 @@ def get_log_file(self, action): # Validate action self._validate_action(action) - base_log = "work/{mapper}.cnvkit/log/cnvkit." + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit." base_log_lib = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}." if action in ("access", "antitarget"): @@ -484,6 +495,9 @@ def get_log_file(self, action): def get_result_files(self, library_name: str, mapper: str) -> list[str]: """Files to symlink to output""" + if not (self.is_wes or self.is_wgs): + return [] + base_out_lib = ( "output/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}." ).format(mapper=mapper, library_name=library_name) @@ -554,6 +568,7 @@ def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, "reference": self.w_config.static_data_config.reference.path, "min-gap-size": self.cfg.access.min_gap_size, "exclude": self.cfg.access.exclude, + "ignore_chroms": list(self.ignored), } # ----- Autobin (never used directly, only to compute target size in WGS settings) ------------ @@ -666,7 +681,10 @@ def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: # BAM/BAI file ngs_mapping = self.parent.sub_workflows["ngs_mapping"] base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) - input_files = {"bam": ngs_mapping(base_path + ".bam")} + input_files = { + "bam": ngs_mapping(base_path + ".bam"), + "bai": ngs_mapping(base_path + ".bam.bai"), + } # Region (target or antitarget) file if self.build_ref: @@ -684,7 +702,9 @@ def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: input_files["intervals"] = self.base_out.format(**wildcards) + "{region}.bed" elif self.pon_source == PanelOfNormalsOrigin.COHORT: panel_of_normals = self.parent.sub_workflows["panel_of_normals_cnvkit"] - base_path = "output/{mapper}.cnvkit/out/cnvkit.{region}.bed" + base_path = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{region}.bed".format( + **wildcards + ) input_files["intervals"] = panel_of_normals(base_path) return input_files @@ -727,11 +747,10 @@ def _get_args_reference(self, wildcards: Wildcards, input: InputFiles) -> dict[s "no-gc": not self.cfg.gc, "no-rmask": not self.cfg.rmask, "no-edge": not self.cfg.get("edge", self.is_wes), + "diploid-parx-genome": self.cfg.diploid_parx_genome, } if self.cfg.cluster: args["min-cluster-size"] = self.cfg.min_cluster_size - if self.cfg.diploid_parx_genome: - args["diploid-parx-genome"] = self.cfg.diploid_parx_genome sample_sex = self._get_sample_sex(wildcards.get("library_name", None)) if sample_sex is not None: args["sample-sex"] = str(sample_sex) @@ -760,7 +779,9 @@ def 
_get_input_files_fix(self, wildcards: Wildcards) -> dict[str, str]: input_files["reference"] = self.base_out.format(**wildcards) + "reference.cnn" elif self.pon_source == PanelOfNormalsOrigin.COHORT: panel_of_normals = self.parent.sub_workflows["panel_of_normals_cnvkit"] - base_path = "output/{mapper}.cnvkit/out/cnvkit.panel_of_normals.cnn" + base_path = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn".format( + **wildcards + ) input_files["reference"] = panel_of_normals(base_path) return input_files @@ -770,10 +791,9 @@ def _get_args_fix(self, wildcards: Wildcards, input: InputFiles) -> dict[str, st "no-gc": not self.cfg.gc, "no-rmask": not self.cfg.rmask, "no-edge": not self.cfg.get("edge", self.is_wes), + "diploid-parx-genome": self.cfg.diploid_parx_genome, } args["sample-id"] = wildcards.library_name - if self.cfg.diploid_parx_genome: - args["diploid-parx-genome"] = self.cfg.diploid_parx_genome if "reference" not in args: args["reference"] = self.cfg.panel_of_normals.path_panel_of_normals return args @@ -848,14 +868,13 @@ def _get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, s "thresholds": self.cfg.call.thresholds, "drop-low-coverage": self.cfg.drop_low_coverage, "male-reference": self.cfg.male_reference, + "diploid-parx-genome": self.cfg.diploid_parx_genome, } if self.cfg.call.center_at is not None: args["center-at"] = self.cfg.call.center_at else: if self.cfg.call.center is not None: args["center"] = self.cfg.call.center - if self.cfg.diploid_parx_genome: - args["diploid-parx-genome"] = self.cfg.diploid_parx_genome if self.cfg.somatic_variant_calling.enabled: args |= self._variants_args(wildcards, input) if "variants" not in args: @@ -968,9 +987,8 @@ def _get_args_genemetrics(self, wildcards: Wildcards, input: InputFiles) -> dict "alpha": self.cfg.genemetrics.alpha, "bootstrap": self.cfg.genemetrics.bootstrap, "stats": [x.replace("t-test", "ttest") for x in self.cfg.genemetrics.stats], + "diploid-parx-genome": self.cfg.diploid_parx_genome, } - if self.cfg.diploid_parx_genome: - args["diploid-parx-genome"] = self.cfg.diploid_parx_genome sample_sex = self._get_sample_sex(wildcards.library_name) if sample_sex is not None: args["sample-sex"] = str(sample_sex) @@ -1126,14 +1144,16 @@ def _match_normals(self, valid_dna_libraries: list[LibraryInfo]) -> dict[str, st def _optionally_register_subworkflow(self, subworkflow: str): for tool in set(self.config.tools.wgs + self.config.tools.wes): - assert self.config.get(tool) is not None, f"Requested tool '{tool}' not configured" cfg = self.config.get(tool) - subworkflow_config = cfg.get(subworkflow) + subworkflow_config = cfg.get(subworkflow, None) if ( - subworkflow_config + subworkflow_config is not None and subworkflow_config.enabled and str(subworkflow_config.source) == "cohort" ): + assert ( + self.w_config.step_config.get(subworkflow, None) is not None + ), f"Upstream step {subworkflow} not configured" self.register_sub_workflow( subworkflow, subworkflow_config.get(f"path_{subworkflow}"), diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_cnv_calling/model.py index 2d3aef2d9..970c55fa1 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/model.py +++ b/snappy_pipeline/workflows/somatic_cnv_calling/model.py @@ -5,8 +5,6 @@ from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel from snappy_pipeline.models.cnvkit import Cnvkit as CnvkitGeneric -from snappy_pipeline.models.purecn import IntervalFilter -from 
snappy_pipeline.models.purecn import Segmentation as PureCNSegmentation from snappy_pipeline.models.purecn import PureCN as PureCNBase from snappy_pipeline.models.purecn import Variant as PureCNVariantParams from snappy_pipeline.models.common import LibraryKitEntry, Sex @@ -252,6 +250,9 @@ def ensure_purity_not_auto(self): path_access: str | None = None """Overrides access when not None""" + ignore_chroms: list[str] = [] + """List of contig name patterns to ignore for processing""" + class SomaticCnvCalling(SnappyStepModel): path_ngs_mapping: str = "../ngs_mapping" @@ -264,3 +265,6 @@ class SomaticCnvCalling(SnappyStepModel): purecn: PureCN | None = None sequenza: Sequenza | None = None control_freec: ControlFreec | None = None + + ignore_chroms: list[str] = [] + """List of contig name patterns to ignore for processing""" diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index 2c2d8faf8..4afbbaf99 100644 --- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -10,6 +10,7 @@ base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) sys.path.insert(0, base_dir) +from snappy_wrappers.tools.genome_windows import ignore_chroms from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper __author__ = "Eric Blanc" @@ -17,6 +18,18 @@ args = snakemake.params.get("args", {}) +prefix = "" + +# Add the "ignore_chrom" contents to the excluded regions +if len(args.get("ignore_chroms", [])) > 0: + ignored_contigs = ignore_chroms(args["reference"], args["ignore_chroms"], return_ignored=True) + lines = ["cat << __EOF > $TMPDIR/ignore_chroms.bed"] + for (contig_name, contig_length) in ignored_contigs: + lines.append(f"{contig_name}\t0\t{contig_length}") + lines.append("__EOF") + prefix = "\n".join(lines) + "\n" + args["exclude"].append("$TMPDIR/ignore_chroms.bed") + cmd = r""" cnvkit.py access \ -o {snakemake.output.access} \ @@ -29,4 +42,4 @@ exclude=" ".join([f"--exclude {x}" for x in args["exclude"]]), ) -CnvkitWrapper(snakemake, cmd).run() +CnvkitWrapper(snakemake, prefix + cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index c7009727f..5fb01d78b 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -20,8 +20,8 @@ cmd = r""" cnvkit.py antitarget \ -o {snakemake.output.antitarget} \ - --avg-size {args[avg-size]} {min_size} - --access {files[access]} \ + --avg-size {args[avg-size]} {min_size} \ + --access {args[access]} \ {args[target]} """.format( snakemake=snakemake, diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py index 9711475ed..5020ac227 100644 --- a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py @@ -29,8 +29,8 @@ args=args, out_target=f"--target-output-bed {snakemake.output.target}" if snakemake.output.get("target", "") != "" else "", out_antitarget=f"--antitarget-output-bed {snakemake.output.antitarget}" if snakemake.output.get("antitarget", "") != "" else "", - access=f"--access {args['access']}" if "access" in args else "", - target=f"--targets {args['target']}" if "target" in args else "", + access=f"--access {args['access']}" if args.get("access", None) is not None else "", + target=f"--targets {args['target']}" if args.get("target", None) 
is not None else "",
 )
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
index e6bfbd2ae..7cf317aa6 100644
--- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py
@@ -36,7 +36,7 @@
     -o {snakemake.output.calls} \
     --method {args[method]} {thresholds} \
     {filter} \
-    {center} {center_at} {drop_low_coverage} {sample_sex} {male_reference} \
+    {center} {center_at} {drop_low_coverage} {sample_sex} {male_reference} {diploid_parx_genome} \
     {purity} {ploidy} \
     {variants} \
     {args[segments]}
@@ -53,6 +53,7 @@
     drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "",
     sample_sex=f"--sample-sex {args['sample-sex']}" if args.get("sample-sex", None) is not None else "",
     male_reference=f"--male-reference" if args.get("male-reference", False) else "",
+    diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if args.get("diploid-parx-genome", None) is not None else "",
 )
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py
index 7d843f650..6a33406da 100644
--- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py
@@ -39,9 +39,9 @@
     no_gc="--no-gc" if args.get("no-gc", False) else "",
     no_edge="--no-edge" if args.get("no-edge", False) else "",
     no_rmask="--no-rmask" if args.get("no-rmask", False) else "",
-    min_cluster_size=f"--min-cluster-size {args['min-cluster-size']}" if "min-cluster-size" in args else "",
-    sample_sex=f"--sample-sex {args['sample-sex']}" if "sample-sex" in args else "",
-    diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if "diploid-parx-genome" in args else ""
+    min_cluster_size=f"--min-cluster-size {args['min-cluster-size']}" if args.get("min-cluster-size", None) is not None else "",
+    sample_sex=f"--sample-sex {args['sample-sex']}" if args.get("sample-sex", None) is not None else "",
+    diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if args.get("diploid-parx-genome", None) is not None else ""
 )
 
 CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py
index a50a32e7e..f9c370b36 100644
--- a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py
@@ -27,8 +27,8 @@
     args=args,
     drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "",
     male_reference="--male-reference" if args.get("male-reference", False) else "",
-    sample_sex=f"--sample-sex {args['sample-sex']}" if "sample-sex" in args else "",
-    diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if "diploid-parx-genome" in args else "",
+    sample_sex=f"--sample-sex {args['sample-sex']}" if args.get("sample-sex", None) is not None else "",
+    diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if args.get("diploid-parx-genome", None) is not None else "",
     stats=" ".join([f"--{stat}" for stat in args["stats"]]),
 )
 
diff --git a/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml b/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml
new file mode 120000
index 000000000..2e107ac86
--- /dev/null
+++ b/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml
@@ -0,0 +1 @@ 
+../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py b/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py new file mode 100644 index 000000000..1869d8040 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py sex""" + +import os +import re +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper + +__author__ = "Eric Blanc" +__email__ = "eric.blanc@bih-charite.de" + +args = snakemake.params.get("args", {}) + +cmd = r""" +cnvkit.py sex \ + -o {snakemake.output.sex} \ + {diploid_parx_genome} \ + {coverages} +""".format( + snakemake=snakemake, + args=args, + diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if args.get('diploid-parx-genome', None) is not None else "", + coverages=" ".join(args["coverages"]), +) + +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index a3f72f3b7..37e1bd9c2 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -26,9 +26,9 @@ """.format( snakemake=snakemake, args=args, - avg_size=f"--avg-size {args['avg-size']}" if args['avg-size'] is not None else "", + avg_size=f"--avg-size {args['avg-size']}" if args.get("avg-size", None) is not None else "", split=f"--split" if args.get("split", False) else "", - annotate=f"--annotate {args['annotate']}" if "annotate" in args else "", + annotate=f"--annotate {args['annotate']}" if args.get("annotate", None) is not None else "", short_names="--short-names" if args.get("short-names", False) else "", ) diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index 86870f4c8..5ce31759d 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -38,21 +38,25 @@ def minimal_config(): path_index: /path/to/bwa/index.fa panel_of_normals: - tools: ['mutect2', 'cnvkit', 'access', 'purecn'] + tools: ['mutect2', 'cnvkit', 'purecn'] + ignore_chroms: [GL*] path_ngs_mapping: ../ngs_mapping mutect2: germline_resource: /path/to/germline_resource.vcf path_normals_list: "" cnvkit: - path_target_regions: "" + ignore_chroms: [MT] + path_target_interval_list_mapping: [] path_normals_list: "" + diploid_parx_genome: GRCh38 purecn: path_normals_list: "" - path_bait_regions: /path/to/baits/regions.bed + path_target_interval_list_mapping: + - name: panel + pattern: panel + path: /path/to/baits.bed path_genomicsDB: /path/to/mutect2/genomicsDB genome_name: "unknown" - access: - exclude: [/path/to/exclude.bed] data_sets: first_batch: @@ -76,12 +80,20 @@ def panel_of_normals_workflow( config_paths, cancer_sheet_fake_fs, aligner_indices_fake_fs, + autobin_result_pon_fake_fs, mocker, ): """Return PanelOfNormalsWorkflow object pre-configured with germline sheet""" + # Patch out file-system to enable reading autobin output + autobin_result_pon_fake_fs.fs.create_file( + 
file_path="work/bwa.cnvkit/out/bwa.cnvkit.autobin.txt", + contents="Target: -1 2000\n", + create_missing_dirs=True, + ) # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.panel_of_normals", autobin_result_pon_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep there dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} @@ -199,357 +211,6 @@ def test_mutect2_step_part_get_resource_usage(panel_of_normals_workflow): assert actual == expected, msg_error -# Tests for CnvkitStepPart ------------------------------------------------------------------------ - - -def test_cnvkit_step_part_get_input_files_access(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_access()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - actual = panel_of_normals_workflow.get_input_files("cnvkit", "access")(wildcards) - assert actual == {} - - -def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_access()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "bams": [ - "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", - ], - "access": "work/bwa.cnvkit/out/cnvkit.access.bed", - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_target()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "access": "work/bwa.cnvkit/out/cnvkit.access.bed", - "avg_size": "work/bwa.cnvkit/out/cnvkit.autobin.txt", - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_antitarget()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - actual = panel_of_normals_workflow.get_input_files("cnvkit", "antitarget")(wildcards) - assert actual == {} - - -def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_coverage()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - "normal_library": "P001-N1-DNA1-WGS1", - "interval": "target", - } - ) - expected = { - "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "intervals": "work/bwa.cnvkit/out/cnvkit.target.bed", - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_input_files_create_panel()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "normals": [ - "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", - 
"work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", - ], - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_access(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_access()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"reference": "/path/to/ref.fa", "min_gap_size": 5000} - actual = panel_of_normals_workflow.get_args("cnvkit", "access")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_autobin(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_autobin()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"method": "wgs", "bp_per_bin": 50000} - actual = panel_of_normals_workflow.get_args("cnvkit", "autobin")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_target()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"annotate": "/path/to/annotations.gtf", "split": True} - actual = panel_of_normals_workflow.get_args("cnvkit", "target")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_antitarget(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_antitarget()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"avg_size": 0, "min_size": 0} - actual = panel_of_normals_workflow.get_args("cnvkit", "antitarget")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_coverage()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"reference": "/path/to/ref.fa", "min_mapq": 0} - actual = panel_of_normals_workflow.get_args("cnvkit", "coverage")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_args_create_panel(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_create_panel()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = {"reference": "/path/to/ref.fa", "no_edge": True} - actual = panel_of_normals_workflow.get_args("cnvkit", "create_panel")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_target()""" - expected = { - "target": "work/{mapper}.cnvkit/out/cnvkit.target.bed", - "target_md5": "work/{mapper}.cnvkit/out/cnvkit.target.bed.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "target") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_antitarget()""" - expected = { - "antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed", - "antitarget_md5": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "antitarget") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_coverage()""" - expected = { - "coverage": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn", - "coverage_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.{interval}coverage.cnn.md5", - } - actual = 
panel_of_normals_workflow.get_output_files("cnvkit", "coverage") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_create_panel()""" - expected = { - "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", - "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "create_panel") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_target()""" - base_name_out = "work/{mapper}.cnvkit/log/cnvkit.target" - expected = get_expected_log_files_dict(base_out=base_name_out) - expected["sh"] = "work/{mapper}.cnvkit/log/cnvkit.target.sh" - expected["sh_md5"] = expected["sh"] + ".md5" - actual = panel_of_normals_workflow.get_log_file("cnvkit", "target") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_antitarget()""" - base_name_out = "work/{mapper}.cnvkit/log/cnvkit.antitarget" - expected = get_expected_log_files_dict(base_out=base_name_out) - expected["sh"] = "work/{mapper}.cnvkit/log/cnvkit.antitarget.sh" - expected["sh_md5"] = expected["sh"] + ".md5" - actual = panel_of_normals_workflow.get_log_file("cnvkit", "antitarget") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_coverage()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage" - expected = get_expected_log_files_dict(base_out=base_name_out) - expected["sh"] = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.{interval}coverage.sh" - expected["sh_md5"] = expected["sh"] + ".md5" - actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_create_panel()""" - base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" - expected = get_expected_log_files_dict(base_out=base_name_out) - expected["sh"] = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.sh" - expected["sh_md5"] = expected["sh"] + ".md5" - actual = panel_of_normals_workflow.get_log_file("cnvkit", "create_panel") - assert actual == expected - - -def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): - """Tests CvnkitStepPart.get_resource_usage()""" - # Define expected: default defined workflow.abstract - target_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "8G", - "partition": "medium", - } - antitarget_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "8G", - "partition": "medium", - } - coverage_expected_dict = { - "threads": 8, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } - reference_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } - - # Evaluate action `target` - for resource, expected in target_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'target'." 
- actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource)() - assert actual == expected, msg_error - - # Evaluate action `antitarget` - for resource, expected in antitarget_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'antitarget'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource)() - assert actual == expected, msg_error - - # Evaluate action `coverage` - for resource, expected in coverage_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'coverage'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource)() - assert actual == expected, msg_error - - # Evaluate action `create_panel` - for resource, expected in reference_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'create_panel'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource)() - assert actual == expected, msg_error - - -# Tests for AccessStepPart ------------------------------------------------------------------------- - - -def test_access_step_part_get_input_files_run(panel_of_normals_workflow): - """Tests AccessStepPart._get_input_files_run()""" - assert panel_of_normals_workflow.get_input_files("access", "run") is None - - -def test_access_step_part_get_args_run(panel_of_normals_workflow): - """Tests AccessStepPart._get_args_run()""" - expected = { - "reference": "/path/to/ref.fa", - "exclude": ["/path/to/exclude.bed"], - "min_gap_size": 0 - } - actual = panel_of_normals_workflow.get_args("access", "run") - assert actual == expected - - -def test_access_step_part_get_output_files_run(panel_of_normals_workflow): - """Tests AccessStepPart._get_output_files_run()""" - expected = { - "access": "work/access/out/access.bed", - "access_md5": "work/access/out/access.bed.md5", - } - actual = panel_of_normals_workflow.get_output_files("access", "run") - assert actual == expected - - -def test_access_step_part_get_log_file_run(panel_of_normals_workflow): - """Tests AccessStepPart._get_log_file_run()""" - expected = get_expected_log_files_dict(base_out="work/access/log/access") - expected["sh"] = "work/access/log/access.sh" - expected["sh_md5"] = expected["sh"] + ".md5" - actual = panel_of_normals_workflow.get_log_file("access", "run") - assert actual == expected - - -def test_access_step_part_get_resource_usage(panel_of_normals_workflow): - """Tests AccessStepPart.get_resource_usage()""" - # Define expected: default defined workflow.abstract - expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "8G", - "partition": "medium", - } - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'run'." 
-        actual = panel_of_normals_workflow.get_resource("access", "run", resource)()
-        assert actual == expected, msg_error
-
-
 # Tests for PureCnStepPart -------------------------------------------------------------------------
 
 
@@ -577,12 +238,12 @@ def test_purecn_step_part_get_input_files_prepare(panel_of_normals_workflow):
 def test_purecn_step_part_get_output_files_prepare(panel_of_normals_workflow):
     """Tests PureCnStepPart._get_output_files_prepare()"""
     expected = {
-        "intervals": "work/purecn/out/unknown_unknown.list",
-        "optimized": "work/purecn/out/unknown_unknown.bed.gz",
-        "tbi": "work/purecn/out/unknown_unknown.bed.gz.tbi",
-        "intervals_md5": "work/purecn/out/unknown_unknown.list.md5",
-        "optimized_md5": "work/purecn/out/unknown_unknown.bed.gz.md5",
-        "tbi_md5": "work/purecn/out/unknown_unknown.bed.gz.tbi.md5",
+        "intervals": "work/purecn/out/panel_unknown.list",
+        "optimized": "work/purecn/out/panel_unknown.bed.gz",
+        "tbi": "work/purecn/out/panel_unknown.bed.gz.tbi",
+        "intervals_md5": "work/purecn/out/panel_unknown.list.md5",
+        "optimized_md5": "work/purecn/out/panel_unknown.bed.gz.md5",
+        "tbi_md5": "work/purecn/out/panel_unknown.bed.gz.tbi.md5",
     }
     actual = panel_of_normals_workflow.get_output_files("purecn", "prepare")
     assert actual == expected
@@ -590,7 +251,7 @@ def test_purecn_step_part_get_output_files_prepare(panel_of_normals_workflow):
 
 def test_purecn_step_part_get_log_file_prepare(panel_of_normals_workflow):
     """Tests PureCnStepPart._get_log_file_prepare()"""
-    expected = get_expected_log_files_dict(base_out="work/purecn/log/unknown_unknown")
+    expected = get_expected_log_files_dict(base_out="work/purecn/log/panel_unknown")
     actual = panel_of_normals_workflow.get_log_file("purecn", "prepare")
     assert actual == expected
 
@@ -605,7 +266,7 @@ def test_purecn_step_part_get_input_files_coverage(panel_of_normals_workflow):
     )
     expected = {
         "container": "work/containers/out/purecn.simg",
-        "intervals": "work/purecn/out/unknown_unknown.list",
+        "intervals": "work/purecn/out/panel_unknown.list",
         "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam",
     }
     actual = panel_of_normals_workflow.get_input_files("purecn", "coverage")(wildcards)
@@ -681,13 +342,162 @@ def test_purecn_step_part_get_resource_usage(panel_of_normals_workflow):
         assert actual == value
 
 
+# Tests for CnvkitStepPart ------------------------------------------------------------------------
+
+
+def test_cnvkit_step_part_get_args_access(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_args_access()"""
+    wildcards = Wildcards(fromdict={"mapper": "bwa"})
+    actual = panel_of_normals_workflow.get_args("cnvkit", "access")(
+        wildcards,
+        panel_of_normals_workflow.get_input_files("cnvkit", "access")(wildcards),
+    )
+    if actual.get("ignore_chroms", None) is not None:
+        actual["ignore_chroms"].sort()
+    expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": [], "ignore_chroms": ["GL*", "MT"]}
+    assert actual == expected
+
+
+def test_cnvkit_step_part_get_args_autobin(panel_of_normals_workflow):
+    """Tests CnvkitStepPart._get_args_autobin()"""
+    wildcards = Wildcards(fromdict={"mapper": "bwa"})
+    expected = {
+        "bams": [
+            "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam",
+            "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam",
+        ],
+        "access": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed",
+        "method": "wgs",
+        "bp-per-bin": 50000,
+    }
+    actual = panel_of_normals_workflow.get_args("cnvkit", "autobin")(
+        wildcards,
+
panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_target()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + expected = { + "interval": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", + "avg-size": 2000, + "split": True, + "annotate": "/path/to/annotations.gtf", + "short-names": True, + + } + actual = panel_of_normals_workflow.get_args("cnvkit", "target")( + wildcards, + panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_coverage()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", + "region": "target", + } + ) + expected = { + "intervals": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed", + "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "reference": "/path/to/ref.fa", + "min-mapq": 0, + "count": False, + } + actual = panel_of_normals_workflow.get_args("cnvkit", "coverage")( + wildcards, + panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_part_get_args_reference(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_create_panel()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + expected = { + "normals": [ + "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.cnn", + "work/bwa.cnvkit.P002-N1-DNA1-WGS1/out/bwa.cnvkit.P002-N1-DNA1-WGS1.target.cnn", + ], + "reference": "/path/to/ref.fa", + "cluster": False, + "no-gc": False, + "no-edge": True, + "no-rmask": False, + "male-reference": False, + "diploid-parx-genome": "GRCh38", + } + actual = panel_of_normals_workflow.get_args("cnvkit", "create_panel")( + wildcards, + panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards), + ) + assert actual == expected + + +def test_cnvkit_step_parts_get_output_files(panel_of_normals_workflow): + """Tests CnvkitStepPart.get_output_files() for all actions""" + actions = { + "access": {"access": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.access.bed"}, + "autobin": {"result": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.autobin.txt"}, + "target": {"target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed"}, + "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed"}, + "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}.cnn"}, + "create_panel": {"reference": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn"}, + "sex": {"sex": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.sex.tsv"}, + } + for action, result in actions.items(): + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = panel_of_normals_workflow.get_output_files("cnvkit", action) + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file(panel_of_normals_workflow): + """Tests CnvkitStepPart.get_log_file() for all actions""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + actions = ("autobin", "target", "create_panel", "sex") + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit" + for action in 
actions: + result = {k: base_log + f".{action}.{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = panel_of_normals_workflow.get_log_file("cnvkit", action) + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file_access(panel_of_normals_workflow): + """Tests CnvkitStepPart.get_log_file() for access""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.access" + result = {k: base_log + f".{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = panel_of_normals_workflow.get_log_file("cnvkit", "access") + assert actual == expected + + +def test_cnvkit_step_parts_get_log_file_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart.get_log_file() for coverage""" + exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) + base_log = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}" + result = {k: base_log + f".{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage") + assert actual == expected + + # PanelOfNormalsWorkflow -------------------------------------------------------------------------- def test_panel_of_normals_workflow(panel_of_normals_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["access", "cnvkit", "link_out", "mutect2", "purecn"] + expected = ["cnvkit", "link_out", "mutect2", "purecn"] actual = list(sorted(panel_of_normals_workflow.sub_steps.keys())) assert actual == expected @@ -711,37 +521,20 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): expected += get_expected_log_files_dict(base_out=tpl.format(mapper=mapper)).values() # Now for basic cnvkit files (panel of normal only) - tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}" + tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{substep}.{ext}{chksum}" expected += [ - tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) + tpl.format(mapper=mapper, substep=substep, ext=ext, chksum=chksum) + for chksum in ("", ".md5") + for (substep, ext) in (("panel_of_normals", "cnn"), ("sex", "tsv"), ("target", "bed")) + for mapper in ("bwa",) ] - tpl = "output/{mapper}.cnvkit/out/cnvkit.{substep}.{ext}" - for substep in ("target", "antitarget"): - expected += [ - tpl.format(substep=substep, mapper=mapper, ext=ext) - for ext in ("bed", "bed.md5") - for mapper in ("bwa",) - ] # add log files - tpl = "output/{mapper}.cnvkit/log/cnvkit.{substep}" - for substep in ("target", "antitarget"): + tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" + for substep in ("create_panel", "sex", "target"): for mapper in ("bwa",): base_out = tpl.format(mapper=mapper, substep=substep) expected += get_expected_log_files_dict(base_out=base_out).values() expected += [base_out + ".sh", base_out + ".sh.md5"] - tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" - for mapper in ("bwa",): - base_out = tpl.format(mapper=mapper, substep=substep) - expected += get_expected_log_files_dict(base_out=base_out).values() - expected += [base_out + ".sh", base_out + ".sh.md5"] - - # Access - tpl = "output/access/out/access.{ext}" - expected += [tpl.format(ext=ext) for ext in ("bed", 
"bed.md5")] - expected += get_expected_log_files_dict( - base_out="output/access/log/access" - ).values() - expected += ["output/access/log/access.sh", "output/access/log/access.sh.md5"] # PureCN tpl = "output/{mapper}.purecn/out/{mapper}.purecn.panel_of_normals.rds{chksum}" @@ -755,14 +548,14 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): expected += get_expected_log_files_dict( base_out="output/{mapper}.purecn/log/{mapper}.purecn.panel_of_normals".format(mapper="bwa") ).values() - tpl = "output/purecn/out/unknown_unknown.{ext}{chksum}" + tpl = "output/purecn/out/panel_unknown.{ext}{chksum}" expected += [ tpl.format(ext=ext, chksum=chksum) for ext in ("list", "bed.gz", "bed.gz.tbi") for chksum in ("", ".md5") ] expected += get_expected_log_files_dict( - base_out="output/purecn/log/unknown_unknown".format() + base_out="output/purecn/log/panel_unknown".format() ).values() expected = list(sorted(expected)) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py index fe954b333..e9e7daa42 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py @@ -50,10 +50,12 @@ def minimal_config(): b_af_loci: /path/to/locii.bed somatic_cnv_calling: + ignore_chroms: [GL*] tools: wgs: ['cnvkit'] path_ngs_mapping: ../ngs_mapping cnvkit: + ignore_chroms: [MT] diploid_parx_genome: GRCh38 panel_of_normals: source: paired @@ -92,14 +94,14 @@ def somatic_cnv_calling_workflow( work_dir, config_paths, cancer_sheet_fake_fs, - autobin_result_fake_fs, + autobin_result_calling_fake_fs, purity_result_fake_fs, aligner_indices_fake_fs, mocker, ): """Return PanelOfNormalsWorkflow object pre-configured with germline sheet""" # Patch out file-system to enable reading autobin output - autobin_result_fake_fs.fs.create_file( + autobin_result_calling_fake_fs.fs.create_file( file_path="work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.autobin.txt", contents="Target: -1 2000\n", create_missing_dirs=True, @@ -111,7 +113,7 @@ def somatic_cnv_calling_workflow( create_missing_dirs=True, ) # Patch out file-system related things in abstract (the crawling link in step is defined there) - patch_module_fs("snappy_pipeline.workflows.somatic_cnv_calling", autobin_result_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.somatic_cnv_calling", autobin_result_calling_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we @@ -146,7 +148,9 @@ def test_cnvkit_step_part_get_args_access(somatic_cnv_calling_workflow): wildcards, somatic_cnv_calling_workflow.get_input_files("cnvkit", "access")(wildcards), ) - expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": []} + if actual.get("ignore_chroms", None) is not None: + actual["ignore_chroms"].sort() + expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": [], "ignore_chroms": ["GL*", "MT"]} assert actual == expected @@ -160,7 +164,7 @@ def test_cnvkit_step_part_get_args_autobin(somatic_cnv_calling_workflow): ) expected = { "bams": ["NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam"], - "access": "work/bwa.cnvkit/out/cnvkit.access.bed", + "access": 
"work/bwa.cnvkit/out/bwa.cnvkit.access.bed", "method": "wgs", "bp-per-bin": 50000, } @@ -180,7 +184,7 @@ def test_cnvkit_step_part_get_args_target(somatic_cnv_calling_workflow): } ) expected = { - "interval": "work/bwa.cnvkit/out/cnvkit.access.bed", + "interval": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", "avg-size": 2000, "split": True, "annotate": "/path/to/annotations.gtf", @@ -206,6 +210,7 @@ def test_cnvkit_step_part_get_args_coverage(somatic_cnv_calling_workflow): expected = { "intervals": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.bed", "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", "reference": "/path/to/ref.fa", "min-mapq": 0, "count": False, @@ -453,10 +458,10 @@ def test_cnvkit_step_part_get_args_scatter(somatic_cnv_calling_workflow): def test_cnvkit_step_parts_get_output_files(somatic_cnv_calling_workflow): """Tests CnvkitStepPart.get_output_files() for all actions""" actions = { - "access": {"access": "work/{mapper}.cnvkit/out/cnvkit.access.bed"}, + "access": {"access": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.access.bed"}, "autobin": {"result": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.autobin.txt"}, "target": {"target": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.target.bed"}, - "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/cnvkit.antitarget.bed"}, + "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed"}, "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}.cnn"}, "reference": {"reference": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.reference.cnn"}, "fix": {"ratios": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr"}, @@ -492,7 +497,7 @@ def test_cnvkit_step_parts_get_log_file(somatic_cnv_calling_workflow): def test_cnvkit_step_parts_get_log_file_access(somatic_cnv_calling_workflow): """Tests CnvkitStepPart.get_log_file() for access""" exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) - base_log = "work/{mapper}.cnvkit/log/cnvkit.access" + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.access" result = {k: base_log + f".{v}" for k, v in exts} expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", "access") From bdeef1408f857af4650462c0eea82a0174adb8e1 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Thu, 28 Nov 2024 18:32:21 +0100 Subject: [PATCH 40/46] tests: removed cnvkit from older cnv calling steps --- .../somatic_cnv_checking/__init__.py | 27 ++--- .../workflows/somatic_cnv_checking/model.py | 23 +--- .../test_workflows_somatic_cnv_checking.py | 29 +++-- ...kflows_somatic_targeted_seq_cnv_calling.py | 102 +++++++++--------- .../test_workflows_somatic_wgs_cnv_calling.py | 88 +++++++-------- 5 files changed, 125 insertions(+), 144 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py index febda8197..807a2f91d 100644 --- a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py @@ -72,10 +72,7 @@ ResourceUsage, ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow -from 
snappy_pipeline.workflows.somatic_targeted_seq_cnv_calling import ( - SomaticTargetedSeqCnvCallingWorkflow, -) -from snappy_pipeline.workflows.somatic_wgs_cnv_calling import SomaticWgsCnvCallingWorkflow +from snappy_pipeline.workflows.somatic_cnv_calling import SomaticCnvCallingWorkflow from .model import SomaticCnvChecking as SomaticCnvCheckingConfigModel @@ -294,23 +291,14 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) workdir, config_model_class=SomaticCnvCheckingConfigModel, previous_steps=( - SomaticTargetedSeqCnvCallingWorkflow, - SomaticWgsCnvCallingWorkflow, + SomaticCnvCallingWorkflow, NgsMappingWorkflow, ), ) - if self.config.path_cnv_calling and self.config.cnv_assay_type: - if self.config.cnv_assay_type == "WES": - cnv_calling = "somatic_targeted_seq_cnv_calling" - elif self.config.cnv_assay_type == "WES": - cnv_calling = "somatic_wgs_cnv_calling" - else: - raise InvalidConfiguration( - "Illegal cnv_assay_type {}, must be either WES or WGS".format( - self.config.cnv_assay_type - ) - ) - self.register_sub_workflow(cnv_calling, self.config.path_cnv_calling, "cnv_calling") + if self.config.path_cnv_calling: + self.register_sub_workflow( + "somatic_cnv_calling", self.config.path_cnv_calling, "cnv_calling" + ) self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -367,8 +355,9 @@ def get_result_files(self): ext = {"out": [".vcf.gz", ".vcf.gz.tbi"]} if self.config.path_cnv_calling: # CNV avaliable + # TODO: make the tool library-dependent (supporting both wes & wgs) name_pattern = "{mapper}.{caller}.{library_name}" - callers = self.w_config.step_config["somatic_targeted_seq_cnv_calling"].tools + callers = self.w_config.step_config["somatic_cnv_calling"].tools.wgs ext["out"] += [".tsv"] ext["report"] = (".cnv.pdf", ".locus.pdf", ".segment.pdf") ext["log"] = [ diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/model.py b/snappy_pipeline/workflows/somatic_cnv_checking/model.py index f7ec574a5..704b9aceb 100644 --- a/snappy_pipeline/workflows/somatic_cnv_checking/model.py +++ b/snappy_pipeline/workflows/somatic_cnv_checking/model.py @@ -1,27 +1,14 @@ -import enum from typing import Annotated -from pydantic import Field, model_validator +from pydantic import Field from snappy_pipeline.models import SnappyStepModel -class CnvAssayType(enum.StrEnum): - WES = "WES" - WGS = "WGS" - - class SomaticCnvChecking(SnappyStepModel): path_ngs_mapping: str = "../ngs_mapping" - path_cnv_calling: Annotated[str, Field(examples=["../somatic_targeted_seq_cnv_calling"])] = "" - - cnv_assay_type: CnvAssayType | None = None - """ - Empty: no CNV, - WES for somatic_targeted_seq_snv_calling step, - WGS for somatic_wgs_cnv_calling step - """ + path_cnv_calling: Annotated[str, Field(examples=["../somatic_cnv_calling"])] = "" excluded_regions: str = "" """Bed file of regions to be excluded""" @@ -34,9 +21,3 @@ class SomaticCnvChecking(SnappyStepModel): min_baf: Annotated[float, Field(0.4, ge=0, le=0.5)] """Maximum BAF to consider variant as heterozygous (between 0 & 1/2)""" - - @model_validator(mode="after") - def ensure_cnv_assay_type_is_specified(self): - if self.path_cnv_calling and not self.cnv_assay_type: - raise ValueError("CNV assay type must be specified") - return self diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py index 733b69284..ed024080d 
100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py @@ -12,7 +12,7 @@ from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict from .conftest import patch_module_fs -__author__ = "Manuel Holtgrewe " +__author__ = "Eric Blanc " @pytest.fixture(scope="module") # otherwise: performance issues @@ -35,17 +35,28 @@ def minimal_config(): bwa: path_index: /path/to/bwa/index.fa - somatic_targeted_seq_cnv_calling: - tools: ["cnvkit"] + somatic_cnv_calling: + tools: + wgs: ["cnvkit"] cnvkit: - path_target: DUMMY - path_antitarget: DUMMY - path_panel_of_normals: DUMMY + diploid_parx_genome: GRCh38 + panel_of_normals: + source: paired + somatic_variant_calling: + enabled: False + source: cohort + tool: mutect2 + path_somatic_variant_calling: ../somatic_variant_calling + somatic_purity_ploidy_estimate: + enabled: False + source: cohort + tool: ascat + segment: + threshold: 0.0001 somatic_cnv_checking: path_ngs_mapping: ../ngs_mapping - path_cnv_calling: ../somatic_targeted_seq_cnv_calling - cnv_assay_type: WES + path_cnv_calling: ../somatic_cnv_calling data_sets: first_batch: @@ -71,7 +82,7 @@ def somatic_cnv_checking_workflow( aligner_indices_fake_fs, mocker, ): - """Return SomaticTargetedSeqCnvCallingWorkflow object pre-configured with germline sheet""" + """Return SomaticCnvCallingWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py index e49b5350a..815acdf73 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py @@ -41,14 +41,14 @@ def minimal_config(): somatic_targeted_seq_cnv_calling: tools: - cnvetti_on_target - - cnvkit + # - cnvkit - copywriter - sequenza - purecn - cnvkit: - path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed - path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed - path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn + # cnvkit: + # path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + # path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + # path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn purecn: path_container: /path/to/purecn/container path_intervals: /path/to/interval/list @@ -1107,52 +1107,52 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call ) ] # cnvkit - tpl = f"output/{name_pattern}/out/{name_pattern}".format(method="cnvkit") + "{ext}{md5}" - expected += [ - tpl.format(i=i, t=t, ext=ext, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ( - ".cnr", - "_dnacopy.seg", - ".bed.gz", - ".bed.gz.tbi", - ".seg", - ".vcf.gz", - ".vcf.gz.tbi", - ) - for md5 in ("", ".md5") - ] - tpl = ( - f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - 
+ ".{plot}.{ext}{md5}" - ) - expected += [ - tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) - for md5 in ("", ".md5") - ] - tpl = ( - f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - + ".{plot}.chr{chrom}.{ext}{md5}" - ) - expected += [ - tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) - for chrom in chain(range(1, 23), ("X", "Y")) - for md5 in ("", ".md5") - ] - tpl = ( - f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - + ".{report}.txt{md5}" - ) - expected += [ - tpl.format(i=i, t=t, report=report, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") - for md5 in ("", ".md5") - ] + # tpl = f"output/{name_pattern}/out/{name_pattern}".format(method="cnvkit") + "{ext}{md5}" + # expected += [ + # tpl.format(i=i, t=t, ext=ext, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for ext in ( + # ".cnr", + # "_dnacopy.seg", + # ".bed.gz", + # ".bed.gz.tbi", + # ".seg", + # ".vcf.gz", + # ".vcf.gz.tbi", + # ) + # for md5 in ("", ".md5") + # ] + # tpl = ( + # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") + # + ".{plot}.{ext}{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) + # for md5 in ("", ".md5") + # ] + # tpl = ( + # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") + # + ".{plot}.chr{chrom}.{ext}{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) + # for chrom in chain(range(1, 23), ("X", "Y")) + # for md5 in ("", ".md5") + # ] + # tpl = ( + # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") + # + ".{report}.txt{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, report=report, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + # for md5 in ("", ".md5") + # ] # copywriter tpl = f"output/{name_pattern}/out/{name_pattern}".format(method="copywriter") + "_{ext}{md5}" expected += [ diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py index 40038a846..e09b8a349 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py @@ -47,17 +47,17 @@ def minimal_config(): - canvas - cnvetti - control_freec - - cnvkit + # - cnvkit tools_ngs_mapping: - bwa canvas: path_reference: /path/to/reference.fasta path_filter_bed: /path/to/filter.bed path_genome_folder: /path/to/genome/folder - cnvkit: - path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed - path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed - path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn + # cnvkit: + # path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + # 
path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + # path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn cnvetti: {} control_freec: path_chrlenfile: /path/to/chrlenfile @@ -806,45 +806,45 @@ def test_somatic_cnv_calling_workflow(somatic_wgs_cnv_calling_workflow): for mapper in ("bwa",) for cnv_caller in ("control_freec",) ] - # -- add files from cnvkit - tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{md5}" - expected += [ - tpl.format(i=i, t=t, ext=ext, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ("cnr", "cns", "bed", "seg", "vcf.gz", "vcf.gz.tbi") - for md5 in ("", ".md5") - ] - tpl = ( - "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" - "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.{ext}{md5}" - ) - expected += [ - tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) - for md5 in ("", ".md5") - ] - tpl = ( - "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" - "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.chr{chrom}.{ext}{md5}" - ) - expected += [ - tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) - for chrom in chain(range(1, 23), ("X", "Y")) - for md5 in ("", ".md5") - ] - tpl = ( - "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" - "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{report}.txt{md5}" - ) - expected += [ - tpl.format(i=i, t=t, report=report, md5=md5) - for i, t in ((1, 1), (2, 1), (2, 2)) - for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") - for md5 in ("", ".md5") - ] + # # -- add files from cnvkit + # tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{md5}" + # expected += [ + # tpl.format(i=i, t=t, ext=ext, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for ext in ("cnr", "cns", "bed", "seg", "vcf.gz", "vcf.gz.tbi") + # for md5 in ("", ".md5") + # ] + # tpl = ( + # "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + # "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.{ext}{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) + # for md5 in ("", ".md5") + # ] + # tpl = ( + # "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + # "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.chr{chrom}.{ext}{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) + # for chrom in chain(range(1, 23), ("X", "Y")) + # for md5 in ("", ".md5") + # ] + # tpl = ( + # "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + # "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{report}.txt{md5}" + # ) + # expected += [ + # tpl.format(i=i, t=t, report=report, md5=md5) + # for i, t in ((1, 1), (2, 1), (2, 2)) + # for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + # for md5 in ("", ".md5") + # ] # Perform the comparison expected = list(sorted(expected)) actual = list(sorted(somatic_wgs_cnv_calling_workflow.get_result_files())) From 7e7273d83cd70cd96c222c957a2d4cab888310db Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Thu, 5 Dec 2024 18:38:02 +0100 Subject: [PATCH 41/46] 
refactor: all files moved to snakemake.input, filenames in sync with cnvkit (as much as possible), added to documentation --- snappy_pipeline/models/cnvkit.py | 20 +- .../workflows/panel_of_normals/Snakefile | 38 + .../workflows/panel_of_normals/__init__.py | 264 ++++-- .../workflows/panel_of_normals/model.py | 16 +- .../workflows/somatic_cnv_calling/__init__.py | 242 +++-- .../somatic_cnv_calling/cnvkit.rules | 21 + snappy_wrappers/tools/chromosome_lengths.py | 850 ++++++++++++++++++ snappy_wrappers/tools/genome_windows.py | 75 +- .../wrappers/cnvkit/access/wrapper.py | 20 +- .../wrappers/cnvkit/antitarget/wrapper.py | 5 +- .../wrappers/cnvkit/autobin/wrapper.py | 6 +- .../wrappers/cnvkit/bintest/wrapper.py | 4 +- .../wrappers/cnvkit/call/wrapper.py | 12 +- .../wrappers/cnvkit/coverage/wrapper.py | 4 +- .../wrappers/cnvkit/fix/wrapper.py | 13 +- .../wrappers/cnvkit/ignore/environment.yaml | 1 + .../wrappers/cnvkit/ignore/wrapper.py | 33 + .../wrappers/cnvkit/plot/scatter/wrapper.py | 36 +- .../wrappers/cnvkit/reference/wrapper.py | 11 +- .../cnvkit/report/genemetrics/wrapper.py | 4 +- .../wrappers/cnvkit/report/metrics/wrapper.py | 6 +- .../cnvkit/report/segmetrics/wrapper.py | 4 +- .../wrappers/cnvkit/segment/wrapper.py | 8 +- .../wrappers/cnvkit/sex/wrapper.py | 2 +- .../wrappers/cnvkit/target/wrapper.py | 4 +- .../test_workflows_panel_of_normals.py | 164 ++-- .../test_workflows_somatic_cnv_calling.py | 326 ++++--- 27 files changed, 1766 insertions(+), 423 deletions(-) create mode 100644 snappy_wrappers/tools/chromosome_lengths.py create mode 120000 snappy_wrappers/wrappers/cnvkit/ignore/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/ignore/wrapper.py diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py index 8f9de72df..2063d52f7 100644 --- a/snappy_pipeline/models/cnvkit.py +++ b/snappy_pipeline/models/cnvkit.py @@ -182,8 +182,12 @@ class Call(SnappyModel): When this parameter is set, the centering method should be left empty. """ - filter: FilterMethod | None = None - """Merge segments flagged by the specified filter(s) with the adjacent segment(s).""" + filter: list[FilterMethod] | None = None + """ + Merge segments flagged by the specified filter(s) with the adjacent segment(s). + + When ``None``, ``segmetrics`` enabled & ``smooth_bootstrap`` is None, the behaviour is identical to ``batch``: filtering is done using ``ci``. + """ @model_validator(mode="after") def ensure_center_without_center_at(self) -> Self: @@ -234,8 +238,8 @@ class PlotScatter(Plot): """y-axis upper limit.""" y_min: float | None = None """y-axis lower limit.""" - fig_size: tuple[float, float] = (6.4, 4.8) - """Width and height of the plot in inches.""" + fig_size: tuple[float, float] = (12.256, 16.192) + """Width and height of the plot in centimeters (might depend on the locale).""" @model_validator(mode="after") def ensure_range_list_with_gene(self) -> Self: @@ -268,8 +272,12 @@ class ReportSegmetrics(Report): """Level to estimate confidence and prediction intervals; use with --ci and --pi.""" bootstrap: int = 100 """Number of bootstrap iterations to estimate confidence interval; use with --ci.""" - smooth_bootstrap: bool = False - """Apply Gaussian noise to bootstrap samples, a.k.a. smoothed bootstrap, to estimate confidence interval""" + smooth_bootstrap: bool = True + """ + Apply Gaussian noise to bootstrap samples, a.k.a. smoothed bootstrap, to estimate confidence interval + + This is _NOT_ the ``cnvkit`` default, but it is automatically set in ``batch`` mode. 
+ """ stats: list[ReportStats] = [ ReportStats.MEAN, ReportStats.MEDIAN, diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index e311589c3..fb5c8b6a0 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -104,7 +104,27 @@ rule panel_of_normals_mutect2_create_panel: # Write out the normals-only results for the normals -------------------------- +rule panel_of_normals_cnvkit_ignore: + input: + unpack(wf.get_input_files("cnvkit", "ignore")), + output: + **wf.get_output_files("cnvkit", "ignore"), + threads: wf.get_resource("cnvkit", "ignore", "threads") + resources: + time=wf.get_resource("cnvkit", "ignore", "time"), + memory=wf.get_resource("cnvkit", "ignore", "memory"), + partition=wf.get_resource("cnvkit", "ignore", "partition"), + log: + **wf.get_log_file("cnvkit", "ignore"), + params: + **{"args": wf.get_args("cnvkit", "ignore")}, + wrapper: + wf.wrapper_path("cnvkit/ignore") + + rule panel_of_normals_cnvkit_access: + input: + unpack(wf.get_input_files("cnvkit", "access")), output: **wf.get_output_files("cnvkit", "access"), threads: wf.get_resource("cnvkit", "access", "threads") @@ -231,6 +251,24 @@ rule panel_of_normals_cnvkit_sex: wf.wrapper_path("cnvkit/sex") +rule panel_of_normals_cnvkit_metrics: + input: + unpack(wf.get_input_files("cnvkit", "metrics")), + output: + **wf.get_output_files("cnvkit", "metrics"), + threads: wf.get_resource("cnvkit", "metrics", "threads") + resources: + time=wf.get_resource("cnvkit", "metrics", "time"), + memory=wf.get_resource("cnvkit", "metrics", "memory"), + partition=wf.get_resource("cnvkit", "metrics", "partition"), + log: + **wf.get_log_file("cnvkit", "metrics"), + params: + **{"args": wf.get_args("cnvkit", "metrics")}, + wrapper: + wf.wrapper_path("cnvkit/report/metrics") + + # Panel of normals (PureCN) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index b919fd480..4538a2ad7 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -44,10 +44,25 @@ Notes on the ``cnvkit`` workflow ================================ +-------- +Overview +-------- + ``cnvkit`` is a set of tools originally designed to call somatic copy number alterations from exome data. Its design is modular, which enables its use for whole genome and amplicon data. -Provided that sufficient normal samples are available, the ``cnvkit`` `documentation `_ +Because it was designed primarily for whole exome data. It has the concept of _targets_ (the regions enriched by the exome kit), +and the _antitargets_ (those regions accessible for CNV calling, but outside of enrichment). +The coverage of _targets_ and _antitargets_ are expected to be very different, +but there is still information to be gained in the _antitarget_ regions, +albeit at a much lower resolution than for _target_ regions. + +For WGS data, the _target_ regions generally cover the whole accessible genome, with empty _antitarget_ regions. + +.. tip:: For Agilent kits, the ``cnvkit`` authors recommend to use baits as targets. Baits are in the ``*_Covered.bed`` file. 
+
+Provided that sufficient normal samples are available (`10 to 20 are considered sufficient `_),
+the ``cnvkit`` `documentation `_
 recommends the creation of a panel of normal (called ``reference``) for exome and whole genome data.
 
 .. note::
 
     The actual workflow to generate this reference is slightly different between exome and whole genome data.
     The current implementation recapitulates the common practice, while still dispaching computations on multiple cluster nodes.
 
------------
-Access file
------------
-
-``cnvkit`` can use a bed file describing the accessible regions for coverage computations.
-The ``cnvkit`` distribution provides it for the ``GRCh37`` human genome release, but incompletely only for ``GRCh38``.
-Therefore, a tentative ``access`` tool has been added, to generate this bed file when the user knows which locii should be excluded from coverage.
-Its output (``output/cnvkit.access/out/cnvkit.access.bed``) is optional, but its presence impacts of the way the target and antitarget regions are computed in whole genome mode.
-
-.. note::
-
-    In a nutshell, for exome data, the accessibility file is only used to create antitarget regions.
-    These regions are essentially the accessible regions minus the target regions (with edge effect correction).
-
-Access files can be generated from the genome reference ``fasta`` file, and optionally ``bed`` file(s) containing regions to exclude from further computations.
-In this case, the user must proceed in two steps:
-
-First, she needs to run the ``access`` tool to create the desired access file
-
-.. code-block:: yaml
-
-    panel_of_normals:
-        tools: [access]
-        access:
-            exclude:
-
-This will create ``output/cnvkit.access/out/cnvkit.access.bed`` from the genomic sequence & excluded regions.
-
-When there are no exclusion regions, the access file is automatically created using only the reference genome, and removing masked regions.
+---------------------------------
+Regions accessible to CNV calling
+---------------------------------
+
+``cnvkit`` needs to know about the regions of the genome accessible to CNV calling.
+Typically, regions masked with ``N`` are excluded, but a user may also want to exclude
+repeats, segmental duplications, or low complexity regions.
+
+There are multiple ways to get this information:
+
+1. The user can provide a ``bed`` file detailing accessible regions, using the ``path_access`` option in the ``access`` part of the configuration.
+2. The user can specifically exclude regions using the ``exclude`` option in the ``access`` part of the configuration (see the sketch below).
+   In this case, the pipeline will create an accessible regions file from the whole genome and the excluded parts.
+3. Otherwise, the pipeline creates the accessible regions file from the reference genome sequence, by removing
+   only parts masked with ``N``.
+
+.. note:: An additional constraint when picking option 3 is that the ``min_gap_size`` parameter cannot be set.
+
+.. note::
+    Some short contigs (the mitochondrion, unplaced & unlocalized contigs, decoys, viral sequences, ...) are too short
+    for a reliable estimation of copy number changes.
+    ``snappy`` provides a generic way to ignore those contigs during processing (post-mapping), through the ``ignore_chroms`` configuration option.
+    This parameter is generally set at the _step_ level, typically ignoring decoys, HLA variants and viral sequences.
+    This is suitable for small variant calling (calling variants for genes on unplaced/unlocalized contigs is fine),
+    but not for CNA calling.
+    Therefore, ``ignore_chroms`` can also be set at the _tool_ level in the configuration. In that case, contigs from both options will be ignored.
+    Contigs ignored during panel of normals creation will _not_ be assessed during calling.
+
+.. note:: In WES mode, using ``ignore_chroms`` options is generally not necessary, unless the baits definition includes small contigs.
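+For illustration, a configuration combining option 2 with contig filtering might read as follows.
+This is a sketch only: the excluded ``bed`` file and the contig patterns are placeholders, to be adapted to the cohort.
+
+.. code-block:: yaml
+
+    panel_of_normals:
+        tools: [cnvkit]
+        ignore_chroms: [GL*]          # step level, for example unlocalized contigs
+        cnvkit:
+            ignore_chroms: [MT]       # tool level, additionally ignored for CNV calling
+            path_normals_list: ""     # keep empty to use all normals from the cohort
+            access:
+                exclude: [/path/to/low_complexity.bed]  # option 2: access is the genome minus these regions
+                min_gap_size: 5000    # cannot be set with option 3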
 
 ------------------------
 Panel of normal creation
 ------------------------
 
-If the user wants to create her own access file, then the panel of normal can only be created after the ``access`` tool has been run.
-If she decides that the access file provided in the ``cnvkit`` distribution is suitable (no excluded region),
-then she can skip the ``access`` tool step and directly creates her panel of normals.
-
-In both cases, the configuration might read:
+By default, the panel is built using all normal samples in the cohort.
+However, it is possible to select a sub-set of samples using the ``path_normals_list`` configuration option.
+This is the path to a file with one library name per line.
 
-.. code-block:: yaml
-
-    panel_of_normals:
-        tools: [cnvkit]  # , access]
-        access:  # Even when created by the ``access`` tool.
-        path_target_regions:  # Keep empty for WGS data
-        path_normals_list:  # Keep empty to use all available normals
-
-Note that there is no provision (yet) to automatically create separate panel of normals for males & females.
-If the number of samples collected in the same fashion is large enough, it is nevertheless the way to achieve best results.
+The current implementation doesn't allow for mixing WES & WGS data, nor for mixing multiple exome enrichment kits.
+The selection of the enrichment kit is done through the ``path_target_interval_list_mapping`` option.
+
+TODO: implement support for cohorts collected using different enrichment kits (mixing WGS & WES is more difficult, and may never happen)
 
 -------
 Reports
 -------
 
 To create the target regions from the baits (or from the accessible regions),
 the target average bin size must be set.
 There is a reasonable default value for exome data,
 but an additional ``autobin`` step is required for the whole genome data.
+In ``batch`` mode, this value is computed from the coverage over the full genome.
 
 .. note::
 
     The ``cnvkit batch`` command also allows the creation of a flat reference, when there are no normal samples.
     This is not implemented in the ``panel_of_normals`` step, for obvious reasons.
     Using a flat reference for CNV computations is nevertheless possible, it is implemented in the ``somatic_cnv_calling`` step.
 
+``cnvkit`` can infer the sex of a donor from a sample's coverage over sex chromosomes.
+The decision is based on the ratio of _G_ test statistics computed from the autosome & sex chromosome coverages.
+More precisely, the _G_ statistic is computed (with Yates continuity correction) from the contingency table built using
+the number of bins with coverage higher or lower than the grand median, from the autosomes and the sex chromosome.
+
+.. math::
+    G(x) = G \\left( \\begin{array}{cc} N(c_a > m) & N(c_x > m) \\\\ N(c_a < m) & N(c_x < m) \\end{array} \\right)
+
+where :math:`c_a` is the coverage over the autosomes, :math:`c_x` the coverage over the sex chromosome `x`, and :math:`m = \\text{median}(c_a, c_x)`.
+The coverages are defined as the base-2 logarithm of the coverage depth.
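+
+The corrected _G_ value itself is not written out by ``cnvkit``.
+As an illustration of the definition above, it could be recomputed from the binned log2 coverages with a small
+stand-alone helper (a sketch only, not part of the pipeline nor of ``cnvkit``; the function name and the use of
+``numpy`` are assumptions made for this example):
+
+.. code-block:: python
+
+    import numpy as np
+
+    def g_statistic(auto: np.ndarray, sex: np.ndarray) -> float:
+        """G statistic with Yates continuity correction, for the 2x2 table of
+        bins above/below the grand median (autosomes vs. sex chromosome)."""
+        m = np.nanmedian(np.concatenate([auto, sex]))
+        # 2x2 contingency table, as in the formula above (NaN bins drop out of the counts)
+        obs = np.array(
+            [
+                [np.sum(auto > m), np.sum(sex > m)],
+                [np.sum(auto < m), np.sum(sex < m)],
+            ],
+            dtype=float,
+        )
+        # Expected counts under independence of coverage class & chromosome group
+        exp = np.outer(obs.sum(axis=1), obs.sum(axis=0)) / obs.sum()
+        # Yates continuity correction: move observed counts by 0.5 towards expectation
+        adj = np.where(np.abs(obs - exp) > 0.5, obs - np.sign(obs - exp) * 0.5, exp)
+        return float(2.0 * np.sum(adj * np.log(adj / exp)))
+
+The shifted statistics, for example :math:`G(X+1)`, are then obtained by calling the helper with ``sex + 1``,
+since the shifts are applied to the log2 coverages of the sex chromosome.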
-.. code-block:: yaml
+The current implementation doesn't allow for mixing WES & WGS data, nor mixing multiple exome enrichment kits.
+The selection of the enrichment kit is done through the ``path_target_interval_list_mapping`` option.

-    panel_of_normals:
-        tools: [cnvkit]  # , access]
-        access:  # Even when created by the ``access`` tool.
-        path_target_regions:  # Keep empty for WGS data
-        path_normals_list:  # Keep empty to use all available normals
-
-Note that there is no provision (yet) to automatically create separate panel of normals for males & females.
-If the number of samples collected in the same fashion is large enough, it is nevertheless the way to achieve best results.
+TODO: implement support for cohorts collected using different enrichment kits (mixing WGS & WES is more difficult, and may never happen)

-------
Reports
-------
@@ -135,12 +142,54 @@
 To create the target regions from the baits (or from the accessible regions), the target average bin size must be set.
 There is a reasonable default value for exome data, but an additional ``autobin`` step is required for the whole genome data.
 In ``batch`` mode, this value is computed from the coverage over the full genome
+
 .. note::

     The ``cnvkit batch`` command also allows the creation of a flat reference, when there are no normal samples.
     This is not implemented in the ``panel_of_normals`` step, for obvious reasons.
     Using a flat reference for CNV computations is nevertheless possible, it is implemented in the ``somatic_cnv_calling`` step.

+``cnvkit`` can infer the sex of a donor from a sample's coverage over sex chromosomes.
+The decision is based on the ratio of _G_ test statistics computed from the autosome & sex chromosome coverages.
+More precisely, the _G_ statistic is computed (with Yates continuity correction) from the contingency table built using
+the number of bins with coverage higher or lower than the grand median, from the autosomes and the sex chromosome.
+
+.. math::
+    G(x) = G \\left( \\begin{array}{cc} N(c_a > m) & N(c_x > m) \\\\ N(c_a < m) & N(c_x < m) \\end{array} \\right)
+
+where :math:`c_a` is the coverage over the autosomes, :math:`c_x` is the coverage over the sex chromosome `x`, and :math:`m = \\text{median}(c_a, c_x)`.
+The coverages are defined as the base-2 logarithm of the coverage depth.
+
+The final score is obtained after shifting the coverages by 0, 1 or 3, depending on the case.
+
+.. math::
+    \\text{score} = G(X)/G(X+1) \\cdot G(Y+3)/G(Y)
+
+When the score is higher than 1, the sex is inferred as male.
+
+For each sample, the scores are computed on target & antitarget regions separately. When the inferred sexes differ, the antitarget result is selected.
+
+.. warning:: The sex inference results are questionable. We have observed similar behaviour to that described `here `_.
+
+For validation, the _G_ statistic can be obtained from the ``*.cnn`` output using the following:
+
+.. code-block:: R
+
+    coverage <- read.table("work/mapper.cnvkit.library/out/mapper.cnvkit.library.cnn", sep="\t", header=1)
+    median_test <- function(x, X=TRUE, shift=0, prefix=c("chr", "")) {
+        auto <- x$log2[x$chromosome %in% sprintf("%s%d", prefix, 1:22)]
+        if (X) sex <- x$log2[x$chromosome == sprintf("%sX", prefix)] + shift
+        else sex <- x$log2[x$chromosome == sprintf("%sY", prefix)] + shift
+        m <- median(c(auto, sex), na.rm=TRUE)
+        contingency <- cbind(
+            c(sum(auto>m, na.rm=TRUE), sum(auto<m, na.rm=TRUE)),
+            c(sum(sex>m, na.rm=TRUE), sum(sex<m, na.rm=TRUE))
+        )
+        # G statistic with Yates continuity correction (one possible implementation via DescTools)
+        DescTools::GTest(contingency, correct="yates")$statistic
+    }

diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py
--- a/snappy_pipeline/workflows/panel_of_normals/__init__.py
+++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py

     def get_result_files(self) -> list[str]:
         result_files = []
-        result_files += list(self.get_output_files("create_panel").values())
-        result_files += list(self.get_log_file("create_panel").values())
-
-        result_files += list(self.get_output_files("target").values())
-        result_files += list(self.get_log_file("target").values())
-
+        actions = ["create_panel", "target", "sex", "metrics"]
         if self.libraryType == LibraryType.WES:
-            result_files += list(self.get_output_files("antitarget").values())
-            result_files += list(self.get_log_file("antitarget").values())
+            actions.append("antitarget")

-        result_files += list(self.get_output_files("sex").values())
-        result_files += list(self.get_log_file("sex").values())
+        for action in actions:
+            result_files += list(self.get_output_files(action).values())
+            result_files += list(self.get_log_file(action).values())

         return filter(lambda x: not x.endswith(".md5"), result_files)

-    def _get_input_files_access(self, wildcards: Wildcards) -> dict[str, str]:
-        assert self.create_access, "Access shouldn't be created, already available"
-        return {}
+    def _get_input_files_ignore(self, wildcards: Wildcards) -> dict[str, str]:
+        assert self.create_access, "No need for ignored chroms, access already exists"
+        return {"reference": self.w_config.static_data_config.reference.path}

-    def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+    def _get_args_ignore(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        assert self.create_access, "No need for ignored chroms, access already exists"
+        return {"ignore_chroms": sorted(list(self.ignored))}
+
+    def _get_input_files_access(self, wildcards: Wildcards) -> dict[str, str]:
         assert self.create_access, "Access shouldn't be created, already available"
-        return dict(input) | {
+        input_files = {
             "reference": self.w_config.static_data_config.reference.path,
-            "min-gap-size": self.cfg.access.min_gap_size,
             "exclude": self.cfg.access.exclude,
-            "ignore_chroms": list(self.ignored),
         }
+        if self.ignored:
+            input_files["ignore_chroms"] = self.base_out.format(**wildcards) + "ignored.bed"
+        return input_files
+
+    def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        assert self.create_access, "Access shouldn't be created, already available"
+        return {"min-gap-size": self.cfg.access.min_gap_size}

     def _get_input_files_autobin(self, wildcards: Wildcards) -> dict[str, str]:
         assert (
@@ -697,19 +768,18 @@ def _get_input_files_autobin(self, wildcards: Wildcards) -> dict[str, str]:
             input_files["access"] = self.base_out.format(**wildcards) + "access.bed"
         else:
             input_files["target"] = self.base_out.format(**wildcards) + "access.bed"
+        else:
+            input_files["target"] = self.cfg.path_access
         return input_files

     def _get_args_autobin(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
         assert (
             self.compute_avg_target_size
         ), "Trying to estimate average target size for non-WGS samples"
-        args = dict(input) | {"bp-per-bin": 50000}
-        if self.plain_access:
-            args["method"] = "wgs"
-        else:
-            args["method"] = "amplicon"
-        if "target" not in args:
-            args["target"] = self.cfg.path_access
+        args = {
+            "bp-per-bin": 50000,
+            "method": "wgs" if self.plain_access else "amplicon",
+        }
         return args

     def _get_input_files_target(self, wildcards: Wildcards) -> dict[str, str]:
@@ -719,7 +789,11 @@ def _get_input_files_target(self, wildcards: Wildcards) -> dict[str, str]:
         if self.create_access:
             input_files["interval"] = self.base_out.format(**wildcards) + "access.bed"
             if self.compute_avg_target_size:
-                input_files["avg-size"] = self.base_out.format(**wildcards) + "autobin.txt"
+                input_files["avg_size"] = self.base_out.format(**wildcards) + "autobin.txt"
+        else:
+            input_files["interval"] = self.path_baits
+        if self.w_config.static_data_config.get("features", None):
+            input_files["annotate"] = self.w_config.static_data_config.features.path
         return input_files

     def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
@@ -727,30 +801,31 @@ def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str,
             args = {
                 "avg-size": self.cfg.target.avg_size,
                 "split": self.cfg.target.split,
-                "interval": self.path_baits,
             }
         else:
             assert self.is_wgs, "Panel not implemented yet"
-            args = dict(input) | {"split": self.cfg.target.split}
-        if args.get("avg-size", None) is not None:
-            args["avg-size"] = self._read_autobin_output(args["avg-size"])
+            args = {"split": self.cfg.target.split}
+        if input.get("avg_size", None) is not None:
+            args["avg-size"] = self._read_autobin_output(input.get("avg_size"))
         elif self.cfg.target.avg_size is not None:
             args["avg-size"] = self.cfg.target.avg_size
         else:
             args["avg-size"] = 5000
         if self.w_config.static_data_config.get("features", None):
-            args["annotate"] = self.w_config.static_data_config.features.path
             args["short-names"] = self.cfg.target.short_names
         return args

     def _get_input_files_antitarget(self, wildcards: Wildcards) -> dict[str, str]:
         input_files = {"target": self.base_out.format(**wildcards) + "target.bed"}
         if self.create_access:
-            input_files["access"] = self.base_out.format(**wildcards) + "access.bed"
+            if not self.plain_access:
+                input_files["access"] = self.base_out.format(**wildcards) + "access.bed"
+        else:
+            input_files["access"] = self.cfg.path_access
         return input_files

     def _get_args_antitarget(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
-        args = dict(input) | {
+        args = {
             "avg-size": self.cfg.antitarget.avg_size,
             "min-size": self.cfg.antitarget.min_size,
         }
@@ -767,40 +842,43 @@ def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]:
             "intervals": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{region}.bed".format(
                 **wildcards
             ),
+            "reference": self.w_config.static_data_config.reference.path,
             "bam": bam,
             "bai": bam + ".bai",
         }

     def _get_args_coverage(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
-        return dict(input) | {
-            "reference": self.w_config.static_data_config.reference.path,
+        return {
             "min-mapq": self.cfg.coverage.min_mapq,
             "count": self.cfg.coverage.count,
         }

     def 
_get_input_files_create_panel(self, wildcards: Wildcards) -> dict[str, str]: - tpl = self.base_out_lib + "target.cnn" + tpl = self.base_out_lib + "targetcoverage.cnn" targets = [ tpl.format(mapper=wildcards["mapper"], library_name=x) for x in self.normal_libraries.keys() ] if self.libraryType == LibraryType.WES: - tpl = self.base_out_lib + "antitarget.cnn" + tpl = self.base_out_lib + "antitargetcoverage.cnn" antitargets = [ tpl.format(mapper=wildcards["mapper"], library_name=x) for x in self.normal_libraries.keys() ] else: antitargets = [] - return {"normals": targets + antitargets} + return { + "reference": self.w_config.static_data_config.reference.path, + "normals": targets + antitargets, + } def _get_args_create_panel(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { - "reference": self.w_config.static_data_config.reference.path, + edge = self.cfg.edge if self.cfg.edge is not None else self.is_wes + args = { "cluster": self.cfg.cluster, "no-gc": not self.cfg.gc, "no-rmask": not self.cfg.rmask, - "no-edge": not self.cfg.get("edge", self.is_wes), + "no-edge": not edge, "diploid-parx-genome": self.cfg.diploid_parx_genome, } if self.cfg.cluster: @@ -812,13 +890,13 @@ def _get_args_create_panel(self, wildcards: Wildcards, input: InputFiles) -> dic return args def _get_input_files_sex(self, wildcards: Wildcards) -> dict[str, str]: - tpl = self.base_out_lib + "target.cnn" + tpl = self.base_out_lib + "targetcoverage.cnn" coverages = [ tpl.format(mapper=wildcards["mapper"], library_name=x) for x in self.normal_libraries.keys() ] if self.is_wes: - tpl = self.base_out_lib + "antitarget.cnn" + tpl = self.base_out_lib + "antitargetcoverage.cnn" coverages += [ tpl.format(mapper=wildcards["mapper"], library_name=x) for x in self.normal_libraries.keys() @@ -826,7 +904,25 @@ def _get_input_files_sex(self, wildcards: Wildcards) -> dict[str, str]: return {"coverages": coverages} def _get_args_sex(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - return dict(input) | {"diploid-parx-genome": self.cfg.diploid_parx_genome} + return {"diploid-parx-genome": self.cfg.diploid_parx_genome} + + def _get_input_files_metrics(self, wildcards: Wildcards) -> dict[str, str]: + """Input for metrics report. 
Using coverage rather than ratios, and no segments"""
+        tpl = self.base_out_lib + "targetcoverage.cnn"
+        coverages = [
+            tpl.format(mapper=wildcards["mapper"], library_name=x)
+            for x in self.normal_libraries.keys()
+        ]
+        if self.is_wes:
+            tpl = self.base_out_lib + "antitargetcoverage.cnn"
+            coverages += [
+                tpl.format(mapper=wildcards["mapper"], library_name=x)
+                for x in self.normal_libraries.keys()
+            ]
+        return {"ratios": coverages}
+
+    def _get_args_metrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]:
+        return {"drop-low-coverage": self.cfg.drop_low_coverage}

     def _read_autobin_output(self, filename: str) -> int:
         nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)"
diff --git a/snappy_pipeline/workflows/panel_of_normals/model.py b/snappy_pipeline/workflows/panel_of_normals/model.py
index 59684d40d..78556154c 100644
--- a/snappy_pipeline/workflows/panel_of_normals/model.py
+++ b/snappy_pipeline/workflows/panel_of_normals/model.py
@@ -40,8 +40,19 @@ class CnvKit(CnvkitGeneric):
     path_access: str | None = None
     """Overrides access when not None"""

-    ignore_chroms: list[str] = []
-    """Additional contigs to ignore"""
+    ignore_chroms: Annotated[
+        list[str],
+        Field(
+            examples=[
+                "chrM",
+                "MT",
+                "*_random",
+                "chrUn_*",
+                "GL*",
+            ]
+        ),
+    ] = []
+    """Additional contigs to ignore, specific to the tool"""


 class GenomeName(enum.StrEnum):
@@ -101,7 +112,6 @@ class PanelOfNormals(SnappyStepModel, validators.ToolsMixin):
         "NC_007605",
         "hs37d5",
         "chrEBV",
-        "*_decoy",
         "HLA-*",
         "GL000220.*",
         "chrEBV",
diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
index 1e5a4d097..8120483ca 100644
--- a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
+++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
@@ -64,7 +64,7 @@
 - The ratios (extension ``.cnr``) contains the ratio of expected coverage between tumor and the reference in each bin,
   or logarithmic scale. This can be used to examine the data or experiment with different segmentation algorithms.
-- The segments (extension ``.segments.cns``) contains the output of the segmentation. A single log2 ratio value is
+- The segments (extension ``.cns``) contains the output of the segmentation. A single log2 ratio value is
   attributed to each segment. The segmentation covers most of the part of the genome accessible to mapping.
 - The calls (extension ``calls.cns``) contains only the non-diploid segments, called after thresholding.
@@ -95,17 +95,25 @@
 Overview
 --------

-``cnvkit`` was designed to call CNV on whole exome data. It has the concept of _targets_ (the regions enriched by the exome kit),
+``cnvkit`` is a set of tools originally designed to call somatic copy number alterations from exome data.
+Its design is modular, which enables its use for whole genome and amplicon data.
+
+Because it was designed primarily for whole exome data, it has the concept of _targets_ (the regions enriched by the exome kit),
 and the _antitargets_ (those regions accessible for CNV calling, but outside of enrichment).
 The coverage of _targets_ and _antitargets_ are expected to be very different,
 but there is still information to be gained in the _antitarget_ regions,
 albeit at a much lower resolution than for _target_ regions.

-``cnvkit`` was later used with some success on whole genome data.
-WGS data was defined as _target_ regions covering the whole genome, with empty _antitarget_ regions.
+For WGS data, the _target_ regions generally cover the whole accessible genome, with empty _antitarget_ regions.

 .. tip:: For Agilent kits, the ``cnvkit`` authors recommend to use baits as targets. Baits are in the ``*_Covered.bed`` file.

+.. note::
+
+    ``cnvkit`` provides a tool to encapsulate common practice workflows (``batch``), depending on the type of data, and on the availability of optional inputs.
+    The actual workflow to generate the reference is slightly different between exome and whole genome data.
+    The current implementation recapitulates the common practice, while still dispatching computations on multiple cluster nodes.
+
 ---------------------------------
 Regions accessible to CNV calling
 ---------------------------------
@@ -122,6 +130,16 @@
 3. Otherwise, the pipeline creates the accessible regions file from the reference genome sequence, by removing
    only parts masked with ``N``.

+.. note::
+    Some short contigs (the mitochondrion, unplaced & unlocalized contigs, decoys, viral sequences, ...) are too short
+    for a reliable estimation of copy number changes.
+    ``snappy`` provides a generic way to ignore those contigs during processing (post-mapping), through the ``ignore_chroms`` configuration option.
+    This parameter is generally set at the _step_ level, typically ignoring decoys, HLA variants and viral sequences.
+    This is suitable for small variant calling (calling variants for genes on unplaced/unlocalized contigs is fine),
+    but not for CNA calling.
+    Therefore, ``ignore_chroms`` can also be set at the _tool_ level in the configuration. In that case, contigs from both levels will be ignored.
+    External panels (``cohort`` or ``file``) may already have been generated on a restricted set of contigs.
+
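+The sketch below illustrates the intended pattern semantics, assuming Unix shell-style globs
+(as provided by Python's standard ``fnmatch`` module; the ``is_ignored`` helper is hypothetical
+and for illustration only):
+
+.. code-block:: python
+
+    from fnmatch import fnmatch
+
+    def is_ignored(contig: str, patterns: list[str]) -> bool:
+        # A contig is skipped when its name matches any of the glob patterns
+        return any(fnmatch(contig, pattern) for pattern in patterns)
+
+    assert is_ignored("chrUn_KI270302v1", ["chrUn_*", "*_random"])
+    assert not is_ignored("chr1", ["chrUn_*", "*_random"])
+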
-----------------------
The reference coverage
-----------------------
@@ -144,6 +162,11 @@
 4. ``flat``: a _flat_ reference is computed, which discards locus-specific effects. It should only be used
    when there are no normals nor suitable panel, or for benchmarking purposes.

+.. warning::
+
+    When selecting ``cohort`` or ``file`` panels, remember that those might have been generated on a subset of contigs,
+    if ``ignore_chroms`` was used during the panel creation.
+
 ------------------
 WGS-specific notes
 ------------------
@@ -234,6 +257,7 @@ class CnvKitStepPart(SomaticCnvCallingStepPart):

     #: Class available actions
     actions = (
+        "ignore",
         "access",
         "autobin",
         "target",
@@ -401,6 +425,8 @@ def get_output_files(self, action: str):
         output_files = {}
         match action:
+            case "ignore":
+                output_files = {"ignore_chroms": self.base_out + "ignored.bed"}
             case "access":
                 output_files = {"access": self.base_out + "access.bed"}
             case "autobin":
@@ -413,7 +439,9 @@
             case "antitarget":
                 output_files = {"antitarget": self.base_out + "antitarget.bed"}
             case "coverage":
-                output_files = {"coverage": self.base_out_lib + "{region,(target|antitarget)}.cnn"}
+                output_files = {
+                    "coverage": self.base_out_lib + "{region,(target|antitarget)}coverage.cnn"
+                }
             case "reference":
                 if self.paired:
                     output_files = {"reference": self.base_out_lib + "reference.cnn"}
@@ -423,17 +451,17 @@
                 output_files = {"ratios": self.base_out_lib + "cnr"}
             case "segment":
                 output_files = {
-                    "segments": self.base_out_lib + "segments.cns",
+                    "segments": self.base_out_lib + "cns",
                     "dataframe": self.base_out_lib + "rds",
                 }
             case "call":
-                output_files = {"calls": self.base_out_lib + "calls.cns"}
+                output_files = {"calls": self.base_out_lib + "call.cns"}
             case "bintest":
                 output_files = {"tests": self.base_out_lib + "bintest.cns"}
             case "metrics":
                 output_files = {"report": base_report_lib + "metrics.tsv"}
             case "segmetrics":
-                output_files = {"report": base_report_lib + "segmetrics.tsv"}
+                output_files = {"report": base_report_lib + "segmetrics.cns"}
             case "genemetrics":
                 output_files = {"report": base_report_lib + "genemetrics.tsv"}
             case "scatter":
@@ -454,7 +482,7 @@ def get_log_file(self, action):
         base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit."
         base_log_lib = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.{library_name}."

-        if action in ("access", "antitarget"):
+        if action in ("ignore", "access", "antitarget"):
             tpl = base_log + action
         elif action in (
             "autobin",
@@ -512,7 +540,7 @@ def get_result_files(self, library_name: str, mapper: str) -> list[str]:

         result_files = []

-        for suffix in ("cnr", "segments.cns", "calls.cns", "bintest.cns"):
+        for suffix in ("cnr", "cns", "call.cns", "bintest.cns"):
             result_files.append(base_out_lib + suffix)

         actions_to_log = ("fix", "segment", "call", "bintest")
@@ -525,9 +553,9 @@
         ]

         # Logs of metrics not linked
-        for report in ("metrics", "segmetrics", "genemetrics"):
+        for report, ext in (("metrics", "tsv"), ("segmetrics", "cns"), ("genemetrics", "tsv")):
             if self.cfg.get(report).get("enabled"):
-                result_files.append(base_report_lib + report + ".tsv")
+                result_files.append(base_report_lib + report + "." 
+ ext) # Logs of plots not links # TODO: Mouse date: only chromosomes 1 to 19 @@ -546,11 +574,27 @@ def get_result_files(self, library_name: str, mapper: str) -> list[str]: result_files += [x + ".md5" for x in result_files] return result_files + # ----- Ignore (generates the bed file of ignored contigs to exclude in access) --------------- + + def _get_input_files_ignore(self, wildcards: Wildcards) -> dict[str, str]: + assert self.create_access, "No need for ignored chroms, access already exists" + return {"reference": self.w_config.static_data_config.reference.path} + + def _get_args_ignore(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: + assert self.create_access, "No need for ignored chroms, access already exists" + return {"ignore_chroms": list(self.ignored)} + # ----- Access -------------------------------------------------------------------------------- def _get_input_files_access(self, wildcards: Wildcards) -> dict[str, str]: - assert self.create_access, "Should not build access, already available" - return {} + assert self.create_access, "Access shouldn't be created, already available" + input_files = { + "reference": self.w_config.static_data_config.reference.path, + "exclude": self.cfg.access.exclude, + } + if self.ignored: + input_files["ignore_chroms"] = self.base_out.format(**wildcards) + "ignored.bed" + return input_files def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: """ @@ -563,13 +607,8 @@ def _get_args_access(self, wildcards: Wildcards, input: InputFiles) -> dict[str, This happens when the average target size is set in the config in WGS, or for WES. """ - assert self.create_access, "Should not build access, already available" - return dict(input) | { - "reference": self.w_config.static_data_config.reference.path, - "min-gap-size": self.cfg.access.min_gap_size, - "exclude": self.cfg.access.exclude, - "ignore_chroms": list(self.ignored), - } + assert self.create_access, "Access shouldn't be created, already available" + return {"min-gap-size": self.cfg.access.min_gap_size} # ----- Autobin (never used directly, only to compute target size in WGS settings) ------------ @@ -589,19 +628,19 @@ def _get_input_files_autobin(self, wildcards: Wildcards) -> dict[str, str]: input_files["access"] = self.base_out.format(**wildcards) + "access.bed" else: input_files["target"] = self.base_out.format(**wildcards) + "access.bed" + else: + input_files["target"] = self.cfg.path_access return input_files def _get_args_autobin(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: assert ( self.compute_avg_target_size ), "Trying to estimate average target size for non-WGS samples" - args = dict(input) | {"bp-per-bin": 50000} + args = {"bp-per-bin": 50000} if self.plain_access: args["method"] = "wgs" else: args["method"] = "amplicon" - if "target" not in args: - args["target"] = self.cfg.path_access return args # ----- Target -------------------------------------------------------------------------------- @@ -623,7 +662,11 @@ def _get_input_files_target(self, wildcards: Wildcards) -> dict[str, str]: if self.create_access: input_files["interval"] = self.base_out.format(**wildcards) + "access.bed" if self.compute_avg_target_size: - input_files["avg-size"] = self.base_out_lib.format(**wildcards) + "autobin.txt" + input_files["avg_size"] = self.base_out_lib.format(**wildcards) + "autobin.txt" + else: + input_files["interval"] = self.path_baits + if self.w_config.static_data_config.get("features", None): + input_files["annotate"] = 
self.w_config.static_data_config.features.path return input_files def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: @@ -632,19 +675,17 @@ def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, args = { "avg-size": self.cfg.target.avg_size, "split": self.cfg.target.split, - "interval": self.path_baits, } else: assert self.is_wgs, "Panel not implemented yet" - args = dict(input) | {"split": self.cfg.target.split} - if args.get("avg-size", None) is not None: - args["avg-size"] = self._read_autobin_output(args["avg-size"]) + args = {"split": self.cfg.target.split} + if input.get("avg_size", None) is not None: + args["avg-size"] = self._read_autobin_output(input.get("avg_size")) elif self.cfg.target.avg_size is not None: args["avg-size"] = self.cfg.target.avg_size else: args["avg-size"] = 5000 if self.w_config.static_data_config.get("features", None): - args["annotate"] = self.w_config.static_data_config.features.path args["short-names"] = self.cfg.target.short_names return args @@ -653,16 +694,17 @@ def _get_args_target(self, wildcards: Wildcards, input: InputFiles) -> dict[str, def _get_input_files_antitarget(self, wildcards: Wildcards) -> dict[str, str]: input_files = {"target": self.base_out.format(**wildcards) + "target.bed"} if self.create_access: - input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + if not self.plain_access: + input_files["access"] = self.base_out.format(**wildcards) + "access.bed" + else: + input_files["access"] = self.cfg.path_access return input_files def _get_args_antitarget(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { + args = { "avg-size": self.cfg.antitarget.avg_size, "min-size": self.cfg.antitarget.min_size, } - if "access" not in args: - args["access"] = self.cfg.path_access return args # ----- Coverage ------------------------------------------------------------------------------ @@ -682,6 +724,7 @@ def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: ngs_mapping = self.parent.sub_workflows["ngs_mapping"] base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) input_files = { + "reference": self.w_config.static_data_config.reference.path, "bam": ngs_mapping(base_path + ".bam"), "bai": ngs_mapping(base_path + ".bam.bai"), } @@ -706,32 +749,30 @@ def _get_input_files_coverage(self, wildcards: Wildcards) -> dict[str, str]: **wildcards ) input_files["intervals"] = panel_of_normals(base_path) + else: + intervals = self.cfg.panel_of_normals.get("path_{region}".format(**wildcards), "") + assert intervals != "", "Missing path to {region}".format(**wildcards) + input_files["intervals"] = intervals return input_files def _get_args_coverage(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { - "reference": self.w_config.static_data_config.reference.path, + return { "min-mapq": self.cfg.coverage.min_mapq, "count": self.cfg.coverage.count, } - if "intervals" not in args: - intervals = self.cfg.panel_of_normals.get("path_{region}".format(**wildcards), "") - assert intervals != "", "Missing path to {region}".format(**wildcards) - args["intervals"] = intervals - return args # ----- Reference (flat or pairwise) ---------------------------------------------------------- def _get_input_files_reference(self, wildcards: Wildcards) -> dict[str, str]: """Builds reference from the paired normal, or flat prior in absence of normal""" assert self.build_ref, 
"Should not build reference" - input_files = {} + input_files = {"reference": self.w_config.static_data_config.reference.path} if self.paired: - input_files["normals"] = [self.base_out_lib.format(**wildcards) + "target.cnn"] + input_files["normals"] = [self.base_out_lib.format(**wildcards) + "targetcoverage.cnn"] if self.is_wes: input_files["normals"].append( - self.base_out_lib.format(**wildcards) + "antitarget.cnn" + self.base_out_lib.format(**wildcards) + "antitargetcoverage.cnn" ) elif self.pon_source == PanelOfNormalsOrigin.FLAT: input_files["target"] = self.base_out.format(**wildcards) + "target.bed" @@ -741,12 +782,12 @@ def _get_input_files_reference(self, wildcards: Wildcards) -> dict[str, str]: def _get_args_reference(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: assert self.build_ref, "Should not build reference" - args = dict(input) | { - "reference": self.w_config.static_data_config.reference.path, + edge = self.cfg.edge if self.cfg.edge is not None else self.is_wes + args = { "cluster": self.cfg.cluster, "no-gc": not self.cfg.gc, "no-rmask": not self.cfg.rmask, - "no-edge": not self.cfg.get("edge", self.is_wes), + "no-edge": not edge, "diploid-parx-genome": self.cfg.diploid_parx_genome, } if self.cfg.cluster: @@ -766,9 +807,11 @@ def _get_args_reference(self, wildcards: Wildcards, input: InputFiles) -> dict[s def _get_input_files_fix(self, wildcards: Wildcards) -> dict[str, str]: # Coverage on targets & optionally on antitargets - input_files = {"target": self.base_out_lib.format(**wildcards) + "target.cnn"} + input_files = {"target": self.base_out_lib.format(**wildcards) + "targetcoverage.cnn"} if self.is_wes: - input_files["antitarget"] = self.base_out_lib.format(**wildcards) + "antitarget.cnn" + input_files["antitarget"] = ( + self.base_out_lib.format(**wildcards) + "antitargetcoverage.cnn" + ) if self.paired: tpl = "{mapper}.cnvkit.{normal_library}".format( mapper=wildcards["mapper"], @@ -783,31 +826,31 @@ def _get_input_files_fix(self, wildcards: Wildcards) -> dict[str, str]: **wildcards ) input_files["reference"] = panel_of_normals(base_path) + else: + input_files["reference"] = self.cfg.panel_of_normals.path_panel_of_normals return input_files def _get_args_fix(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { + edge = self.cfg.edge if self.cfg.edge is not None else self.is_wes + return { "cluster": self.cfg.cluster, "no-gc": not self.cfg.gc, "no-rmask": not self.cfg.rmask, - "no-edge": not self.cfg.get("edge", self.is_wes), + "no-edge": not edge, "diploid-parx-genome": self.cfg.diploid_parx_genome, + "sample-id": wildcards.library_name, } - args["sample-id"] = wildcards.library_name - if "reference" not in args: - args["reference"] = self.cfg.panel_of_normals.path_panel_of_normals - return args # ----- Variant-related convenience functions ------------------------------------------------- - def _variants_from_cohort_input(self) -> str: + def _variants_from_cohort_input(self, wildcards: Wildcards) -> str: variants = self.parent.sub_workflows["somatic_variant_calling_cnvkit"] tpl = f"{{mapper}}.{self.cfg.somatic_variant_calling.tool}.{{library_name}}" base_path = os.path.join("output", tpl, "out", tpl + ".vcf.gz") - return variants(base_path) + return variants(base_path).format(**wildcards) - def _variants_args(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { + def _variants_args(self, wildcards: Wildcards) -> dict[str, str]: + args = { "min-variant-depth": 
self.cfg.somatic_variant_calling.min_variant_depth, "sample-id": wildcards.library_name, "normal-id": self.parent.matched_normal[wildcards.library_name], @@ -822,13 +865,18 @@ def _get_input_files_segment(self, wildcards: Wildcards) -> dict[str, str]: # Coverage input_files = {"ratios": self.base_out_lib.format(**wildcards) + "cnr"} # Segmentation using SNVs from cohort - if self.variants_from_cohort: - input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) + if self.cfg.somatic_variant_calling.enabled: + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input(wildcards) + else: + input_files["variants"] = ( + self.cfg.somatic_variant_calling.path_somatic_variant_calling + ) return input_files def _get_args_segment(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: # Segmentation parameters - args = dict(input) | { + args = { "method": self.cfg.segment.method, "threshold": self.cfg.segment.threshold, "drop-outliers": self.cfg.segment.drop_outliers, @@ -837,19 +885,26 @@ def _get_args_segment(self, wildcards: Wildcards, input: InputFiles) -> dict[str if self.cfg.segment.method == CnvkitSegmentationMethod.CBS: args["smooth-cbs"] = self.cfg.segment.smooth_cbs if self.cfg.somatic_variant_calling.enabled: - args |= self._variants_args(wildcards, input) - if "variants" not in args: - args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling + args |= self._variants_args(wildcards) return args # ----- Call ---------------------------------------------------------------------------------- def _get_input_files_call(self, wildcards: Wildcards) -> dict[str, str]: # Segmentation - input_files = {"segments": self.base_out_lib.format(**wildcards) + "segments.cns"} + if self.cfg.segmetrics.enabled: + tpl = "{mapper}.cnvkit.{library_name}".format(**wildcards) + input_files = {"segments": os.path.join("work", tpl, "report", tpl) + ".segmetrics.cns"} + else: + input_files = {"segments": self.base_out_lib.format(**wildcards) + "cns"} # Segmentation using SNVs from cohort - if self.variants_from_cohort: - input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) + if self.cfg.somatic_variant_calling.enabled: + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input(wildcards) + else: + input_files["variants"] = ( + self.cfg.somatic_variant_calling.path_somatic_variant_calling + ) # Purity from the tool if ( self.cfg.somatic_purity_ploidy_estimate.enabled @@ -863,7 +918,7 @@ def _get_input_files_call(self, wildcards: Wildcards) -> dict[str, str]: def _get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: # Call parameters - args = dict(input) | { + args = { "method": self.cfg.call.method, "thresholds": self.cfg.call.thresholds, "drop-low-coverage": self.cfg.drop_low_coverage, @@ -875,10 +930,10 @@ def _get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, s else: if self.cfg.call.center is not None: args["center"] = self.cfg.call.center + if self.cfg.call.filter is None: + args["filter"] = ["ci"] if self.cfg.somatic_variant_calling.enabled: - args |= self._variants_args(wildcards, input) - if "variants" not in args: - args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling + args |= self._variants_args(wildcards) # Sample sex if known, otherwise guessed by the tool sample_sex = self._get_sample_sex(wildcards.library_name) if sample_sex is not None: @@ -887,8 +942,8 @@ def 
_get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, s args["male-reference"] = True # If requested, purity from samplesheet or from default if self.cfg.somatic_purity_ploidy_estimate.enabled: - if args.get("purity_file", None) is not None: - (purity, ploidy) = self._read_purity_ploidy_output(args["purity_file"]) + if input.get("purity_file", None) is not None: + (purity, ploidy) = self._read_purity_ploidy_output(input.get("purity_file")) elif self.cfg.somatic_purity_ploidy_estimate.source == PurityOrigin.SAMPLESHEET: purity = self.tumors[wildcards.library_name].purity ploidy = self.tumors[wildcards.library_name].ploidy @@ -904,11 +959,11 @@ def _get_args_call(self, wildcards: Wildcards, input: InputFiles) -> dict[str, s def _get_input_files_bintest(self, wildcards: Wildcards) -> dict[str, str]: return { "ratios": self.base_out_lib.format(**wildcards) + "cnr", - "segments": self.base_out_lib.format(**wildcards) + "segments.cns", + "segments": self.base_out_lib.format(**wildcards) + "cns", } def _get_args_bintest(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - return dict(input) | { + return { "alpha": self.cfg.bintest.alpha, "target": self.cfg.bintest.target, } @@ -918,14 +973,19 @@ def _get_args_bintest(self, wildcards: Wildcards, input: InputFiles) -> dict[str def _get_input_files_scatter(self, wildcards: Wildcards) -> dict[str, str]: input_files = { "ratios": self.base_out_lib.format(**wildcards) + "cnr", - "segments": self.base_out_lib.format(**wildcards) + "segments.cns", + "segments": self.base_out_lib.format(**wildcards) + "cns", } - if self.variants_from_cohort: - input_files["variants"] = self._variants_from_cohort_input().format(**wildcards) + if self.cfg.somatic_variant_calling.enabled: + if self.variants_from_cohort: + input_files["variants"] = self._variants_from_cohort_input(wildcards) + else: + input_files["variants"] = ( + self.cfg.somatic_variant_calling.path_somatic_variant_calling + ) return input_files def _get_args_scatter(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { + args = { "antitarget-marker": self.cfg.scatter.antitarget_marker, "by-bin": self.cfg.scatter.by_bin, "segment-color": self.cfg.scatter.segment_color, @@ -940,9 +1000,7 @@ def _get_args_scatter(self, wildcards: Wildcards, input: InputFiles) -> dict[str if wildcards["contig_name"] != "all": args["chromosome"] = wildcards["contig_name"] if self.cfg.somatic_variant_calling.enabled: - args |= self._variants_args(wildcards, input) - if "variants" not in args: - args["variants"] = self.cfg.somatic_variant_calling.path_somatic_variant_calling + args |= self._variants_args(wildcards) args["title"] = f"{wildcards['library_name']} - {wildcards['contig_name']}" return args @@ -950,21 +1008,21 @@ def _get_args_scatter(self, wildcards: Wildcards, input: InputFiles) -> dict[str def _get_input_files_metrics(self, wildcards: Wildcards) -> dict[str, str]: return { - "ratios": self.base_out_lib.format(**wildcards) + "cnr", - "segments": self.base_out_lib.format(**wildcards) + "segments.cns", + "ratios": [self.base_out_lib.format(**wildcards) + "cnr"], + "segments": [self.base_out_lib.format(**wildcards) + "cns"], } def _get_args_metrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - return dict(input) | {"drop-low-coverage": self.cfg.drop_low_coverage} + return {"drop-low-coverage": self.cfg.drop_low_coverage} def _get_input_files_segmetrics(self, wildcards: Wildcards) -> dict[str, str]: return { "ratios": 
self.base_out_lib.format(**wildcards) + "cnr", - "segments": self.base_out_lib.format(**wildcards) + "segments.cns", + "segments": self.base_out_lib.format(**wildcards) + "cns", } def _get_args_segmetrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - return dict(input) | { + return { "drop-low-coverage": self.cfg.drop_low_coverage, "alpha": self.cfg.segmetrics.alpha, "bootstrap": self.cfg.segmetrics.bootstrap, @@ -975,11 +1033,11 @@ def _get_args_segmetrics(self, wildcards: Wildcards, input: InputFiles) -> dict[ def _get_input_files_genemetrics(self, wildcards: Wildcards) -> dict[str, str]: return { "ratios": self.base_out_lib.format(**wildcards) + "cnr", - "segments": self.base_out_lib.format(**wildcards) + "segments.cns", + "segments": self.base_out_lib.format(**wildcards) + "cns", } def _get_args_genemetrics(self, wildcards: Wildcards, input: InputFiles) -> dict[str, str]: - args = dict(input) | { + args = { "drop-low-coverage": self.cfg.drop_low_coverage, "male-reference": self.cfg.male_reference, "threshold": self.cfg.genemetrics.threshold, @@ -1001,7 +1059,7 @@ def _get_args_genemetrics(self, wildcards: Wildcards, input: InputFiles) -> dict def _read_autobin_output(self, filename: str) -> int: nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)" pattern = re.compile("^Target:[ \t]+" + nb + "[ \t]+" + nb + "$") - with open(filename) as f: + with open(filename, "rt") as f: for line in f: m = pattern.match(line) if m: @@ -1012,7 +1070,7 @@ def _read_purity_ploidy_output(self, filename: str) -> tuple[float, float]: # TODO: Tool-dependent parsing of purity/ploidy file nb = r"([+-]?(\d+(\.\d*)?|\.\d+)([EeDd][+-]?[0-9]+)?)" pattern = re.compile("^Purity/ploidy:[ \t]+" + nb + "[ \t]+" + nb + "$") - with open(filename) as f: + with open(filename, "rt") as f: for line in f: m = pattern.match(line) if m: diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules index cfaf75206..a6a343e24 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules +++ b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules @@ -1,4 +1,25 @@ +rule somatic_cnv_calling_cnvkit_ignore: + input: + unpack(wf.get_input_files("cnvkit", "ignore")), + output: + **wf.get_output_files("cnvkit", "ignore"), + params: + **{"args": wf.get_args("cnvkit", "ignore")}, + log: + **wf.get_log_file("cnvkit", "ignore"), + threads: wf.get_resource("cnvkit", "ignore", "threads") + resources: + time=wf.get_resource("cnvkit", "ignore", "time"), + memory=wf.get_resource("cnvkit", "ignore", "memory"), + partition=wf.get_resource("cnvkit", "ignore", "partition"), + tmpdir=wf.get_resource("cnvkit", "ignore", "tmpdir"), + wrapper: + wf.wrapper_path("cnvkit/ignore") + + rule somatic_cnv_calling_cnvkit_access: + input: + unpack(wf.get_input_files("cnvkit", "access")), output: **wf.get_output_files("cnvkit", "access"), params: diff --git a/snappy_wrappers/tools/chromosome_lengths.py b/snappy_wrappers/tools/chromosome_lengths.py new file mode 100644 index 000000000..fd15b0665 --- /dev/null +++ b/snappy_wrappers/tools/chromosome_lengths.py @@ -0,0 +1,850 @@ +"""Human & mouse chromosome lengths for different primary assemblies + +Might be useful to check genome validity, chromosome nameing conventions, ... 
+""" + +CHROMOSOME_LENGTHS = { + "GRCh37": [ + { + "UCSC": "chr1", + "ENSEMBL": "1", + "REFSEQ": "NC_000001.10", + "GENBANK": "CM000663 .1", + "Length": 249250621, + }, + { + "UCSC": "chr2", + "ENSEMBL": "2", + "REFSEQ": "NC_000002.11", + "GENBANK": "CM000664 .1", + "Length": 243199373, + }, + { + "UCSC": "chr3", + "ENSEMBL": "3", + "REFSEQ": "NC_000003.11", + "GENBANK": "CM000665 .1", + "Length": 198022430, + }, + { + "UCSC": "chr4", + "ENSEMBL": "4", + "REFSEQ": "NC_000004.11", + "GENBANK": "CM000666 .1", + "Length": 191154276, + }, + { + "UCSC": "chr5", + "ENSEMBL": "5", + "REFSEQ": "NC_000005.9", + "GENBANK": "CM000667 .1", + "Length": 180915260, + }, + { + "UCSC": "chr6", + "ENSEMBL": "6", + "REFSEQ": "NC_000006.11", + "GENBANK": "CM000668 .1", + "Length": 171115067, + }, + { + "UCSC": "chr7", + "ENSEMBL": "7", + "REFSEQ": "NC_000007.13", + "GENBANK": "CM000669 .1", + "Length": 159138663, + }, + { + "UCSC": "chr8", + "ENSEMBL": "8", + "REFSEQ": "NC_000008.10", + "GENBANK": "CM000670 .1", + "Length": 146364022, + }, + { + "UCSC": "chr9", + "ENSEMBL": "9", + "REFSEQ": "NC_000009.11", + "GENBANK": "CM000671 .1", + "Length": 141213431, + }, + { + "UCSC": "chr10", + "ENSEMBL": "10", + "REFSEQ": "NC_000010.10", + "GENBANK": "CM000672 .1", + "Length": 135534747, + }, + { + "UCSC": "chr11", + "ENSEMBL": "11", + "REFSEQ": "NC_000011.9", + "GENBANK": "CM000673 .1", + "Length": 135006516, + }, + { + "UCSC": "chr12", + "ENSEMBL": "12", + "REFSEQ": "NC_000012.11", + "GENBANK": "CM000674 .1", + "Length": 133851895, + }, + { + "UCSC": "chr13", + "ENSEMBL": "13", + "REFSEQ": "NC_000013.10", + "GENBANK": "CM000675 .1", + "Length": 115169878, + }, + { + "UCSC": "chr14", + "ENSEMBL": "14", + "REFSEQ": "NC_000014.8", + "GENBANK": "CM000676 .1", + "Length": 107349540, + }, + { + "UCSC": "chr15", + "ENSEMBL": "15", + "REFSEQ": "NC_000015.9", + "GENBANK": "CM000677 .1", + "Length": 102531392, + }, + { + "UCSC": "chr16", + "ENSEMBL": "16", + "REFSEQ": "NC_000016.9", + "GENBANK": "CM000678 .1", + "Length": 90354753, + }, + { + "UCSC": "chr17", + "ENSEMBL": "17", + "REFSEQ": "NC_000017.10", + "GENBANK": "CM000679 .1", + "Length": 81195210, + }, + { + "UCSC": "chr18", + "ENSEMBL": "18", + "REFSEQ": "NC_000018.9", + "GENBANK": "CM000680 .1", + "Length": 78077248, + }, + { + "UCSC": "chr19", + "ENSEMBL": "19", + "REFSEQ": "NC_000019.9", + "GENBANK": "CM000681 .1", + "Length": 59128983, + }, + { + "UCSC": "chr20", + "ENSEMBL": "20", + "REFSEQ": "NC_000020.10", + "GENBANK": "CM000682 .1", + "Length": 63025520, + }, + { + "UCSC": "chr21", + "ENSEMBL": "21", + "REFSEQ": "NC_000021.8", + "GENBANK": "CM000683 .1", + "Length": 48129895, + }, + { + "UCSC": "chr22", + "ENSEMBL": "22", + "REFSEQ": "NC_000022.10", + "GENBANK": "CM000684 .1", + "Length": 51304566, + }, + { + "UCSC": "chrX", + "ENSEMBL": "X", + "REFSEQ": "NC_000023.10", + "GENBANK": "CM000685 .1", + "Length": 155270560, + }, + { + "UCSC": "chrY", + "ENSEMBL": "Y", + "REFSEQ": "NC_000024.9", + "GENBANK": "CM000686 .1", + "Length": 59373566, + }, + { + "UCSC": "chrM", + "ENSEMBL": "MT", + "REFSEQ": "NC_012920.1", + "GENBANK": "J01415.2", + "Length": 16569, + }, + ], + "GRCh38": [ + { + "UCSC": "chr1", + "ENSEMBL": "1", + "REFSEQ": "NC_000001.11", + "GENBANK": "CM000663 .2", + "Length": 248956422, + }, + { + "UCSC": "chr2", + "ENSEMBL": "2", + "REFSEQ": "NC_000002.12", + "GENBANK": "CM000664 .2", + "Length": 242193529, + }, + { + "UCSC": "chr3", + "ENSEMBL": "3", + "REFSEQ": "NC_000003.12", + "GENBANK": "CM000665 .2", + "Length": 198295559, + }, + { + "UCSC": 
"chr4", + "ENSEMBL": "4", + "REFSEQ": "NC_000004.12", + "GENBANK": "CM000666 .2", + "Length": 190214555, + }, + { + "UCSC": "chr5", + "ENSEMBL": "5", + "REFSEQ": "NC_000005.10", + "GENBANK": "CM000667 .2", + "Length": 181538259, + }, + { + "UCSC": "chr6", + "ENSEMBL": "6", + "REFSEQ": "NC_000006.12", + "GENBANK": "CM000668 .2", + "Length": 170805979, + }, + { + "UCSC": "chr7", + "ENSEMBL": "7", + "REFSEQ": "NC_000007.14", + "GENBANK": "CM000669 .2", + "Length": 159345973, + }, + { + "UCSC": "chr8", + "ENSEMBL": "8", + "REFSEQ": "NC_000008.11", + "GENBANK": "CM000670 .2", + "Length": 145138636, + }, + { + "UCSC": "chr9", + "ENSEMBL": "9", + "REFSEQ": "NC_000009.12", + "GENBANK": "CM000671 .2", + "Length": 138394717, + }, + { + "UCSC": "chr10", + "ENSEMBL": "10", + "REFSEQ": "NC_000010.11", + "GENBANK": "CM000672 .2", + "Length": 133797422, + }, + { + "UCSC": "chr11", + "ENSEMBL": "11", + "REFSEQ": "NC_000011.10", + "GENBANK": "CM000673 .2", + "Length": 135086622, + }, + { + "UCSC": "chr12", + "ENSEMBL": "12", + "REFSEQ": "NC_000012.12", + "GENBANK": "CM000674 .2", + "Length": 133275309, + }, + { + "UCSC": "chr13", + "ENSEMBL": "13", + "REFSEQ": "NC_000013.11", + "GENBANK": "CM000675 .2", + "Length": 114364328, + }, + { + "UCSC": "chr14", + "ENSEMBL": "14", + "REFSEQ": "NC_000014.9", + "GENBANK": "CM000676 .2", + "Length": 107043718, + }, + { + "UCSC": "chr15", + "ENSEMBL": "15", + "REFSEQ": "NC_000015.10", + "GENBANK": "CM000677 .2", + "Length": 101991189, + }, + { + "UCSC": "chr16", + "ENSEMBL": "16", + "REFSEQ": "NC_000016.10", + "GENBANK": "CM000678 .2", + "Length": 90338345, + }, + { + "UCSC": "chr17", + "ENSEMBL": "17", + "REFSEQ": "NC_000017.11", + "GENBANK": "CM000679 .2", + "Length": 83257441, + }, + { + "UCSC": "chr18", + "ENSEMBL": "18", + "REFSEQ": "NC_000018.10", + "GENBANK": "CM000680 .2", + "Length": 80373285, + }, + { + "UCSC": "chr19", + "ENSEMBL": "19", + "REFSEQ": "NC_000019.10", + "GENBANK": "CM000681 .2", + "Length": 58617616, + }, + { + "UCSC": "chr20", + "ENSEMBL": "20", + "REFSEQ": "NC_000020.11", + "GENBANK": "CM000682 .2", + "Length": 64444167, + }, + { + "UCSC": "chr21", + "ENSEMBL": "21", + "REFSEQ": "NC_000021.9", + "GENBANK": "CM000683 .2", + "Length": 46709983, + }, + { + "UCSC": "chr22", + "ENSEMBL": "22", + "REFSEQ": "NC_000022.11", + "GENBANK": "CM000684 .2", + "Length": 50818468, + }, + { + "UCSC": "chrX", + "ENSEMBL": "X", + "REFSEQ": "NC_000023.11", + "GENBANK": "CM000685 .2", + "Length": 156040895, + }, + { + "UCSC": "chrY", + "ENSEMBL": "Y", + "REFSEQ": "NC_000024.10", + "GENBANK": "CM000686 .2", + "Length": 57227415, + }, + { + "UCSC": "chrM", + "ENSEMBL": "MT", + "REFSEQ": "NC_012920.1", + "GENBANK": "J01415.2", + "Length": 16569, + }, + ], + "GRCm38": [ + { + "UCSC": "chr1", + "ENSEMBL": "1", + "REFSEQ": "NC_000067.6", + "GENBANK": "CM000994 .2", + "Length": 195471971, + }, + { + "UCSC": "chr2", + "ENSEMBL": "2", + "REFSEQ": "NC_000068.7", + "GENBANK": "CM000995 .2", + "Length": 182113224, + }, + { + "UCSC": "chr3", + "ENSEMBL": "3", + "REFSEQ": "NC_000069.6", + "GENBANK": "CM000996 .2", + "Length": 160039680, + }, + { + "UCSC": "chr4", + "ENSEMBL": "4", + "REFSEQ": "NC_000070.6", + "GENBANK": "CM000997 .2", + "Length": 156508116, + }, + { + "UCSC": "chr5", + "ENSEMBL": "5", + "REFSEQ": "NC_000071.6", + "GENBANK": "CM000998 .2", + "Length": 151834684, + }, + { + "UCSC": "chr6", + "ENSEMBL": "6", + "REFSEQ": "NC_000072.6", + "GENBANK": "CM000999 .2", + "Length": 149736546, + }, + { + "UCSC": "chr7", + "ENSEMBL": "7", + "REFSEQ": "NC_000073.6", + 
"GENBANK": "CM001000 .2", + "Length": 145441459, + }, + { + "UCSC": "chr8", + "ENSEMBL": "8", + "REFSEQ": "NC_000074.6", + "GENBANK": "CM001001 .2", + "Length": 129401213, + }, + { + "UCSC": "chr9", + "ENSEMBL": "9", + "REFSEQ": "NC_000075.6", + "GENBANK": "CM001002 .2", + "Length": 124595110, + }, + { + "UCSC": "chr10", + "ENSEMBL": "10", + "REFSEQ": "NC_000076.6", + "GENBANK": "CM001003 .2", + "Length": 130694993, + }, + { + "UCSC": "chr11", + "ENSEMBL": "11", + "REFSEQ": "NC_000077.6", + "GENBANK": "CM001004 .2", + "Length": 122082543, + }, + { + "UCSC": "chr12", + "ENSEMBL": "12", + "REFSEQ": "NC_000078.6", + "GENBANK": "CM001005 .2", + "Length": 120129022, + }, + { + "UCSC": "chr13", + "ENSEMBL": "13", + "REFSEQ": "NC_000079.6", + "GENBANK": "CM001006 .2", + "Length": 120421639, + }, + { + "UCSC": "chr14", + "ENSEMBL": "14", + "REFSEQ": "NC_000080.6", + "GENBANK": "CM001007 .2", + "Length": 124902244, + }, + { + "UCSC": "chr15", + "ENSEMBL": "15", + "REFSEQ": "NC_000081.6", + "GENBANK": "CM001008 .2", + "Length": 104043685, + }, + { + "UCSC": "chr16", + "ENSEMBL": "16", + "REFSEQ": "NC_000082.6", + "GENBANK": "CM001009 .2", + "Length": 98207768, + }, + { + "UCSC": "chr17", + "ENSEMBL": "17", + "REFSEQ": "NC_000083.6", + "GENBANK": "CM001010 .2", + "Length": 94987271, + }, + { + "UCSC": "chr18", + "ENSEMBL": "18", + "REFSEQ": "NC_000084.6", + "GENBANK": "CM001011 .2", + "Length": 90702639, + }, + { + "UCSC": "chr19", + "ENSEMBL": "19", + "REFSEQ": "NC_000085.6", + "GENBANK": "CM001012 .2", + "Length": 61431566, + }, + { + "UCSC": "chrX", + "ENSEMBL": "X", + "REFSEQ": "NC_000086.7", + "GENBANK": "CM001013 .2", + "Length": 171031299, + }, + { + "UCSC": "chrY", + "ENSEMBL": "Y", + "REFSEQ": "NC_000087.7", + "GENBANK": "CM001014 .2", + "Length": 91744698, + }, + { + "UCSC": "chrM", + "ENSEMBL": "MT", + "REFSEQ": "NC_005089.1", + "GENBANK": "AY172335.1", + "Length": 16299, + }, + ], + "GRCm39": [ + { + "UCSC": "chr1", + "ENSEMBL": "1", + "REFSEQ": "NC_000067.7", + "GENBANK": "CM000994 .3", + "Length": 195154279, + }, + { + "UCSC": "chr2", + "ENSEMBL": "2", + "REFSEQ": "NC_000068.8", + "GENBANK": "CM000995 .3", + "Length": 181755017, + }, + { + "UCSC": "chr3", + "ENSEMBL": "3", + "REFSEQ": "NC_000069.7", + "GENBANK": "CM000996 .3", + "Length": 159745316, + }, + { + "UCSC": "chr4", + "ENSEMBL": "4", + "REFSEQ": "NC_000070.7", + "GENBANK": "CM000997 .3", + "Length": 156860686, + }, + { + "UCSC": "chr5", + "ENSEMBL": "5", + "REFSEQ": "NC_000071.7", + "GENBANK": "CM000998 .3", + "Length": 151758149, + }, + { + "UCSC": "chr6", + "ENSEMBL": "6", + "REFSEQ": "NC_000072.7", + "GENBANK": "CM000999 .3", + "Length": 149588044, + }, + { + "UCSC": "chr7", + "ENSEMBL": "7", + "REFSEQ": "NC_000073.7", + "GENBANK": "CM001000 .3", + "Length": 144995196, + }, + { + "UCSC": "chr8", + "ENSEMBL": "8", + "REFSEQ": "NC_000074.7", + "GENBANK": "CM001001 .3", + "Length": 130127694, + }, + { + "UCSC": "chr9", + "ENSEMBL": "9", + "REFSEQ": "NC_000075.7", + "GENBANK": "CM001002 .3", + "Length": 124359700, + }, + { + "UCSC": "chr10", + "ENSEMBL": "10", + "REFSEQ": "NC_000076.7", + "GENBANK": "CM001003 .3", + "Length": 130530862, + }, + { + "UCSC": "chr11", + "ENSEMBL": "11", + "REFSEQ": "NC_000077.7", + "GENBANK": "CM001004 .3", + "Length": 121973369, + }, + { + "UCSC": "chr12", + "ENSEMBL": "12", + "REFSEQ": "NC_000078.7", + "GENBANK": "CM001005 .3", + "Length": 120092757, + }, + { + "UCSC": "chr13", + "ENSEMBL": "13", + "REFSEQ": "NC_000079.7", + "GENBANK": "CM001006 .3", + "Length": 120883175, + }, + { + "UCSC": 
"chr14", + "ENSEMBL": "14", + "REFSEQ": "NC_000080.7", + "GENBANK": "CM001007 .3", + "Length": 125139656, + }, + { + "UCSC": "chr15", + "ENSEMBL": "15", + "REFSEQ": "NC_000081.7", + "GENBANK": "CM001008 .3", + "Length": 104073951, + }, + { + "UCSC": "chr16", + "ENSEMBL": "16", + "REFSEQ": "NC_000082.7", + "GENBANK": "CM001009 .3", + "Length": 98008968, + }, + { + "UCSC": "chr17", + "ENSEMBL": "17", + "REFSEQ": "NC_000083.7", + "GENBANK": "CM001010 .3", + "Length": 95294699, + }, + { + "UCSC": "chr18", + "ENSEMBL": "18", + "REFSEQ": "NC_000084.7", + "GENBANK": "CM001011 .3", + "Length": 90720763, + }, + { + "UCSC": "chr19", + "ENSEMBL": "19", + "REFSEQ": "NC_000085.7", + "GENBANK": "CM001012 .3", + "Length": 61420004, + }, + { + "UCSC": "chrX", + "ENSEMBL": "X", + "REFSEQ": "NC_000086.8", + "GENBANK": "CM001013 .3", + "Length": 169476592, + }, + { + "UCSC": "chrY", + "ENSEMBL": "Y", + "REFSEQ": "NC_000087.8", + "GENBANK": "CM001014 .3", + "Length": 91455967, + }, + { + "UCSC": "chrM", + "ENSEMBL": "MT", + "REFSEQ": "NC_005089.1", + "GENBANK": "AY172335.1", + "Length": 16299, + }, + ], + "T2T-CHM13v2": [ + { + "UCSC": "chr1", + "ENSEMBL": "1", + "REFSEQ": "NC_060925.1", + "GENBANK": "CP068277.2", + "Length": 248387328, + }, + { + "UCSC": "chr2", + "ENSEMBL": "2", + "REFSEQ": "NC_060926.1", + "GENBANK": "CP068276.2", + "Length": 242696752, + }, + { + "UCSC": "chr3", + "ENSEMBL": "3", + "REFSEQ": "NC_060927.1", + "GENBANK": "CP068275.2", + "Length": 201105948, + }, + { + "UCSC": "chr4", + "ENSEMBL": "4", + "REFSEQ": "NC_060928.1", + "GENBANK": "CP068274.2", + "Length": 193574945, + }, + { + "UCSC": "chr5", + "ENSEMBL": "5", + "REFSEQ": "NC_060929.1", + "GENBANK": "CP068273.2", + "Length": 182045439, + }, + { + "UCSC": "chr6", + "ENSEMBL": "6", + "REFSEQ": "NC_060930.1", + "GENBANK": "CP068272.2", + "Length": 172126628, + }, + { + "UCSC": "chr7", + "ENSEMBL": "7", + "REFSEQ": "NC_060931.1", + "GENBANK": "CP068271.2", + "Length": 160567428, + }, + { + "UCSC": "chr8", + "ENSEMBL": "8", + "REFSEQ": "NC_060932.1", + "GENBANK": "CP068270.2", + "Length": 146259331, + }, + { + "UCSC": "chr9", + "ENSEMBL": "9", + "REFSEQ": "NC_060933.1", + "GENBANK": "CP068269.2", + "Length": 150617247, + }, + { + "UCSC": "chr10", + "ENSEMBL": "10", + "REFSEQ": "NC_060934.1", + "GENBANK": "CP068268.2", + "Length": 134758134, + }, + { + "UCSC": "chr11", + "ENSEMBL": "11", + "REFSEQ": "NC_060935.1", + "GENBANK": "CP068267.2", + "Length": 135127769, + }, + { + "UCSC": "chr12", + "ENSEMBL": "12", + "REFSEQ": "NC_060936.1", + "GENBANK": "CP068266.2", + "Length": 133324548, + }, + { + "UCSC": "chr13", + "ENSEMBL": "13", + "REFSEQ": "NC_060937.1", + "GENBANK": "CP068265.2", + "Length": 113566686, + }, + { + "UCSC": "chr14", + "ENSEMBL": "14", + "REFSEQ": "NC_060938.1", + "GENBANK": "CP068264.2", + "Length": 101161492, + }, + { + "UCSC": "chr15", + "ENSEMBL": "15", + "REFSEQ": "NC_060939.1", + "GENBANK": "CP068263.2", + "Length": 99753195, + }, + { + "UCSC": "chr16", + "ENSEMBL": "16", + "REFSEQ": "NC_060940.1", + "GENBANK": "CP068262.2", + "Length": 96330374, + }, + { + "UCSC": "chr17", + "ENSEMBL": "17", + "REFSEQ": "NC_060941.1", + "GENBANK": "CP068261.2", + "Length": 84276897, + }, + { + "UCSC": "chr18", + "ENSEMBL": "18", + "REFSEQ": "NC_060942.1", + "GENBANK": "CP068260.2", + "Length": 80542538, + }, + { + "UCSC": "chr19", + "ENSEMBL": "19", + "REFSEQ": "NC_060943.1", + "GENBANK": "CP068259.2", + "Length": 61707364, + }, + { + "UCSC": "chr20", + "ENSEMBL": "20", + "REFSEQ": "NC_060944.1", + "GENBANK": "CP068258.2", + 
"Length": 66210255, + }, + { + "UCSC": "chr21", + "ENSEMBL": "21", + "REFSEQ": "NC_060945.1", + "GENBANK": "CP068257.2", + "Length": 45090682, + }, + { + "UCSC": "chr22", + "ENSEMBL": "22", + "REFSEQ": "NC_060946.1", + "GENBANK": "CP068256.2", + "Length": 51324926, + }, + { + "UCSC": "chrX", + "ENSEMBL": "X", + "REFSEQ": "NC_060947.1", + "GENBANK": "CP068255.2", + "Length": 154259566, + }, + { + "UCSC": "chrY", + "ENSEMBL": "Y", + "REFSEQ": "NC_060948.1", + "GENBANK": "CP086569.2", + "Length": 62460029, + }, + { + "UCSC": "chrM", + "ENSEMBL": "MT", + "REFSEQ": "NC_012920.1", + "GENBANK": "J01415.2", + "Length": 16569, + }, + ], +} diff --git a/snappy_wrappers/tools/genome_windows.py b/snappy_wrappers/tools/genome_windows.py index 2236d6c42..691bf904f 100644 --- a/snappy_wrappers/tools/genome_windows.py +++ b/snappy_wrappers/tools/genome_windows.py @@ -16,7 +16,7 @@ import sys from pathlib import Path - +from typing import Iterator # The following is required for being able to import snappy_wrappers modules # inside wrappers. These run in an "inner" snakemake process which uses its @@ -88,23 +88,17 @@ def yield_regions(fai_file, window_size, subtract_end=0, ignore_chroms=None, pad begin = end -def ignore_chroms(path_ref: str, ignored: set[str] = [], return_ignored: bool = False): - path_ref = Path(path_ref).resolve() - if Path(str(path_ref) + ".fai").exists(): - contigs = _parse_index(Path(str(path_ref) + ".fai"), PATTERN_FAI) - elif Path(str(path_ref) + ".genome").exists(): - contigs = _parse_index(Path(str(path_ref) + ".genome"), PATTERN_GENOME) - elif path_ref.with_suffix("dict").exists(): - contigs = _parse_index(path_ref.with_suffix("dict"), PATTERN_DICT, True) - else: - contigs = _read_fasta(path_ref) - for contig_name, contig_length in contigs: - m = matches_any(contig_name, ignored) - if (m and return_ignored) or (not m and not return_ignored): - yield contig_name, contig_length +def yield_contigs_and_lengths( + filename: Path, pattern: re.Pattern, allow_mismatch: bool = False +) -> Iterator[tuple[str, int]]: + """Yields contig names & lengths from regex pattern matching of sequence dictionary records + :param filename: path to the sequence dictionary file (``*.fai``, ``*.genome`` or ``*.dict``) + :param pattern: regular expression pattern (compiled) to extract contig name & length from sequence dictionary record + :param allow_mismatch: when true, records that don't match the pattern are allowed, otherwise they raise an exception -def _parse_index(filename: Path, pattern: re.Pattern, allow_mismatch: bool = False): + :returns: An iterator giving sequence of names and lengths for all contigs + """ with open(filename, "rt") as f: for line in f: line = line.strip() @@ -119,7 +113,13 @@ def _parse_index(filename: Path, pattern: re.Pattern, allow_mismatch: bool = Fal raise ValueError(f"Unexpected record '{line}' in reference file '{filename}'") -def _read_fasta(filename: Path): +def yield_contigs_and_lengths_from_sequence(filename: Path) -> Iterator[tuple[str, int]]: + """Yields contig names & lengths from parsing the reference sequence + + :param filename: path to the reference sequence in ``fasta`` format + + :returns: An iterator giving sequence of names and lengths for all contigs + """ contig_name = None contig_length = None with open(filename, "rt") as f: @@ -140,6 +140,47 @@ def _read_fasta(filename: Path): yield contig_name, contig_length +def yield_contigs_and_lengths_from_ref(path_ref: str) -> Iterator[tuple[str, int]]: + """Yields all contig names & lengths in the reference 
sequence
+
+    :param path_ref: path to the reference sequence
+
+    :returns: An iterator giving sequence of names and lengths for all contigs
+
+    The contig names & lengths are obtained from the sequence dictionary files when possible.
+    The order is ``*.fai``, ``*.genome``, ``*.dict`` (replacing the ``.fasta`` or ``.fa`` extension).
+    When none of these files is available, the sequence itself is used.
+
+    TODO: Add compressed files ``.gz`` & ``.bgz``.
+    """
+    path_ref = Path(path_ref).resolve()
+    if Path(str(path_ref) + ".fai").exists():
+        return yield_contigs_and_lengths(Path(str(path_ref) + ".fai"), PATTERN_FAI)
+    elif Path(str(path_ref) + ".genome").exists():
+        return yield_contigs_and_lengths(Path(str(path_ref) + ".genome"), PATTERN_GENOME)
+    elif path_ref.with_suffix(".dict").exists():
+        return yield_contigs_and_lengths(path_ref.with_suffix(".dict"), PATTERN_DICT, True)
+    else:
+        return yield_contigs_and_lengths_from_sequence(path_ref)
+
+
+def ignore_chroms(
+    path_ref: str, ignored: set[str] = [], return_ignored: bool = False
+) -> Iterator[tuple[str, int]]:
+    """Yields contig names & lengths for contigs matching or not matching a set of patterns.
+
+    :param path_ref: path to the reference sequence
+    :param ignored: set of patterns to identify contigs to be ignored
+    :param return_ignored: select which set of contigs to return: those whose names don't match any pattern, or those whose names match at least one pattern.
+
+    :returns: An iterator giving sequence of names and lengths for all contigs to use or to ignore (depending on ``return_ignored``)
+    """
+    for contig_name, contig_length in yield_contigs_and_lengths_from_ref(path_ref):
+        m = matches_any(contig_name, ignored)
+        if (m and return_ignored) or (not m and not return_ignored):
+            yield contig_name, contig_length
+
+
 def run(args):
     """Main entry point after parsing command line arguments"""
     yielded = 0
diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
index 4afbbaf99..33a743db2 100644
--- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py
@@ -10,36 +10,28 @@
 base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
 sys.path.insert(0, base_dir)

-from snappy_wrappers.tools.genome_windows import ignore_chroms
 from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper

 __author__ = "Eric Blanc"
 __email__ = "eric.blanc@bih-charite.de"

 args = snakemake.params.get("args", {})
-
-prefix = ""
+exclude = args.get("exclude", [])

 # Add the "ignore_chrom" contents to the excluded regions
-if len(args.get("ignore_chroms", [])) > 0:
-    ignored_contigs = ignore_chroms(args["reference"], args["ignore_chroms"], return_ignored=True)
-    lines = ["cat << __EOF > $TMPDIR/ignore_chroms.bed"]
-    for (contig_name, contig_length) in ignored_contigs:
-        lines.append(f"{contig_name}\t0\t{contig_length}")
-    lines.append("__EOF")
-    prefix = "\n".join(lines) + "\n"
-    args["exclude"].append("$TMPDIR/ignore_chroms.bed")
+if snakemake.input.get("ignore_chroms", None) is not None:
+    exclude.append(snakemake.input.get("ignore_chroms"))

 cmd = r"""
 cnvkit.py access \
     -o {snakemake.output.access} \
     {min_gap_size} {exclude} \
-    {args[reference]}
+    {snakemake.input.reference}
 """.format(
     snakemake=snakemake,
     args=args,
     min_gap_size=f"--min-gap-size {args['min-gap-size']}" if args.get("min-gap-size", None) is not None else "",
-    exclude=" ".join([f"--exclude {x}" for x in args["exclude"]]),
+    exclude=" 
".join([f"--exclude {x}" for x in exclude]), ) -CnvkitWrapper(snakemake, prefix + cmd).run() +CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index 5fb01d78b..d70c9a297 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -21,11 +21,12 @@ cnvkit.py antitarget \ -o {snakemake.output.antitarget} \ --avg-size {args[avg-size]} {min_size} \ - --access {args[access]} \ - {args[target]} + {access} \ + {snakemake.input.target} """.format( snakemake=snakemake, args=args, + access=f"--access {snakemake.input.access}" if snakemake.input.get("access", None) is not None else "", min_size=f"--min-size {args['min-size']}" if args.get("min-size") is not None else "", ) diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py index 5020ac227..5d9def018 100644 --- a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py @@ -22,15 +22,15 @@ {out_target} {out_antitarget} \ {access} {target} \ --bp-per-bin {args[bp-per-bin]} \ - {args[bams]} \ + {snakemake.input.bams} \ > {snakemake.output.result} """.format( snakemake=snakemake, args=args, out_target=f"--target-output-bed {snakemake.output.target}" if snakemake.output.get("target", "") != "" else "", out_antitarget=f"--antitarget-output-bed {snakemake.output.antitarget}" if snakemake.output.get("antitarget", "") != "" else "", - access=f"--access {args['access']}" if args.get("access", None) is not None else "", - target=f"--targets {args['target']}" if args.get("target", None) is not None else "", + access=f"--access {snakemake.input.access}" if snakemake.input.get("access", None) is not None else "", + target=f"--targets {snakemake.input.target}" if snakemake.input.get("target", None) is not None else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py b/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py index 0ea46cf30..cfca4ecbe 100644 --- a/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/bintest/wrapper.py @@ -20,9 +20,9 @@ cmd = r""" cnvkit.py bintest \ -o {snakemake.output.tests} \ - --segment {args[segments]} \ + --segment {snakemake.input.segments} \ --alpha {args[alpha]} {target} \ - {args[ratios]} + {snakemake.input.ratios} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py index 7cf317aa6..757e1e909 100644 --- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py @@ -2,7 +2,6 @@ """Wrapper vor cnvkit.py call""" import os -import re import sys # The following is required for being able to import snappy_wrappers modules @@ -15,12 +14,9 @@ args = snakemake.params.get("args", {}) -PATTERN = re.compile("^(Purity|Ploidy): +([+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([EeDd][+-]?[0-9]+)?) 
*$") - - -if "variants" in args: +if snakemake.input.get("variants", None) is not None: variants = r""" - ---vcf {args[variants]} \ + ---vcf {snakemake.input.variants} \ --sample-id {args[sample-id]} --normal-id {args[normal-id]} \ --min-variant-depth {args[min-variant-depth]} {zygocity_freq} """.format( @@ -39,7 +35,7 @@ {center} {center_at} {drop_low_coverage} {sample_sex} {male_reference} {diploid_parx_genome} \ {purity} {ploidy} \ {variants} \ - {args[segments]} + {snakemake.input.segments} """.format( snakemake=snakemake, args=args, @@ -47,7 +43,7 @@ purity=f"--purity {args['purity']}" if args.get("purity", None) is not None else "", ploidy=f"--ploidy {args['ploidy']}" if args.get("ploidy", None) is not None else "", thresholds="--thresholds={}".format(",".join(map(str, args["thresholds"]))) if len(args.get("thresholds", [])) > 0 else "", - filter=f"--filter {args['filter']}" if args.get("filter", None) is not None else "", + filter="--filter {}".format(" ".join(args["filter"])) if len(args.get("filter", [])) > 0 else "", center=f"--center {args['center']}" if args.get("center", None) is not None else "", center_at=f"--center-at {args['center-at']}" if args.get("center-at", None) is not None else "", drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py index 303127160..569da948f 100644 --- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py @@ -20,9 +20,9 @@ cmd = r""" cnvkit.py coverage --processes {snakemake.resources._cores} \ -o {snakemake.output.coverage} \ - --fasta {args[reference]} \ + --fasta {snakemake.input.reference} \ --min-mapq {args[min-mapq]} {count} \ - {args[bam]} {args[intervals]} + {snakemake.input.bam} {snakemake.input.intervals} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py index a5b56c8e4..a79f2c978 100644 --- a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py @@ -18,20 +18,23 @@ args = snakemake.params.get("args", {}) # Fix requires empty antitarget file in WGS & Panel modes -create_dummy_antitarget = "" -if args.get("antitarget", "") == "": - args["antitarget"] = "$TMPDIR/antitarget.bed" - create_dummy_antitarget = f"touch {args['antitarget']} ; " +if snakemake.input.get("antitarget", None) is None: + antitarget = "$TMPDIR/antitarget.bed" + create_dummy_antitarget = f"touch {antitarget} ; " +else: + antitarget = snakemake.input.antitarget + create_dummy_antitarget = "" cmd = r""" cnvkit.py fix \ -o {snakemake.output.ratios} \ {cluster} --sample-id {args[sample-id]} \ {no_gc} {no_edge} {no_rmask} \ - {args[target]} {args[antitarget]} {args[reference]} + {snakemake.input.target} {antitarget} {snakemake.input.reference} """.format( snakemake=snakemake, args=args, + antitarget=antitarget, cluster="--cluster" if args.get("cluster", False) else "", no_gc="--no-gc" if args.get("no-gc", False) else "", no_edge="--no-edge" if args.get("no-edge", False) else "", diff --git a/snappy_wrappers/wrappers/cnvkit/ignore/environment.yaml b/snappy_wrappers/wrappers/cnvkit/ignore/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/ignore/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git 
a/snappy_wrappers/wrappers/cnvkit/ignore/wrapper.py b/snappy_wrappers/wrappers/cnvkit/ignore/wrapper.py
new file mode 100644
index 000000000..10743b88b
--- /dev/null
+++ b/snappy_wrappers/wrappers/cnvkit/ignore/wrapper.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""Wrapper writing the BED file of contigs ignored by cnvkit"""
+
+import os
+import sys
+
+# The following is required for being able to import snappy_wrappers modules
+# inside wrappers. These run in an "inner" snakemake process which uses its
+# own conda environment which cannot see the snappy_pipeline installation.
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+sys.path.insert(0, base_dir)
+
+from snappy_wrappers.tools.genome_windows import ignore_chroms
+from snappy_wrappers.wrappers.cnvkit.cnvkit_wrapper import CnvkitWrapper
+
+__author__ = "Eric Blanc"
+__email__ = "eric.blanc@bih-charite.de"
+
+args = snakemake.params.get("args", {})
+
+ignored_contigs = ignore_chroms(snakemake.input.reference, args["ignore_chroms"], return_ignored=True)
+lines = []
+for (contig_name, contig_length) in ignored_contigs:
+    lines.append(f"{contig_name}\t0\t{contig_length}")
+lines = "\n".join(lines)
+
+cmd = f"""
+cat << __EOF > {snakemake.output.ignore_chroms}
+{lines}
+__EOF
+"""
+
+CnvkitWrapper(snakemake, cmd).run()
diff --git a/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py
index c4b475f1b..bd3f14201 100644
--- a/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/plot/scatter/wrapper.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 """Wrapper for cnvkit.py scatter"""
 
+import csv
 import os
-import re
 import sys
 
 # The following is required for being able to import snappy_wrappers modules
@@ -15,9 +15,33 @@
 
 args = snakemake.params.get("args", {})
 
-if "variants" in args:
+# Fix chromosome name prefix
+if args.get("chromosome", None) is not None:
+    chromosome = args["chromosome"]
+    if chromosome.startswith("chr"):
+        ucsc = chromosome
+        ensembl = chromosome[3:]
+        if ensembl == "M":
+            ensembl = "MT"
+    else:
+        ucsc = f"chr{chromosome}"
+        ensembl = chromosome
+        if ucsc == "chrMT":
+            ucsc = "chrM"
+
+    with open(snakemake.input.segments, "rt") as f:
+        reader = csv.DictReader(f, delimiter="\t")
+        for record in reader:
+            if ucsc == record["chromosome"]:
+                args["chromosome"] = ucsc
+                break
+            if ensembl == record["chromosome"]:
+                args["chromosome"] = ensembl
+                break
+
+if snakemake.input.get("variants", None) is not None:
     variants = r"""
-    ---vcf {args[variants]} \
+    ---vcf {snakemake.input.variants} \
     --sample-id {args[sample-id]} --normal-id {args[normal-id]} \
     --min-variant-depth {args[min-variant-depth]} {zygocity_freq}
 """.format(
@@ -31,21 +55,21 @@
 cmd = r"""
 cnvkit.py scatter \
     -o {snakemake.output.plot} \
-    --segment {args[segments]} \
+    --segment {snakemake.input.segments} \
     {chromosome} {gene} {range_list} \
     --width {args[width]} \
     --antitarget-marker {args[antitarget-marker]} --segment-color {args[segment-color]} \
     {by_bin} {trend} --title "{args[title]}" \
     {y_min} {y_max} {fig_size} \
     {variants} \
-    {args[ratios]}
+    {snakemake.input.ratios}
 """.format(
     snakemake=snakemake,
     args=args,
    variants=variants,
    chromosome=f"--chromosome {args['chromosome']}" if args.get("chromosome", None) is not None else "",
    gene=f"--gene {args['gene']}" if args.get("gene", None) is not None else "",
-    range_list=f"--range-list {args['range-list']}" if args.get("range-list", None) is not None else "",
+    range_list=f"--range-list 
{snakemake.input.range_list}" if snakemake.input.get("range_list", None) is not None else "", by_bin="--by-bin" if args.get("by-bin", False) else "", trend="--trend" if args.get("trend", False) else "", y_min=f"--y-min {args['y-min']}" if args.get("y-min", None) is not None else "", diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index 6a33406da..2dc029871 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -17,13 +17,10 @@ args = snakemake.params.get("args", {}) -target = f"--target {args['target']}" if "target" in args else "" -antitarget = f"--antitarget {args['antitarget']}" if "antitarget" in args else "" - cmd = r""" cnvkit.py reference \ -o {snakemake.output.reference} \ - --fasta {args[reference]} \ + --fasta {snakemake.input.reference} \ {cluster} {min_cluster_size} \ {sample_sex} {male_reference} {diploid_parx_genome} \ {no_gc} {no_edge} {no_rmask} \ @@ -31,9 +28,9 @@ """.format( snakemake=snakemake, args=args, - target=target, - antitarget=antitarget, - normals=" ".join(args["normals"]) if len(args.get("normals", [])) > 0 else "", + target=f"--target {snakemake.input.target}" if snakemake.input.get("target", None) is not None else "", + antitarget=f"--antitarget {snakemake.input.antitarget}" if snakemake.input.get("antitarget", None) is not None else "", + normals=" ".join(snakemake.input.normals) if len(snakemake.input.normals) > 0 else "", cluster="--cluster" if args.get("cluster", False) else "", male_reference="--male-reference" if args.get("male-reference", False) else "", no_gc="--no-gc" if args.get("no-gc", False) else "", diff --git a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py index f9c370b36..4d5826063 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/genemetrics/wrapper.py @@ -17,11 +17,11 @@ cmd = r""" cnvkit.py genemetrics \ -o {snakemake.output.report} \ - --segment {args[segments]} \ + --segment {snakemake.input.segments} \ --threshold {args[threshold]} --min-probes {args[min-probes]} \ {drop_low_coverage} {male_reference} {sample_sex} {diploid_parx_genome} \ {stats} \ - {args[ratios]} + {snakemake.input.ratios} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py index d4a2fdc92..2fbb72c9b 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/metrics/wrapper.py @@ -19,13 +19,13 @@ cnvkit.py metrics \ -o {snakemake.output.report} \ {drop_low_coverage} \ - {args[ratios]} \ - --segment {segments} + {snakemake.input.ratios} \ + {segments} """.format( snakemake=snakemake, args=args, drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "", - segments=" ".join(args["segments"]), + segments=f"--segments {snakemake.input.segments}" if snakemake.input.get("segments", None) is not None else "", ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py index 76c6831e0..99330115e 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/segmetrics/wrapper.py @@ -17,11 +17,11 @@ cmd = r""" 
cnvkit.py segmetrics \ -o {snakemake.output.report} \ - --segment {args[segments]} \ + --segment {snakemake.input.segments} \ --alpha {args[alpha]} --bootstrap {args[bootstrap]} {smooth_bootstrap} \ {drop_low_coverage} \ {stats} \ - {args[ratios]} + {snakemake.input.ratios} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py index 240cdfa28..614e4b458 100644 --- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py @@ -14,11 +14,11 @@ args = snakemake.params.get("args", {}) -if "variants" in args: +if snakemake.input.get("variants", None) is not None: variants = r""" - ---vcf {args[variants]} \ + ---vcf {snakemake.input.variants} \ --sample-id {args[sample-id]} --normal-id {args[normal-id]} \ - {args[min-variant-depth]} {zygocity_freq} + --min-variant-depth {args[min-variant-depth]} {zygocity_freq} """.format( snakemake=snakemake, args=args, @@ -33,7 +33,7 @@ --method {args[method]} --threshold {args[threshold]} {smooth_cbs} \ {drop_low_coverage} --drop-outliers {args[drop-outliers]} \ {variants} \ - {args[ratios]} + {snakemake.input.ratios} """.format( snakemake=snakemake, args=args, diff --git a/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py b/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py index 1869d8040..3fb4d36c0 100644 --- a/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py @@ -27,7 +27,7 @@ snakemake=snakemake, args=args, diploid_parx_genome=f"--diploid-parx-genome {args['diploid-parx-genome']}" if args.get('diploid-parx-genome', None) is not None else "", - coverages=" ".join(args["coverages"]), + coverages=" ".join(snakemake.input.coverages), ) CnvkitWrapper(snakemake, cmd).run() diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index 37e1bd9c2..c05aed668 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -22,13 +22,13 @@ cnvkit.py target \ -o {snakemake.output.target} \ {avg_size} {split} {annotate} {short_names} \ - {args[interval]} + {snakemake.input.interval} """.format( snakemake=snakemake, args=args, avg_size=f"--avg-size {args['avg-size']}" if args.get("avg-size", None) is not None else "", split=f"--split" if args.get("split", False) else "", - annotate=f"--annotate {args['annotate']}" if args.get("annotate", None) is not None else "", + annotate=f"--annotate {snakemake.input.annotate}" if snakemake.input.get("annotate", None) is not None else "", short_names="--short-names" if args.get("short-names", False) else "", ) diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index 5ce31759d..ab57d8cee 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -345,21 +345,43 @@ def test_purecn_step_part_get_resource_usage(panel_of_normals_workflow): # Tests for CnvkitStepPart ------------------------------------------------------------------------ -def test_cnvkit_step_part_get_args_access(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_access()""" +def test_cnvkit_step_part_get_input_files_ignore(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_ignore()""" wildcards = Wildcards(fromdict={"mapper": "bwa"}) 
- actual = panel_of_normals_workflow.get_args("cnvkit", "access")( - wildcards, - panel_of_normals_workflow.get_input_files("cnvkit", "access")(wildcards), - ) - if actual.get("ignore_chroms", None) is not None: - actual["ignore_chroms"].sort() - expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": [], "ignore_chroms": ["GL*", "MT"]} + actual = panel_of_normals_workflow.get_input_files("cnvkit", "ignore")(wildcards) + expected = {"reference": "/path/to/ref.fa"} assert actual == expected -def test_cnvkit_step_part_get_args_autobin(panel_of_normals_workflow): +def test_cnvkit_step_part_get_args_ignore(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_ignore()""" + actual = panel_of_normals_workflow.get_args("cnvkit", "ignore")(None, None) + actual["ignore_chroms"].sort() + expected = {"ignore_chroms": ["GL*", "MT"]} + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_access(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_access()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + actual = panel_of_normals_workflow.get_input_files("cnvkit", "access")(wildcards) + expected = { + "reference": "/path/to/ref.fa", + "exclude": [], + "ignore_chroms": "work/bwa.cnvkit/out/bwa.cnvkit.ignored.bed", + } + assert actual == expected + + +def test_cnvkit_step_part_get_args_access(panel_of_normals_workflow): """Tests CnvkitStepPart._get_args_access()""" + actual = panel_of_normals_workflow.get_args("cnvkit", "access")(None, None) + expected = {"min-gap-size": None} + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_access()""" wildcards = Wildcards(fromdict={"mapper": "bwa"}) expected = { "bams": [ @@ -367,36 +389,43 @@ def test_cnvkit_step_part_get_args_autobin(panel_of_normals_workflow): "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", ], "access": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", - "method": "wgs", - "bp-per-bin": 50000, } - actual = panel_of_normals_workflow.get_args("cnvkit", "autobin")( - wildcards, - panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards), - ) + actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_target()""" +def test_cnvkit_step_part_get_args_autobin(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_access()""" + expected = {"method": "wgs", "bp-per-bin": 50000} + actual = panel_of_normals_workflow.get_args("cnvkit", "autobin")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_target()""" wildcards = Wildcards(fromdict={"mapper": "bwa"}) expected = { "interval": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", - "avg-size": 2000, - "split": True, + "avg_size": "work/bwa.cnvkit/out/bwa.cnvkit.autobin.txt", "annotate": "/path/to/annotations.gtf", - "short-names": True, - } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_args_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_target()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + expected = {"avg-size": 2000, "split": True, "short-names": True} actual = 
panel_of_normals_workflow.get_args("cnvkit", "target")( wildcards, - panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards), + panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) ) assert actual == expected -def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_coverage()""" +def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_coverage()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -409,25 +438,35 @@ def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", "reference": "/path/to/ref.fa", - "min-mapq": 0, - "count": False, } - actual = panel_of_normals_workflow.get_args("cnvkit", "coverage")( - wildcards, - panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards), - ) + actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_args_reference(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_args_create_panel()""" +def test_cnvkit_step_part_get_args_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_coverage()""" + expected = {"min-mapq": 0, "count": False} + actual = panel_of_normals_workflow.get_args("cnvkit", "coverage")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_reference(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_create_panel()""" wildcards = Wildcards(fromdict={"mapper": "bwa"}) expected = { "normals": [ - "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.cnn", - "work/bwa.cnvkit.P002-N1-DNA1-WGS1/out/bwa.cnvkit.P002-N1-DNA1-WGS1.target.cnn", + "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit.P002-N1-DNA1-WGS1/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", ], "reference": "/path/to/ref.fa", + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_args_reference(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_args_create_panel()""" + expected = { "cluster": False, "no-gc": False, "no-edge": True, @@ -435,23 +474,42 @@ def test_cnvkit_step_part_get_args_reference(panel_of_normals_workflow): "male-reference": False, "diploid-parx-genome": "GRCh38", } - actual = panel_of_normals_workflow.get_args("cnvkit", "create_panel")( - wildcards, - panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards), - ) + actual = panel_of_normals_workflow.get_args("cnvkit", "create_panel")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_sex(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_sex()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + expected = { + "coverages": [ + "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit.P002-N1-DNA1-WGS1/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", + ], + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "sex")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_args_sex(panel_of_normals_workflow): + """Tests 
CnvkitStepPart._get_args_sex()""" + expected = {"diploid-parx-genome": "GRCh38"} + actual = panel_of_normals_workflow.get_args("cnvkit", "sex")(None, None) assert actual == expected def test_cnvkit_step_parts_get_output_files(panel_of_normals_workflow): """Tests CnvkitStepPart.get_output_files() for all actions""" actions = { + "ignore": {"ignore_chroms": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.ignored.bed"}, "access": {"access": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.access.bed"}, "autobin": {"result": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.autobin.txt"}, "target": {"target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed"}, "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed"}, - "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}.cnn"}, + "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}coverage.cnn"}, "create_panel": {"reference": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn"}, - "sex": {"sex": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.sex.tsv"}, + "sex": {"sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv"}, + "metrics": {"metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv"}, } for action, result in actions.items(): expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} @@ -462,7 +520,7 @@ def test_cnvkit_step_parts_get_output_files(panel_of_normals_workflow): def test_cnvkit_step_parts_get_log_file(panel_of_normals_workflow): """Tests CnvkitStepPart.get_log_file() for all actions""" exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) - actions = ("autobin", "target", "create_panel", "sex") + actions = ("ignore", "access", "autobin", "target", "create_panel", "sex", "metrics") base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit" for action in actions: result = {k: base_log + f".{action}.{v}" for k, v in exts} @@ -471,16 +529,6 @@ def test_cnvkit_step_parts_get_log_file(panel_of_normals_workflow): assert actual == expected -def test_cnvkit_step_parts_get_log_file_access(panel_of_normals_workflow): - """Tests CnvkitStepPart.get_log_file() for access""" - exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) - base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.access" - result = {k: base_log + f".{v}" for k, v in exts} - expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} - actual = panel_of_normals_workflow.get_log_file("cnvkit", "access") - assert actual == expected - - def test_cnvkit_step_parts_get_log_file_coverage(panel_of_normals_workflow): """Tests CnvkitStepPart.get_log_file() for coverage""" exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) @@ -525,12 +573,20 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): expected += [ tpl.format(mapper=mapper, substep=substep, ext=ext, chksum=chksum) for chksum in ("", ".md5") - for (substep, ext) in (("panel_of_normals", "cnn"), ("sex", "tsv"), ("target", "bed")) + for (substep, ext) in (("panel_of_normals", "cnn"), ("target", "bed")) + for mapper in ("bwa",) + ] + # Add report files + tpl = "output/{mapper}.cnvkit/report/{mapper}.cnvkit.{substep}.{ext}{chksum}" + expected += [ + tpl.format(mapper=mapper, substep=substep, ext=ext, chksum=chksum) + for chksum in ("", ".md5") + for (substep, 
ext) in (("metrics", "tsv"), ("sex", "tsv")) for mapper in ("bwa",) ] # add log files tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" - for substep in ("create_panel", "sex", "target"): + for substep in ("create_panel", "metrics", "sex", "target"): for mapper in ("bwa",): base_out = tpl.format(mapper=mapper, substep=substep) expected += get_expected_log_files_dict(base_out=base_out).values() diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py index e9e7daa42..b245f4f35 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py @@ -137,25 +137,65 @@ def somatic_cnv_calling_workflow( # Tests for CnvkitStepPart ------------------------------------------------------------------------ +def test_cnvkit_step_part_get_input_files_ignore(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_ignore()""" + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "ignore")(None) + expected = {"reference": "/path/to/ref.fa"} + assert actual == expected + + +def test_cnvkit_step_part_get_args_ignore(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_ignore()""" + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "ignore")(None, None) + actual["ignore_chroms"].sort() + expected = {"ignore_chroms": ["GL*", "MT"]} + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_access(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_access()""" + wildcards = Wildcards(fromdict={"mapper": "bwa"}) + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "access")(wildcards) + expected = { + "reference": "/path/to/ref.fa", + "exclude": [], + "ignore_chroms": "work/bwa.cnvkit/out/bwa.cnvkit.ignored.bed", + } + assert actual == expected + + def test_cnvkit_step_part_get_args_access(somatic_cnv_calling_workflow): """Tests CnvkitStepPart._get_args_access()""" + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "access")(None, None) + expected = {"min-gap-size": None} + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_autobin(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_access()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", + "library_name": "P001-N1-DNA1-WGS1", } ) - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "access")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "access")(wildcards), - ) - if actual.get("ignore_chroms", None) is not None: - actual["ignore_chroms"].sort() - expected = {"reference": "/path/to/ref.fa", "min-gap-size": None, "exclude": [], "ignore_chroms": ["GL*", "MT"]} + expected = { + "bams": ["NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam"], + "access": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "autobin")(wildcards) assert actual == expected def test_cnvkit_step_part_get_args_autobin(somatic_cnv_calling_workflow): """Tests CnvkitStepPart._get_args_access()""" + expected = {"method": "wgs", "bp-per-bin": 50000} + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "autobin")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_target(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_target()""" wildcards = Wildcards( fromdict={ 
"mapper": "bwa", @@ -163,15 +203,11 @@ def test_cnvkit_step_part_get_args_autobin(somatic_cnv_calling_workflow): } ) expected = { - "bams": ["NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam"], - "access": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", - "method": "wgs", - "bp-per-bin": 50000, + "interval": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", + "avg_size": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.autobin.txt", + "annotate": "/path/to/annotations.gtf", } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "autobin")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "autobin")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "target")(wildcards) assert actual == expected @@ -183,47 +219,42 @@ def test_cnvkit_step_part_get_args_target(somatic_cnv_calling_workflow): "library_name": "P001-N1-DNA1-WGS1", } ) - expected = { - "interval": "work/bwa.cnvkit/out/bwa.cnvkit.access.bed", - "avg-size": 2000, - "split": True, - "annotate": "/path/to/annotations.gtf", - "short-names": True, - - } + expected = {"avg-size": 2000, "split": True, "short-names": True} actual = somatic_cnv_calling_workflow.get_args("cnvkit", "target")( wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "target")(wildcards), + somatic_cnv_calling_workflow.get_input_files("cnvkit", "target")(wildcards) ) assert actual == expected -def test_cnvkit_step_part_get_args_coverage(somatic_cnv_calling_workflow): - """Tests CnvkitStepPart._get_args_coverage()""" +def test_cnvkit_step_part_get_input_files_coverage(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_coverage()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", - "library_name": "P001-N1-DNA1-WGS1", + "library_name": "P001-T1-DNA1-WGS1", "region": "target", } ) expected = { "intervals": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.bed", - "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", "reference": "/path/to/ref.fa", - "min-mapq": 0, - "count": False, } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "coverage")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "coverage")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "coverage")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_args_reference(somatic_cnv_calling_workflow): - """Tests CnvkitStepPart._get_args_reference()""" +def test_cnvkit_step_part_get_args_coverage(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_coverage()""" + expected = {"min-mapq": 0, "count": False} + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "coverage")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_reference(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_create_panel()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -232,8 +263,24 @@ def test_cnvkit_step_part_get_args_reference(somatic_cnv_calling_workflow): } ) expected = { - "normals": ["work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.target.cnn"], + "normals": [ + 
"work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + ], "reference": "/path/to/ref.fa", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "reference")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_args_reference(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_create_panel()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { "cluster": False, "no-gc": False, "no-edge": True, @@ -241,10 +288,23 @@ def test_cnvkit_step_part_get_args_reference(somatic_cnv_calling_workflow): "male-reference": False, "diploid-parx-genome": "GRCh38", } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "reference")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "reference")(wildcards), + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "reference")(wildcards, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_fix(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_fix()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } ) + expected = { + "target": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.targetcoverage.cnn", + "reference": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.reference.cnn", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "fix")(wildcards) assert actual == expected @@ -257,8 +317,6 @@ def test_cnvkit_step_part_get_args_fix(somatic_cnv_calling_workflow): } ) expected = { - "target": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.target.cnn", - "reference": "work/bwa.cnvkit.P001-N1-DNA1-WGS1/out/bwa.cnvkit.P001-N1-DNA1-WGS1.reference.cnn", "cluster": False, "no-gc": False, "no-edge": True, @@ -266,10 +324,23 @@ def test_cnvkit_step_part_get_args_fix(somatic_cnv_calling_workflow): "diploid-parx-genome": "GRCh38", "sample-id": "P001-T1-DNA1-WGS1", } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "fix")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "fix")(wildcards), + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "fix")(wildcards, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_segment(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_segment()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } ) + expected = { + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "segment")(wildcards) assert actual == expected @@ -282,21 +353,33 @@ def test_cnvkit_step_part_get_args_segment(somatic_cnv_calling_workflow): } ) expected = { - "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", "method": "cbs", "threshold": 0.0001, "smooth-cbs": False, "drop-low-coverage": False, "drop-outliers": 10, - "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", "sample-id": "P001-T1-DNA1-WGS1", "normal-id": "P001-N1-DNA1-WGS1", "min-variant-depth": 20, } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segment")( - wildcards, - 
somatic_cnv_calling_workflow.get_input_files("cnvkit", "segment")(wildcards), + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segment")(wildcards, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_call(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_call()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/report/bwa.cnvkit.P001-T1-DNA1-WGS1.segmetrics.cns", + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + "purity_file": "SOMATIC_PURITY_PLOIDY_ESTIMATE/output/bwa.ascat.P001-T1-DNA1-WGS1/out/bwa.ascat.P001-T1-DNA1-WGS1.txt", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "call")(wildcards) assert actual == expected @@ -309,19 +392,17 @@ def test_cnvkit_step_part_get_args_call(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", "method": "threshold", "thresholds": [-1.1, -0.25, 0.2, 0.7], "drop-low-coverage": False, "male-reference": False, "diploid-parx-genome": "GRCh38", - "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", "sample-id": "P001-T1-DNA1-WGS1", "normal-id": "P001-N1-DNA1-WGS1", "min-variant-depth": 20, - "purity_file": "SOMATIC_PURITY_PLOIDY_ESTIMATE/output/bwa.ascat.P001-T1-DNA1-WGS1/out/bwa.ascat.P001-T1-DNA1-WGS1.txt", "purity": 0.35, "ploidy": 2.2, + "filter": ["ci"], } actual = somatic_cnv_calling_workflow.get_args("cnvkit", "call")( wildcards, @@ -330,8 +411,8 @@ def test_cnvkit_step_part_get_args_call(somatic_cnv_calling_workflow): assert actual == expected -def test_cnvkit_step_part_get_args_bintest(somatic_cnv_calling_workflow): - """Tests CnvkitStepPart._get_args_bintest()""" +def test_cnvkit_step_part_get_input_files_bintest(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_bintest()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -339,21 +420,48 @@ def test_cnvkit_step_part_get_args_bintest(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns", "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "bintest")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_args_bintest(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_args_bintest()""" + expected = { "alpha": 0.005, "target": False, } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "bintest")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "bintest")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "bintest")(None, None) assert actual == expected + + +def test_cnvkit_step_part_get_input_files_metrics(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_metrics()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + } + ) + expected = { + "segments": ["work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns"], + "ratios": ["work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"], + } + 
actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "metrics")(wildcards) assert actual == expected def test_cnvkit_step_part_get_args_metrics(somatic_cnv_calling_workflow): """Tests CnvkitStepPart._get_args_metrics()""" + expected = {"drop-low-coverage": False} + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "metrics")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_segmetrics(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_segmetrics()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -361,19 +469,28 @@ def test_cnvkit_step_part_get_args_metrics(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns", "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", - "drop-low-coverage": False, } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "metrics")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "metrics")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "segmetrics")(wildcards) assert actual == expected def test_cnvkit_step_part_get_args_segmetrics(somatic_cnv_calling_workflow): """Tests CnvkitStepPart._get_args_segmetrics()""" + expected = { + "drop-low-coverage": False, + "alpha": 0.05, + "bootstrap": 100, + "smooth-bootstrap": True, + "stats": ["mean", "median", "mode", "t-test", "stdev", "sem", "mad", "mse", "iqr", "bivar", "ci", "pi"] + } + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segmetrics")(None, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_genemetric(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_genemetrics()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -381,18 +498,10 @@ def test_cnvkit_step_part_get_args_segmetrics(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns", "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", - "drop-low-coverage": False, - "alpha": 0.05, - "bootstrap": 100, - "smooth-bootstrap": False, - "stats": ["mean", "median", "mode", "t-test", "stdev", "sem", "mad", "mse", "iqr", "bivar", "ci", "pi"] } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "segmetrics")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "segmetrics")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "genemetrics")(wildcards) assert actual == expected @@ -405,8 +514,6 @@ def test_cnvkit_step_part_get_args_genemetric(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", - "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", "threshold": 0.2, "min-probes": 3, "drop-low-coverage": False, @@ -416,10 +523,25 @@ def test_cnvkit_step_part_get_args_genemetric(somatic_cnv_calling_workflow): "bootstrap": 100, "stats": ["mean", "median", "mode", "ttest", "stdev", "sem", "mad", "mse", "iqr", "bivar", "ci", "pi"] } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "genemetrics")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", 
"genemetrics")(wildcards), + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "genemetrics")(wildcards, None) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_scatter(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart._get_input_files_scatter()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "library_name": "P001-T1-DNA1-WGS1", + "contig_name": "1", + } ) + expected = { + "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns", + "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", + "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", + } + actual = somatic_cnv_calling_workflow.get_input_files("cnvkit", "scatter")(wildcards) assert actual == expected @@ -433,8 +555,6 @@ def test_cnvkit_step_part_get_args_scatter(somatic_cnv_calling_workflow): } ) expected = { - "segments": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segments.cns", - "ratios": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr", "chromosome": "1", "width": 1000000, "antitarget-marker": "o", @@ -442,38 +562,35 @@ def test_cnvkit_step_part_get_args_scatter(somatic_cnv_calling_workflow): "trend": False, "segment-color": "darkorange", "title": "P001-T1-DNA1-WGS1 - 1", - "variants": "SOMATIC_VARIANT_CALLING/output/bwa.mutect2.P001-T1-DNA1-WGS1/out/bwa.mutect2.P001-T1-DNA1-WGS1.vcf.gz", "sample-id": "P001-T1-DNA1-WGS1", "normal-id": "P001-N1-DNA1-WGS1", "min-variant-depth": 20, - "fig-size": (6.4, 4.8), + "fig-size": (12.256, 16.192), } - actual = somatic_cnv_calling_workflow.get_args("cnvkit", "scatter")( - wildcards, - somatic_cnv_calling_workflow.get_input_files("cnvkit", "scatter")(wildcards), - ) + actual = somatic_cnv_calling_workflow.get_args("cnvkit", "scatter")(wildcards, None) assert actual == expected def test_cnvkit_step_parts_get_output_files(somatic_cnv_calling_workflow): """Tests CnvkitStepPart.get_output_files() for all actions""" actions = { + "ignore": {"ignore_chroms": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.ignored.bed"}, "access": {"access": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.access.bed"}, "autobin": {"result": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.autobin.txt"}, "target": {"target": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.target.bed"}, "antitarget": {"antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed"}, - "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}.cnn"}, + "coverage": {"coverage": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{region,(target|antitarget)}coverage.cnn"}, "reference": {"reference": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.reference.cnn"}, "fix": {"ratios": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr"}, "segment": { - "segments": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segments.cns", + "segments": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns", "dataframe": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.rds", }, - "call": {"calls": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.calls.cns"}, + "call": {"calls": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.call.cns"}, "bintest": 
{"tests": "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.bintest.cns"}, "metrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.metrics.tsv"}, "genemetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.genemetrics.tsv"}, - "segmetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.segmetrics.tsv"}, + "segmetrics": {"report": "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.segmetrics.cns"}, "scatter": {"plot": "work/{mapper}.cnvkit.{library_name}/plot/{mapper}.cnvkit.{library_name}.scatter.{contig_name}.jpeg"}, } for action, result in actions.items(): @@ -494,14 +611,16 @@ def test_cnvkit_step_parts_get_log_file(somatic_cnv_calling_workflow): assert actual == expected -def test_cnvkit_step_parts_get_log_file_access(somatic_cnv_calling_workflow): - """Tests CnvkitStepPart.get_log_file() for access""" +def test_cnvkit_step_parts_get_log_file_no_lib(somatic_cnv_calling_workflow): + """Tests CnvkitStepPart.get_log_file() for all actions not dependent on library""" exts = (("conda_info", "conda_info.txt"), ("conda_list", "conda_list.txt"), ("log", "log"), ("sh", "sh")) - base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.access" - result = {k: base_log + f".{v}" for k, v in exts} - expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} - actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", "access") - assert actual == expected + actions = ("ignore", "access") + base_log = "work/{mapper}.cnvkit/log/{mapper}.cnvkit." + for action in actions: + result = {k: base_log + f"{action}.{v}" for k, v in exts} + expected = result | {k + "_md5": v + ".md5" for k, v in result.items()} + actual = somatic_cnv_calling_workflow.get_log_file("cnvkit", action) + assert actual == expected def test_cnvkit_step_parts_get_log_file_coverage(somatic_cnv_calling_workflow): @@ -542,7 +661,7 @@ def test_somatic_cnv_calling_workflow(somatic_cnv_calling_workflow): tpl = "output/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" expected += [ tpl.format(mapper=mapper, library_name=library_name, ext=ext) - for ext in ("cnr", "segments.cns", "calls.cns", "bintest.cns") + for ext in ("cnr", "cns", "call.cns", "bintest.cns") for library_name in tumor_libraries for mapper in ("bwa",) ] @@ -561,8 +680,7 @@ def test_somatic_cnv_calling_workflow(somatic_cnv_calling_workflow): tpl = "output/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}.{step}.{ext}" expected += [ tpl.format(mapper=mapper, library_name=library_name, step=step, ext=ext) - for ext in ("tsv",) - for step in ("metrics", "genemetrics", "segmetrics") + for step, ext in (("metrics", "tsv"), ("genemetrics", "tsv"), ("segmetrics", "cns")) for library_name in tumor_libraries for mapper in ("bwa",) ] From 3542c5badee0cd5c1273581a901f045f781f4c81 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Fri, 6 Dec 2024 12:34:50 +0100 Subject: [PATCH 42/46] docs: Added somatic cnv calling step --- docs/index.rst | 4 +-- docs/somatic_cnv.rst | 49 ++++++++++++++++++++++++++----- docs/step/somatic_cnv_calling.rst | 7 +++++ 3 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 docs/step/somatic_cnv_calling.rst diff --git a/docs/index.rst b/docs/index.rst index 19f7cc3cd..c4493e7a0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -68,13 +68,13 @@ Project Info step/igv_session_generation step/ngs_data_qc step/ngs_mapping + 
  step/panel_of_normals
   step/somatic_gene_fusion_calling
   step/somatic_purity_ploidy_estimate
-  step/somatic_targeted_seq_cnv_calling
+  step/somatic_cnv_calling
   step/somatic_variant_annotation
   step/somatic_variant_calling
   step/somatic_variant_filtration
-  step/somatic_wgs_cnv_calling
   step/somatic_wgs_sv_calling
   step/sv_calling_targeted
   step/targeted_seq_mei_calling
diff --git a/docs/somatic_cnv.rst b/docs/somatic_cnv.rst
index bc2e1e005..68e817f04 100644
--- a/docs/somatic_cnv.rst
+++ b/docs/somatic_cnv.rst
@@ -4,17 +4,47 @@
 Somatic CNV calling
 -------------------
 
-Somatic variant calling is implemented differently for exome and whole genome data.
-
-The whole genome data "branch" is currently under review, as GRCh38 support in ``Control-FREEC`` (the main workhorse for WGS CNV calling) is not complete.
-CNV calling in WGS data can also be done using ``cnvkit``, but its pipeline implementation is also incomplete.
-
-The following documentation is restricted to the tools currently implemented to process exome data: ``cnvkit``, ``purecn`` & ``sequenza``.
+Somatic CNV calling was implemented differently for exome and whole genome data.
+We are aiming to merge the two data types, but this isn't complete yet, and the new ``somatic_cnv_calling`` step coexists
+with the former ``somatic_wgs_cnv_calling`` & ``somatic_targeted_seq_cnv_calling`` steps.
+
+Among the tools for CNV calling, some are restricted to WES (``PureCN`` & ``Sequenza``), others to WGS (``Control-FREEC``) and
+others can support both (``CNVkit``).
+
+We started by implementing the ``cnvkit`` tool in the new step. Support for the other tools will follow,
+although a complete implementation is difficult. The reasons are:
+
+- Many of these tools (in particular ``CNVkit`` & ``PureCN``) are modular, which allows them flexibility, but
+  at the expense of additional complexity in the pipeline.
+- Some tools require a *reference*, or *panel of normals*, which can be either obtained from public data, or created from the cohort itself.
+  In the latter case, the pipeline must trigger panel of normals creation when needed.
+- Germline & somatic variants can be used by some tools to improve segmentation & calling.
+  Again, the pipeline must trigger creation of those.
+- When available, purity & ploidy values improve CNV calling. While some tools can estimate these values internally,
+  others require running an additional step.
+
+So in the future we might find ourselves in a position where, to generate CNV calls by ``CNVkit``, the chain of data
+requests goes as follows (a sketch of how such a chain can be resolved follows the list):
+
+1. ``cnvkit`` requires a panel of normals, purity/ploidy from tool ``purecn`` in the ``somatic_purity_ploidy_estimate`` step, &
+   germline/somatic variants from tool ``mutect2`` in the ``somatic_variant_calling`` step.
+2. The ``panel_of_normals`` step is triggered for tool ``cnvkit``, and in parallel, the ``somatic_variant_calling`` step
+   is triggered for tool ``mutect2``.
+3. For ``purecn``, the ``somatic_purity_ploidy_estimate`` step is simply a copy of the results obtained in
+   the ``somatic_cnv_calling`` step, as ``PureCN`` estimates purity & ploidy during CNV calling.
+   So the ``somatic_cnv_calling`` step must be run for ``purecn``.
+4. But ``purecn`` itself requires a panel of normals, & germline/somatic variants produced in the ``somatic_variant_calling`` step,
+   by the ``mutect2`` tool.
+5. The ``panel_of_normals`` step is triggered for tool ``purecn``, while the ``somatic_variant_calling`` step, already triggered
+   at ``cnvkit`` request, may or may not be computed.
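+
+A minimal sketch of how such a chain could be resolved, assuming a simple table mapping each
+(step, tool) pair to the pairs it depends on. The table and the function below are illustrative
+only (they are not the actual pipeline API), but they show why ``mutect2`` is requested twice
+and computed only once:
+
+.. code-block:: python
+
+    # Hypothetical dependency table for the example above; the real pipeline
+    # derives the corresponding edges from its configuration.
+    DEPENDENCIES: dict[tuple[str, str], list[tuple[str, str]]] = {
+        ("somatic_cnv_calling", "cnvkit"): [
+            ("panel_of_normals", "cnvkit"),
+            ("somatic_purity_ploidy_estimate", "purecn"),
+            ("somatic_variant_calling", "mutect2"),
+        ],
+        ("somatic_purity_ploidy_estimate", "purecn"): [
+            ("somatic_cnv_calling", "purecn"),
+        ],
+        ("somatic_cnv_calling", "purecn"): [
+            ("panel_of_normals", "purecn"),
+            ("somatic_variant_calling", "mutect2"),
+        ],
+    }
+
+    def resolve(target: tuple[str, str]) -> list[tuple[str, str]]:
+        """Return all (step, tool) pairs needed for ``target``, prerequisites first,
+        each pair exactly once (depth-first post-order over the dependency table)."""
+        ordered: list[tuple[str, str]] = []
+        seen: set[tuple[str, str]] = set()
+
+        def visit(node: tuple[str, str]) -> None:
+            if node in seen:  # already scheduled, e.g. mutect2 requested by both callers
+                return
+            seen.add(node)
+            for dep in DEPENDENCIES.get(node, []):
+                visit(dep)
+            ordered.append(node)
+
+        visit(target)
+        return ordered
+
+    # resolve(("somatic_cnv_calling", "cnvkit")) schedules the mutect2 variant
+    # calling step only once, although both cnvkit and purecn request it.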
+
+So the final dependencies graph is quite complex, and it assumes that the ``mutect2`` variant generation parameters
+necessary for ``CNVkit`` are compatible with those required by ``PureCN``.
 
 Output & performance
 ====================
 
-The 3 methods generally broadly agree on the log ratio of coverage between tumor & normal samples.
+For WES data, the 3 methods currently implemented (``CNVkit``, ``PureCN`` & ``Sequenza``) broadly agree
+on the log ratio of coverage between tumor & normal samples.
 However, the segmentation and the number of copies assigned to a segment can be quite different between the algorithms.
 
@@ -26,6 +56,10 @@
 In absence of a better solution, all CNV tools implemented in somatic pipeline output the segmentation results in DNAcopy format.
 The copy number call may or may not be present, and the chromosome number is replaced by its name.
 The segmentation output is in file ``output/../out/.._dnacopy.seg``.
 
+The new step doesn't follow this convention, but keeps the output files generated by ``cnvkit`` unchanged.
+When a final decision is made regarding the best format(s) to describe CNVs, we will implement an ``export`` sub-step
+which converts private formats to public one(s).
+
 Genome support
 --------------
 
@@ -93,4 +127,3 @@ From the ``panel_of_normals`` directory, ``purecn`` requires 3 types of files:
 
 - the ``panel_of_normals`` itself, and the ``mapping_bias`` objects are taken from ``.purecn/out``. This is because they might change with different mapping tools.
 - the ``intervals`` taken from ``purecn/out``, as the definition of intervals depend only on the genome & the exome kit, but not on the mapping tool.
 - the ``container`` taken from ``work/containers/out``, to ensure that the ``PureCN`` version used to compute copy number variants is identical to that used to compute the panel of normals.
-
diff --git a/docs/step/somatic_cnv_calling.rst b/docs/step/somatic_cnv_calling.rst
new file mode 100644
index 000000000..71e502a35
--- /dev/null
+++ b/docs/step/somatic_cnv_calling.rst
@@ -0,0 +1,7 @@
+.. _step_somatic_cnv_calling:
+
+============================
+Somatic Copy Number Variants
+============================
+
+.. automodule:: snappy_pipeline.workflows.somatic_cnv_calling
From a85d9cf15c70c832aa1c45a3b445c132bc9dde0b Mon Sep 17 00:00:00 2001
From: Eric Blanc
Date: Fri, 6 Dec 2024 12:37:05 +0100
Subject: [PATCH 43/46] fix: Use cnvkit batch default for WGS

---
 snappy_pipeline/models/cnvkit.py                          | 5 +++--
 snappy_pipeline/workflows/somatic_cnv_calling/__init__.py | 5 ++++-
 snappy_wrappers/wrappers/cnvkit/segment/wrapper.py        | 5 +++--
 .../workflows/test_workflows_somatic_cnv_calling.py       | 4 +---
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py
index 2063d52f7..6831e5ad4 100644
--- a/snappy_pipeline/models/cnvkit.py
+++ b/snappy_pipeline/models/cnvkit.py
@@ -152,11 +152,12 @@ class Fix(SnappyModel):
 class Segment(SnappyModel):
     method: SegmentationMethod = SegmentationMethod.CBS
     """Segmentation method, or 'none' for chromosome arm-level averages as segments"""
-    threshold: float
+    threshold: float | None = None
     """
     Significance threshold (p-value or FDR, depending on method) to accept breakpoints during segmentation.
    For HMM methods, this is the smoothing window size.
+    When missing, it is automatically set to 1e-6 in WGS mode.
     """
     drop_outliers: int = 10
     """Drop outlier bins more than this many multiples of the 95th quantile away from the average within a rolling window. Set to 0 for no outlier filtering."""
    Set to 0 for no outlier filtering."""
@@ -355,7 +356,7 @@ class CnvkitToReference(SnappyModel):
 
 class Cnvkit(CnvkitToReference):
     fix: Fix = Fix()
-    segment: Segment
+    segment: Segment = Segment()
     call: Call = Call()
     bintest: Bintest = Bintest()
 
diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
index 8120483ca..b48b224c3 100644
--- a/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
+++ b/snappy_pipeline/workflows/somatic_cnv_calling/__init__.py
@@ -878,10 +878,13 @@ def _get_args_segment(self, wildcards: Wildcards, input: InputFiles) -> dict[str
         # Segmentation parameters
         args = {
             "method": self.cfg.segment.method,
-            "threshold": self.cfg.segment.threshold,
             "drop-outliers": self.cfg.segment.drop_outliers,
             "drop-low-coverage": self.cfg.drop_low_coverage,
         }
+        if self.cfg.segment.threshold is not None:
+            args["threshold"] = self.cfg.segment.threshold
+        elif self.is_wgs:
+            args["threshold"] = 1e-6
         if self.cfg.segment.method == CnvkitSegmentationMethod.CBS:
             args["smooth-cbs"] = self.cfg.segment.smooth_cbs
         if self.cfg.somatic_variant_calling.enabled:
diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
index 614e4b458..784d66eb5 100644
--- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py
@@ -22,7 +22,7 @@
 """.format(
         snakemake=snakemake,
         args=args,
-        zygocity_freq=f"--zygocity_freq {args['zygocity-freq']}" if "zygocity-freq" in args else ""
+        zygocity_freq=f"--zygocity_freq {args['zygocity-freq']}" if args.get("zygocity-freq", None) is not None else ""
     )
 else:
     variants = ""
@@ -30,7 +30,7 @@
 cmd = r"""
 cnvkit.py segment --processes {snakemake.resources._cores} \
     -o {snakemake.output.segments} --dataframe {snakemake.output.dataframe} \
-    --method {args[method]} --threshold {args[threshold]} {smooth_cbs} \
+    --method {args[method]} {threshold} {smooth_cbs} \
     {drop_low_coverage} --drop-outliers {args[drop-outliers]} \
     {variants} \
     {snakemake.input.ratios}
 """.format(
     snakemake=snakemake,
     args=args,
     variants=variants,
+    threshold=f"--threshold {args['threshold']}" if args.get("threshold", None) is not None else "",
     smooth_cbs="--smooth-cbs" if args.get("smooth-cbs", False) else "",
     drop_low_coverage="--drop-low-coverage" if args.get("drop-low-coverage", False) else "",
 )
diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py
index b245f4f35..31382ee88 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_calling.py
@@ -68,8 +68,6 @@ def minimal_config():
             enabled: True
             source: cohort
             tool: ascat
-          segment:
-            threshold: 0.0001
           scatter:
             enabled: true
 
@@ -354,7 +352,7 @@ def test_cnvkit_step_part_get_args_segment(somatic_cnv_calling_workflow):
     )
     expected = {
         "method": "cbs",
-        "threshold": 0.0001,
+        "threshold": 1e-6,
         "smooth-cbs": False,
         "drop-low-coverage": False,
         "drop-outliers": 10,

From 18159890330cd9446a46cfdbfc4f0ae488aba26e Mon Sep 17 00:00:00 2001
From: Eric Blanc
Date: Fri, 6 Dec 2024 12:38:31 +0100
Subject: [PATCH 44/46] refactor: Remove cnvkit from somatic_targeted_seq_cnv_calling step

---
 .../__init__.py                                    | 260 ----------
 .../somatic_targeted_seq_cnv_calling/model.py      |   5 +-
 ...kflows_somatic_targeted_seq_cnv_calling.py      | 465 ------------------
 3 files changed, 1 insertion(+), 729
deletions(-) diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py index 6440fe5f2..bec3d9bd3 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py @@ -57,7 +57,6 @@ Available Somatic Targeted CNV Caller ===================================== -- ``cnvkit`` - ``sequenza`` - ``purecn``. Note that ``purecn`` requires a panel of normals and a second set of variants called by ``mutect2``, that includes germline ones. - ``copywriter`` (deprecated, the `R` package was removed with Bioconductor release 3.18) @@ -556,263 +555,6 @@ def get_log_file(self, action): return self._get_log_file_from_prefix(prefix) -class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): - """Perform somatic targeted CNV calling using cnvkit""" - - #: Step name - name = "cnvkit" - - #: Class available actions - actions = ( - "coverage", - "fix", - "segment", - "call", - "postprocess", - "export", - "plot", - "report", - ) - - # Overwrite defaults - default_resource_usage = ResourceUsage(threads=1, time="03:59:59", memory="7680M") # 4h - - #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). - resource_usage = { - "plot": ResourceUsage( - threads=1, - time="08:00:00", # 1 day - memory=f"{30 * 1024}M", - ), - "coverage": ResourceUsage( - threads=8, - time="08:00:00", # 8 hours - memory=f"{16 * 1024}M", - ), - } - - def __init__(self, parent): - super().__init__(parent) - - def get_input_files(self, action): - """Return input paths input function, dependent on rule""" - # Validate action - self._validate_action(action) - method_mapping = { - "coverage": self._get_input_files_coverage, - "call": self._get_input_files_call, - "fix": self._get_input_files_fix, - "segment": self._get_input_files_segment, - "postprocess": self._get_input_files_postprocess, - "export": self._get_input_files_export, - "plot": self._get_input_files_plot, - "report": self._get_input_files_report, - } - return method_mapping[action] - - def _get_input_files_coverage(self, wildcards): - # BAM/BAI file - ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) - input_files = { - "bam": ngs_mapping(base_path + ".bam"), - "bai": ngs_mapping(base_path + ".bam.bai"), - } - return input_files - - @staticmethod - def _get_input_files_fix(wildcards): - tpl_base = "{mapper}.cnvkit.{library_name}" - tpl = "work/" + tpl_base + "/out/" + tpl_base + ".{target}coverage.cnn" - input_files = { - "target": tpl.format(target="target", **wildcards), - "antitarget": tpl.format(target="antitarget", **wildcards), - } - return input_files - - @staticmethod - def _get_input_files_segment(wildcards): - cnr_pattern = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" - input_files = {"cnr": cnr_pattern.format(**wildcards)} - return input_files - - @staticmethod - def _get_input_files_call(wildcards): - segment_pattern = ( - "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" - ) - input_files = {"segment": segment_pattern.format(**wildcards)} - return input_files - - @staticmethod - def _get_input_files_postprocess(wildcards): - segment_pattern = ( - "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" - ) - call_pattern = ( - 
"work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.call.cns" - ) - input_files = { - "segment": segment_pattern.format(**wildcards), - "call": call_pattern.format(**wildcards), - } - return input_files - - @staticmethod - def _get_input_files_export(wildcards): - cns_pattern = ( - "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.call.cns" - ) - input_files = {"cns": cns_pattern.format(**wildcards)} - return input_files - - @staticmethod - def _get_input_files_plot(wildcards): - tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" - input_files = { - "cnr": tpl.format(ext="cnr", **wildcards), - "cns": tpl.format(ext="call.cns", **wildcards), - } - return input_files - - def _get_input_files_report(self, wildcards): - tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" - input_files = { - "target": tpl.format(ext="targetcoverage.cnn", **wildcards), - "antitarget": tpl.format(ext="antitargetcoverage.cnn", **wildcards), - "cnr": tpl.format(ext="cnr", **wildcards), - "cns": tpl.format(ext="call.cns", **wildcards), - } - return input_files - - def get_output_files(self, action): - """Return output files for the given action""" - if action == "coverage": - return self._get_output_files_coverage() - elif action == "fix": - return self._get_output_files_fix() - elif action == "segment": - return self._get_output_files_segment() - elif action == "call": - return self._get_output_files_call() - elif action == "postprocess": - return self._get_output_files_postprocess() - elif action == "export": - return self._get_output_files_export() - elif action == "plot": - return self._get_output_files_plot() - elif action == "report": - return self._get_output_files_report() - else: - self._validate_action(action) - - @staticmethod - def _get_output_files_coverage(): - name_pattern = "{mapper}.cnvkit.{library_name}" - output_files = {} - for target in ("target", "antitarget"): - output_files[target] = os.path.join( - "work", name_pattern, "out", name_pattern + ".{}coverage.cnn".format(target) - ) - output_files[target + "_md5"] = output_files[target] + ".md5" - return output_files - - @staticmethod - def _get_output_files_fix(): - name_pattern = "{mapper}.cnvkit.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cnr") - return {"ratios": tpl, "ratios_md5": tpl + ".md5"} - - @staticmethod - def _get_output_files_segment(): - name_pattern = "{mapper}.cnvkit.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".segment.cns") - return {"segments": tpl, "segments_md5": tpl + ".md5"} - - @staticmethod - def _get_output_files_call(): - name_pattern = "{mapper}.cnvkit.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".call.cns") - return {"calls": tpl, "calls_md5": tpl + ".md5"} - - @staticmethod - def _get_output_files_postprocess(): - name_pattern = "{mapper}.cnvkit.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + "_dnacopy.seg") - return { - "final": tpl, - "final_md5": tpl + ".md5", - } - - @dictify - def _get_output_files_plot(self): - plots = (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) - chrom_plots = (("heatmap", "pdf"), ("scatter", "png")) - chroms = list(chain(range(1, 23), ["X", "Y"])) - output_files = {} - # Yield file name pairs for global plots - tpl = ( - "work/{{mapper}}.cnvkit.{{library_name}}/report/" - "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}" - 
) - for plot, ext in plots: - output_files[plot] = tpl.format(plot=plot, ext=ext) - output_files[plot + "_md5"] = output_files[plot] + ".md5" - # Yield file name pairs for the chromosome-wise plots - tpl_chrom = ( - "work/{{mapper}}.cnvkit.{{library_name}}/report/" - "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}" - ) - for plot, ext in chrom_plots: - for chrom in chroms: - key = "{plot}_chr{chrom}".format(plot=plot, chrom=chrom) - output_files[key] = tpl_chrom.format(plot=plot, ext=ext, chrom=chrom) - output_files[key + "_md5"] = output_files[key] + ".md5" - return output_files - - @staticmethod - def _get_output_files_export(): - exports = ( - ("bed", "bed.gz"), - ("bed_tbi", "bed.gz.tbi"), - ("seg", "seg"), - ("vcf", "vcf.gz"), - ("vcf_tbi", "vcf.gz.tbi"), - ) - output_files = {} - tpl = ( - "work/{{mapper}}.cnvkit.{{library_name}}/out/" - "{{mapper}}.cnvkit.{{library_name}}.{ext}" - ) - for export, ext in exports: - output_files[export] = tpl.format(export=export, ext=ext) - output_files[export + "_md5"] = output_files[export] + ".md5" - return output_files - - @dictify - def _get_output_files_report(self): - reports = ("breaks", "genemetrics", "segmetrics", "sex", "metrics") - output_files = {} - tpl = ( - "work/{{mapper}}.cnvkit.{{library_name}}/report/" - "{{mapper}}.cnvkit.{{library_name}}.{report}.txt" - ) - for report in reports: - output_files[report] = tpl.format(report=report) - output_files[report + "_md5"] = output_files[report] + ".md5" - return output_files - - def get_log_file(self, action): - """Return path to log file for the given action""" - # Validate action - self._validate_action(action) - prefix = ( - "work/{{mapper}}.cnvkit.{{library_name}}/log/" - "{{mapper}}.cnvkit.{action}.{{library_name}}" - ).format(action=action) - return self._get_log_file_from_prefix(prefix) - - class CopywriterStepPart(SomaticTargetedSeqCnvCallingStepPart): """Perform somatic targeted CNV calling using CopywriteR""" @@ -972,7 +714,6 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ( CnvettiOffTargetStepPart, CnvettiOnTargetStepPart, - CnvKitStepPart, CopywriterStepPart, SequenzaStepPart, PureCNStepPart, @@ -992,7 +733,6 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) def get_result_files(self): """Return list of result files for the somatic targeted sequencing CNV calling step""" tool_actions = { - "cnvkit": ["fix", "postprocess", "report", "export"], "sequenza": ("coverage", "run"), "purecn": ("run",), "copywriter": ("call",), diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py index 1fbacf9d7..d9633ef9b 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py @@ -4,11 +4,9 @@ from pydantic import Field from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators -from snappy_pipeline.models.cnvkit import Cnvkit class Tool(enum.StrEnum): - cnvkit = "cnvkit" sequenza = "sequenza" copywriter = "copywriter" cnvetti_on_target = "cnvetti_on_target" @@ -230,10 +228,9 @@ class CnvettiOffTarget(SnappyModel): class SomaticTargetedSeqCnvCalling(SnappyStepModel, validators.ToolsMixin): - tools: Annotated[list[Tool], EnumField(Tool, [Tool.cnvkit], min_length=1)] + tools: Annotated[list[Tool], EnumField(Tool, [Tool.purecn], min_length=1)] path_ngs_mapping: str = "../ngs_mapping" - cnvkit: Cnvkit | 
None = None sequenza: Sequenza | None = None copywriter: CopyWriter | None = None purecn: PureCn | None = None diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py index 815acdf73..f6d5b8bf7 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py @@ -41,14 +41,9 @@ def minimal_config(): somatic_targeted_seq_cnv_calling: tools: - cnvetti_on_target - # - cnvkit - copywriter - sequenza - purecn - # cnvkit: - # path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed - # path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed - # path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn purecn: path_container: /path/to/purecn/container path_intervals: /path/to/interval/list @@ -290,418 +285,6 @@ def test_cnvetti_on_target_step_part_get_resource_usage(somatic_targeted_seq_cnv assert actual == expected, msg_error -# Tests for CnvKitStepPart (coverage) ------------------------------------------------------------- - - -def test_cnvkit_coverage_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards( - fromdict={"mapper": "bwa", "target": "target", "library_name": "P001-T1-DNA1-WGS1"} - ) - expected = { - "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", - "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", - } - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "coverage")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_coverage_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" - expected = { - "target": base_name_out + ".targetcoverage.cnn", - "target_md5": base_name_out + ".targetcoverage.cnn.md5", - "antitarget": base_name_out + ".antitargetcoverage.cnn", - "antitarget_md5": base_name_out + ".antitargetcoverage.cnn.md5", - } - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "coverage") - - assert actual == expected - - -def test_cnvkit_coverage_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - base_file_name = ( - "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.coverage.{library_name}" - ) - expected = get_expected_log_files_dict(base_out=base_file_name) - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "coverage") - assert actual == expected - - -def test_cnvkit_coverage_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'coverage'""" - # Define expected - expected_dict = {"threads": 8, "time": "08:00:00", "memory": "16384M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "coverage", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (fix) ------------------------------------------------------------------ - - -def test_cnvkit_fix_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - coverage_base_out = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1" - expected = { - "antitarget": coverage_base_out + ".antitargetcoverage.cnn", - "target": coverage_base_out + ".targetcoverage.cnn", - } - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "fix")(wildcards) - assert actual == expected - - -def test_cnvkit_fix_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" - expected = {"ratios": base_name_out, "ratios_md5": base_name_out + ".md5"} - assert somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "fix") == expected - - -def test_cnvkit_fix_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.fix.{library_name}" - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "fix") - assert actual == expected - - -def test_cnvkit_fix_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource("cnvkit", "fix", resource)() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (segment) -------------------------------------------------------------- - - -def test_cnvkit_segment_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - expected = {"cnr": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"} - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "segment")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_segment_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - base_name_out = ( - "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" - ) - expected = {"segments": base_name_out, "segments_md5": base_name_out + ".md5"} - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "segment") - assert actual == expected - - -def test_cnvkit_segment_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.segment.{library_name}" - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "segment") - assert actual == expected - - -def test_cnvkit_segment_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "segment", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (call) ----------------------------------------------------------------- - - -def test_cnvkit_call_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - segment_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segment.cns" - expected = {"segment": segment_file} - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "call")(wildcards) - assert actual == expected - - -def test_cnvkit_call_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - base_name_out = ( - "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.call.cns" - ) - expected = {"calls": base_name_out, "calls_md5": base_name_out + ".md5"} - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "call") - assert actual == expected - - -def test_cnvkit_call_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.call.{library_name}" - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "call") - assert actual == expected - - -def test_cnvkit_call_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "call", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (postprocess) ---------------------------------------------------------- - - -def test_cnvkit_postprocess_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - segment_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segment.cns" - call_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.call.cns" - expected = {"segment": segment_file, "call": call_file} - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "postprocess")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_postprocess_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" - expected = { - "final": base_name_out + "_dnacopy.seg", - "final_md5": base_name_out + "_dnacopy.seg.md5", - } - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "postprocess") - assert actual == expected - - -def test_cnvkit_postprocess_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.postprocess.{library_name}" - ) - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "postprocess") - assert actual == expected - - -def test_cnvkit_postprocess_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'postprocess'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "postprocess", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (plot) ----------------------------------------------------------------- - - -def test_cnvkit_plot_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr" - cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.call.cns" - expected = { - "cnr": cnr_file, - "cns": cns_file, - } - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "plot")(wildcards) - assert actual == expected - - -def test_cnvkit_plot_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - expected = {} - tpl = ( - "work/{{mapper}}.cnvkit.{{library_name}}/report/" - "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}" - ) - for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")): - expected[plot] = tpl.format(plot=plot, ext=ext) - expected[plot + "_md5"] = expected[plot] + ".md5" - tpl = ( - "work/{{mapper}}.cnvkit.{{library_name}}/report/" - "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}" - ) - for plot, ext in (("heatmap", "pdf"), ("scatter", "png")): - for chrom in chain(range(1, 23), ("X", "Y")): - key = "{plot}_chr{chrom}".format(plot=plot, chrom=str(chrom)) - expected[key] = tpl.format(plot=plot, ext=ext, chrom=str(chrom)) - expected[key + "_md5"] = expected[key] + ".md5" - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "plot") - assert actual == expected - - -def test_cnvkit_plot_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - expected = get_expected_log_files_dict( - base_out="work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.plot.{library_name}" - ) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "plot") - assert actual == expected - - -def test_cnvkit_plot_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" - # Define expected - expected_dict = {"threads": 1, "time": "08:00:00", "memory": "30720M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "plot", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (export) --------------------------------------------------------------- - - -def test_cnvkit_export_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - expected = { - "cns": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.call.cns" - } - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "export")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_export_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - expected = {} - base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" - for key, ext in ( - ("bed", "bed.gz"), - ("bed_tbi", "bed.gz.tbi"), - ("seg", "seg"), - ("vcf", "vcf.gz"), - ("vcf_tbi", "vcf.gz.tbi"), - ): - expected[key] = base_name_out + "." + ext - expected[key + "_md5"] = expected[key] + ".md5" - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "export") - assert actual == expected - - -def test_cnvkit_export_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.export.{library_name}" - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "export") - assert actual == expected - - -def test_cnvkit_export_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "export", resource - )() - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (report) --------------------------------------------------------------- - - -def test_cnvkit_report_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr" - cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.call.cns" - target_file = ( - "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.targetcoverage.cnn" - ) - antitarget_file = ( - "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.antitargetcoverage.cnn" - ) - expected = { - "cnr": cnr_file, - "cns": cns_file, - "target": target_file, - "antitarget": antitarget_file, - } - # Get actual - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "report")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_report_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - expected = {} - base_name_out = "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}" - for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics"): - expected[report] = base_name_out + "." 
+ report + ".txt" - expected[report + "_md5"] = expected[report] + ".md5" - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "report") - assert actual == expected - - -def test_cnvkit_report_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.report.{library_name}" - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "report") - assert actual == expected - - -def test_cnvkit_report_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" - # Define expected - expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "report", resource - )() - assert actual == expected, msg_error - - # Tests for CopywriterStepPart ------------------------------------------------------------------- @@ -1046,7 +629,6 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call expected = [ "cnvetti_off_target", "cnvetti_on_target", - "cnvkit", "copywriter", "link_out", "purecn", @@ -1106,53 +688,6 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call "targets_segmented.txt.md5", ) ] - # cnvkit - # tpl = f"output/{name_pattern}/out/{name_pattern}".format(method="cnvkit") + "{ext}{md5}" - # expected += [ - # tpl.format(i=i, t=t, ext=ext, md5=md5) - # for i, t in ((1, 1), (2, 1), (2, 2)) - # for ext in ( - # ".cnr", - # "_dnacopy.seg", - # ".bed.gz", - # ".bed.gz.tbi", - # ".seg", - # ".vcf.gz", - # ".vcf.gz.tbi", - # ) - # for md5 in ("", ".md5") - # ] - # tpl = ( - # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - # + ".{plot}.{ext}{md5}" - # ) - # expected += [ - # tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) - # for i, t in ((1, 1), (2, 1), (2, 2)) - # for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) - # for md5 in ("", ".md5") - # ] - # tpl = ( - # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - # + ".{plot}.chr{chrom}.{ext}{md5}" - # ) - # expected += [ - # tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) - # for i, t in ((1, 1), (2, 1), (2, 2)) - # for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) - # for chrom in chain(range(1, 23), ("X", "Y")) - # for md5 in ("", ".md5") - # ] - # tpl = ( - # f"output/{name_pattern}/report/{name_pattern}".format(method="cnvkit") - # + ".{report}.txt{md5}" - # ) - # expected += [ - # tpl.format(i=i, t=t, report=report, md5=md5) - # for i, t in ((1, 1), (2, 1), (2, 2)) - # for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") - # for md5 in ("", ".md5") - # ] # copywriter tpl = f"output/{name_pattern}/out/{name_pattern}".format(method="copywriter") + "_{ext}{md5}" expected += [ From f486af28a7b2a875f1f5b8423830f3c44b23d3ca Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Fri, 6 Dec 2024 14:42:13 +0100 Subject: [PATCH 45/46] remove unused imports --- snappy_pipeline/workflows/panel_of_normals/__init__.py | 1 - snappy_pipeline/workflows/somatic_cnv_checking/__init__.py | 1 - .../workflows/somatic_targeted_seq_cnv_calling/__init__.py 
| 1 - 3 files changed, 3 deletions(-) diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index 4538a2ad7..ab1830be0 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -237,7 +237,6 @@ from snappy_pipeline.models.common import SexOrigin, SexValue from .model import PanelOfNormals as PanelOfNormalsConfigModel -from .model import PureCn as PureCnConfig from .model import CnvKit as CnvkitConfig __author__ = "Manuel Holtgrewe " diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py index 807a2f91d..9266614fe 100644 --- a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py @@ -63,7 +63,6 @@ from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand -from snappy_pipeline.base import InvalidConfiguration from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py index bec3d9bd3..a21ab0800 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py @@ -68,7 +68,6 @@ import os.path import sys from collections import OrderedDict -from itertools import chain from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand From 835acd89f27a3075c834c44e2b4fffafce1ec6ec Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Fri, 6 Dec 2024 14:45:46 +0100 Subject: [PATCH 46/46] snakefmt --- snappy_pipeline/workflows/panel_of_normals/Snakefile | 4 ++-- snappy_pipeline/workflows/somatic_cnv_calling/Snakefile | 1 - snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index fb5c8b6a0..27652cb95 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -106,7 +106,7 @@ rule panel_of_normals_mutect2_create_panel: rule panel_of_normals_cnvkit_ignore: input: - unpack(wf.get_input_files("cnvkit", "ignore")), + unpack(wf.get_input_files("cnvkit", "ignore")), output: **wf.get_output_files("cnvkit", "ignore"), threads: wf.get_resource("cnvkit", "ignore", "threads") @@ -124,7 +124,7 @@ rule panel_of_normals_cnvkit_ignore: rule panel_of_normals_cnvkit_access: input: - unpack(wf.get_input_files("cnvkit", "access")), + unpack(wf.get_input_files("cnvkit", "access")), output: **wf.get_output_files("cnvkit", "access"), threads: wf.get_resource("cnvkit", "access", "threads") diff --git a/snappy_pipeline/workflows/somatic_cnv_calling/Snakefile b/snappy_pipeline/workflows/somatic_cnv_calling/Snakefile index 23e608e34..12150d5a4 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/Snakefile +++ b/snappy_pipeline/workflows/somatic_cnv_calling/Snakefile @@ -58,4 +58,3 @@ rule somatic_cnv_calling_link_out_run: # cnvkit requires a large number of rules, thus externalized include: "cnvkit.rules" - diff --git 
a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules index a6a343e24..9783bd6eb 100644 --- a/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules +++ b/snappy_pipeline/workflows/somatic_cnv_calling/cnvkit.rules @@ -224,8 +224,8 @@ rule somatic_cnv_calling_cnvkit_bintest: # tmpdir=wf.get_resource("cnvkit", "plot/diagram", "tmpdir"), # wrapper: # wf.wrapper_path("cnvkit/plot/diagram") -# -# +# +# rule somatic_seq_cnv_calling_cnvkit_plot_scatter: input: unpack(wf.get_input_files("cnvkit", "scatter")),