From c1a215ed31f50d4237e8a5f7f285733241eebffe Mon Sep 17 00:00:00 2001 From: ericblanc20 Date: Wed, 30 Aug 2023 17:44:48 +0200 Subject: [PATCH] feat: added support for sequenza (414-2) (#436) --- .../Snakefile | 73 +++++++ .../__init__.py | 199 ++++++++++++++---- snappy_wrappers/wrappers/r/wrapper.py | 4 +- .../wrappers/sequenza/environment.yaml | 10 + .../sequenza/gcreference/environment.yaml | 1 + .../wrappers/sequenza/gcreference/wrapper.py | 50 +++++ .../wrappers/sequenza/report/environment.yaml | 1 + .../wrappers/sequenza/report/wrapper.py | 98 +++++++++ .../wrappers/sequenza/run/environment.yaml | 1 + .../wrappers/sequenza/run/wrapper.py | 71 +++++++ ...kflows_somatic_targeted_seq_cnv_calling.py | 134 +++++++++++- 11 files changed, 601 insertions(+), 41 deletions(-) create mode 100644 snappy_wrappers/wrappers/sequenza/environment.yaml create mode 120000 snappy_wrappers/wrappers/sequenza/gcreference/environment.yaml create mode 100644 snappy_wrappers/wrappers/sequenza/gcreference/wrapper.py create mode 120000 snappy_wrappers/wrappers/sequenza/report/environment.yaml create mode 100644 snappy_wrappers/wrappers/sequenza/report/wrapper.py create mode 120000 snappy_wrappers/wrappers/sequenza/run/environment.yaml create mode 100644 snappy_wrappers/wrappers/sequenza/run/wrapper.py diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/Snakefile b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/Snakefile index 91d449628..37c8217da 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/Snakefile +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/Snakefile @@ -233,3 +233,76 @@ rule somatic_targeted_seq_cnv_calling_cnvetti_off_target_postprocess: **wf.get_log_file("cnvetti_off_target", "postprocess"), wrapper: wf.wrapper_path("cnvetti/on_target/postprocess") + + +# Run sequenza ---------------------------------------------------- + + +rule somatic_targeted_seq_cnv_calling_sequenza_install: + output: + **wf.get_output_files("sequenza", "install"), + params: + packages=[ + {"name": "aroneklund/copynumber", "repo": "github"}, + ], + threads: wf.get_resource("sequenza", "install", "threads") + resources: + time=wf.get_resource("sequenza", "install", "time"), + memory=wf.get_resource("sequenza", "install", "memory"), + partition=wf.get_resource("sequenza", "install", "partition"), + tmpdir=wf.get_resource("sequenza", "install", "tmpdir"), + log: + **wf.get_log_file("sequenza", "install"), + wrapper: + wf.wrapper_path("r") + + +rule somatic_targeted_seq_cnv_calling_sequenza_gcreference: + output: + **wf.get_output_files("sequenza", "gcreference"), + threads: wf.get_resource("sequenza", "gcreference", "threads") + resources: + time=wf.get_resource("sequenza", "gcreference", "time"), + memory=wf.get_resource("sequenza", "gcreference", "memory"), + partition=wf.get_resource("sequenza", "gcreference", "partition"), + tmpdir=wf.get_resource("sequenza", "gcreference", "tmpdir"), + log: + **wf.get_log_file("sequenza", "gcreference"), + wrapper: + wf.wrapper_path("sequenza/gcreference") + + +rule somatic_targeted_seq_cnv_calling_sequenza_run: + input: + unpack(wf.get_input_files("sequenza", "run")), + output: + **wf.get_output_files("sequenza", "run"), + threads: wf.get_resource("sequenza", "run", "threads") + resources: + time=wf.get_resource("sequenza", "run", "time"), + memory=wf.get_resource("sequenza", "run", "memory"), + partition=wf.get_resource("sequenza", "run", "partition"), + tmpdir=wf.get_resource("sequenza", "run", "tmpdir"), + log: + **wf.get_log_file("sequenza", "run"), + wrapper: + wf.wrapper_path("sequenza/run") + + +rule somatic_targeted_seq_cnv_calling_sequenza_report: + input: + **wf.get_input_files("sequenza", "report"), + output: + **wf.get_output_files("sequenza", "report"), + params: + sample_id=wf.get_params("sequenza", "report"), + threads: wf.get_resource("sequenza", "report", "threads") + resources: + time=wf.get_resource("sequenza", "report", "time"), + memory=wf.get_resource("sequenza", "report", "memory"), + partition=wf.get_resource("sequenza", "report", "partition"), + tmpdir=wf.get_resource("sequenza", "report", "tmpdir"), + log: + **wf.get_log_file("sequenza", "report"), + wrapper: + wf.wrapper_path("sequenza/report") diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py index 4f80a78c5..484247adc 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py @@ -104,6 +104,7 @@ from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -120,7 +121,7 @@ # Default configuration somatic_targeted_seq_cnv_calling step_config: somatic_targeted_seq_cnv_calling: - tools: ['cnvkit'] # REQUIRED - available: 'cnvkit', 'copywriter', 'cnvetti_on_target' and 'cnvetti_off_target' + tools: ['cnvkit'] # REQUIRED - available: 'cnvkit', 'sequenza', 'cnvetti_on_target', 'cnvetti_off_target' and 'copywriter' (deprecated) path_ngs_mapping: ../ngs_mapping # REQUIRED cnvkit: path_target: REQUIRED # Usually ../panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed @@ -162,6 +163,27 @@ segmetrics_alpha: 0.05 # [segmetrics] Significance cutoff segmetrics_bootstrap: 100 # [segmetrics] Number of bootstraps smooth_bootstrap: False # [segmetrics] Smooth bootstrap results + sequenza: + length: 50 + assembly: "hg19" # Must be hg38 for GRCh38. See copynumber for complete list (augmented with hg38) + extra_args: {} # Extra arguments for sequenza bam2seqz, valid values: + # hom: 0.9 # Threshold to select homozygous positions + # het: 0.25 # Threshold to select heterozygous positions + # qlimit: 20 # Minimum nucleotide quality score for inclusion in the counts + # qformat: "sanger" # Quality format, options are "sanger" or "illumina". This will add an offset of 33 or 64 respectively to the qlimit value + ignore_chroms: # patterns of chromosome names to ignore + [X, Y, MT, NC_007605. hs37d5, chrEBV, '*_decoy', 'HLA-*', 'GL000220.*'] # Genome hs37d5 + # [chrX, chrY, chrM, '*_random', 'chrUn_*', chrEBV, '*_decoy', 'HPV*', CMV, HBV, KSHV, MCV, SV40, 'HCV-*', 'HIV-*', 'HTLV-*'] # Genome GRch38.d1.vd1 + extra_args_extract: # Valid arguments: see ?sequenza::sequenza.extract in R + gamma: 60 # scarHRD value + kmin: 50 # scarHRD value + extra_args_fit: # Valid arguments: see ?sequenza::sequenza.fit in R + N.ratio.filter: 10 # scarHRD value + N.BAF.filter: 1 # scarHRD value + segment.filter: 3000000 # scarHRD value + mufreq.treshold: 0.1 # scarHRD value + ratio.priority: False # scarHRD value + ploidy: [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5] copywriter: path_target_regions: REQUIRED # REQUIRED bin_size: 20000 # TODO: make actually configurable @@ -420,6 +442,134 @@ def format_id(*args): return {key: "{{{}}}".format(key) for key in args} +class SequenzaStepPart(SomaticTargetedSeqCnvCallingStepPart): + """Perform somatic targeted CNV calling using sequenza""" + + #: Step name + name = "sequenza" + + #: Class available actions + actions = ( + "install", + "gcreference", + "run", + "report", + ) + + #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). + resource_usage = { + "run": ResourceUsage( + threads=4, + time="24:00:00", # 1 day + memory="16G", + ), + } + + def __init__(self, parent): + super().__init__(parent) + + def get_input_files(self, action): + """Return input paths input function, dependent on rule""" + # Validate action + self._validate_action(action) + + method_mapping = { + "run": self._get_input_files_run(), + "report": self._get_input_files_report(), + } + return method_mapping[action] + + def _get_input_files_run(self): + @dictify + def input_function(wildcards): + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + normal_base_path = ( + "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format( + normal_library=self.get_normal_lib_name(wildcards), **wildcards + ) + ) + tumor_base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format( + **wildcards + ) + yield "gc", "work/static_data/out/sequenza.{length}.wig.gz".format( + length=self.config["sequenza"]["length"], + ) + yield "normal_bam", ngs_mapping(normal_base_path + ".bam") + yield "normal_bai", ngs_mapping(normal_base_path + ".bam.bai") + yield "tumor_bam", ngs_mapping(tumor_base_path + ".bam") + yield "tumor_bai", ngs_mapping(tumor_base_path + ".bam.bai") + + return input_function + + def _get_input_files_report(self): + return { + "packages": "work/R_packages/out/sequenza.done", + "seqz": "work/{mapper}.sequenza.{library_name}/out/{mapper}.sequenza.{library_name}.seqz.gz", + } + + def get_output_files(self, action): + if action == "install": + return {"done": "work/R_packages/out/sequenza.done"} + elif action == "gcreference": + return { + "gc": "work/static_data/out/sequenza.{length}.wig.gz".format( + length=self.config["sequenza"]["length"], + ) + } + elif action == "run": + return { + "seqz": "work/{mapper}.sequenza.{library_name}/out/{mapper}.sequenza.{library_name}.seqz.gz", + "seqz_md5": "work/{mapper}.sequenza.{library_name}/out/{mapper}.sequenza.{library_name}.seqz.gz.md5", + } + elif action == "report": + return {"done": "work/{mapper}.sequenza.{library_name}/report/.done"} + else: + raise UnsupportedActionException( + "Action '{action}' is not supported. Valid options: {valid}".format( + action=action, valid=", ".join(self.actions) + ) + ) + + def get_params(self, action): + self._validate_action(action) + return self._get_params_report + + def _get_params_report(self, wildcards): + return wildcards["library_name"] + + @dictify + def get_log_file(self, action): + """Return dict of log files.""" + if action == "install": + prefix = "work/R_packages/log/sequenza" + elif action == "gcreference": + prefix = "work/static_data/log/sequenza.{length}".format( + length=self.config["sequenza"]["length"], + ) + elif action == "run": + prefix = ( + "work/{mapper}.sequenza.{library_name}/log/{mapper}.sequenza.{library_name}.run" + ) + elif action == "report": + prefix = ( + "work/{mapper}.sequenza.{library_name}/log/{mapper}.sequenza.{library_name}.report" + ) + else: + raise UnsupportedActionException( + "Action '{action}' is not supported. Valid options: {valid}".format( + action=action, valid=", ".join(self.actions) + ) + ) + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): """Perform somatic targeted CNV calling using cnvkit""" @@ -438,8 +588,11 @@ class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): "report", ) + # Overwrite defaults + default_resource_usage = ResourceUsage(threads=1, time="03:59:59", memory="7680M") # 4h + #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). - resource_usage_dict = { + resource_usage = { "plot": ResourceUsage( threads=1, time="08:00:00", # 1 day @@ -450,11 +603,6 @@ class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): time="08:00:00", # 8 hours memory=f"{16 * 1024}M", ), - "default": ResourceUsage( - threads=1, - time="03:59:59", # 1 day - memory=f"{int(7.5 * 1024)}M", - ), } def __init__(self, parent): @@ -700,23 +848,6 @@ def get_log_file(self, action): log_files[key + "_md5"] = prefix + ext + ".md5" return log_files - def get_resource_usage(self, action): - """Get Resource Usage - - :param action: Action (i.e., step) in the workflow, example: 'run'. - :type action: str - - :return: Returns ResourceUsage for step. - """ - # Validate action - self._validate_action(action) - if action == "plot": - return self.resource_usage_dict.get("plot") - elif action == "coverage": - return self.resource_usage_dict.get("coverage") - else: - return self.resource_usage_dict.get("default") - class CopywriterStepPart(SomaticTargetedSeqCnvCallingStepPart): """Perform somatic targeted CNV calling using CopywriteR""" @@ -731,7 +862,7 @@ class CopywriterStepPart(SomaticTargetedSeqCnvCallingStepPart): actions_w_in_out = ("run", "call") #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). - resource_usage_dict = { + resource_usage = { "prepare": ResourceUsage( threads=1, time="02:00:00", # 2 hours @@ -868,18 +999,6 @@ def _get_log_file(self, action): for key, ext in key_ext: yield key, tpl + ext - def get_resource_usage(self, action): - """Get Resource Usage - - :param action: Action (i.e., step) in the workflow, example: 'run'. - :type action: str - - :return: Returns ResourceUsage for step. - """ - # Validate action - self._validate_action(action) - return self.resource_usage_dict.get(action) - class SomaticTargetedSeqCnvCallingWorkflow(BaseStep): """Perform somatic targeted sequencing CNV calling""" @@ -915,6 +1034,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) CnvettiOnTargetStepPart, CnvKitStepPart, CopywriterStepPart, + SequenzaStepPart, LinkOutStepPart, ) ) @@ -926,6 +1046,7 @@ def get_result_files(self): """Return list of result files for the somatic targeted sequencing CNV calling step""" tool_actions = { "cnvkit": ["fix", "postprocess", "report", "export"], + "sequenza": ("run",), "copywriter": ("call",), "cnvetti_on_target": ("coverage", "segment", "postprocess"), "cnvetti_off_target": ("coverage", "segment", "postprocess"), @@ -951,7 +1072,7 @@ def get_result_files(self): except AttributeError: tpls = [self.sub_steps[tool].get_output_files(action)] try: - tpls += self.sub_steps[tool].get_log_file(action).values() + tpls += list(self.sub_steps[tool].get_log_file(action).values()) except AttributeError: tpls += [self.sub_steps[tool].get_log_file(action)] for tpl in tpls: @@ -961,7 +1082,7 @@ def get_result_files(self): library_name=[sample_pair.tumor_sample.dna_ngs_library.name], ) for f in filenames: - if ".tmp." not in f: + if ".tmp." not in f and not f.endswith("/.done"): yield f.replace("work/", "output/") def check_config(self): diff --git a/snappy_wrappers/wrappers/r/wrapper.py b/snappy_wrappers/wrappers/r/wrapper.py index a8452b96a..e43570431 100644 --- a/snappy_wrappers/wrappers/r/wrapper.py +++ b/snappy_wrappers/wrappers/r/wrapper.py @@ -26,7 +26,9 @@ to_install[package["repo"]].append(package["name"]) else: to_install["cran"].append(package["name"]) -to_install = {k: "c({})".format(", ".join(['"{}"'.format(vv) for vv in v])) for k, v in to_install.items()} +to_install = { + k: "c({})".format(", ".join(['"{}"'.format(vv) for vv in v])) for k, v in to_install.items() +} shell.executable("/bin/bash") diff --git a/snappy_wrappers/wrappers/sequenza/environment.yaml b/snappy_wrappers/wrappers/sequenza/environment.yaml new file mode 100644 index 000000000..b6a8c8b49 --- /dev/null +++ b/snappy_wrappers/wrappers/sequenza/environment.yaml @@ -0,0 +1,10 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python =3.9 + - sequenza-utils + - r-sequenza + - r-devtools + - r-data.table + - samtools diff --git a/snappy_wrappers/wrappers/sequenza/gcreference/environment.yaml b/snappy_wrappers/wrappers/sequenza/gcreference/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/sequenza/gcreference/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/sequenza/gcreference/wrapper.py b/snappy_wrappers/wrappers/sequenza/gcreference/wrapper.py new file mode 100644 index 000000000..0b230a26d --- /dev/null +++ b/snappy_wrappers/wrappers/sequenza/gcreference/wrapper.py @@ -0,0 +1,50 @@ +"""CUBI+Snakemake wrapper code for sequenza GC reference file +""" + +import os + +from snakemake import shell + +__author__ = "Eric Blanc " + +step = snakemake.config["pipeline_step"]["name"] +genome = snakemake.config["static_data_config"]["reference"]["path"] +length = snakemake.config["step_config"][step]["sequenza"]["length"] + +shell.executable("/bin/bash") + +shell( + r""" +set -x + +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +sequenza-utils gc_wiggle --fasta {genome} -w {length} -o {snakemake.output.gc} + +pushd $(dirname {snakemake.output.gc}) +md5sum $(basename {snakemake.output.gc}) > $(basename {snakemake.output.gc}).md5 +popd +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/sequenza/report/environment.yaml b/snappy_wrappers/wrappers/sequenza/report/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/sequenza/report/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/sequenza/report/wrapper.py b/snappy_wrappers/wrappers/sequenza/report/wrapper.py new file mode 100644 index 000000000..28c1b18a6 --- /dev/null +++ b/snappy_wrappers/wrappers/sequenza/report/wrapper.py @@ -0,0 +1,98 @@ +"""CUBI+Snakemake wrapper code for sequenza (R part, post-processing) +""" + +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snakemake import shell + +from snappy_wrappers.tools.genome_windows import yield_contigs + +__author__ = "Eric Blanc " + + +def config_to_r(x): + if x is None: + return "NULL" + if isinstance(x, str): + return f'"{x}"' + if isinstance(x, bool): + return "TRUE" if x else "FALSE" + if isinstance(x, list): + return "c({})".format(", ".join([config_to_r(xx) for xx in x])) + if isinstance(x, dict): + return "list({})".format( + ", ".join(['"{}"={}'.format(k, config_to_r(v)) for k, v in x.items()]) + ) + return str(x) + + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["sequenza"] +genome = snakemake.config["static_data_config"]["reference"]["path"] + +f = open(genome + ".fai", "rt") +contigs = config_to_r(list(yield_contigs(f, config.get("ignore_chroms")))) +f.close() + +args_extract = config_to_r(dict(config["extra_args_extract"])) +args_fit = config_to_r(dict(config["extra_args_fit"])) + +shell.executable("/bin/bash") + +shell( + r""" +set -x + +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +export R_LIBS_USER=$(dirname {snakemake.input.packages}) + +R --vanilla --slave << __EOF +library(sequenza) + +args <- list(file="{snakemake.input.seqz}", assembly="{config[assembly]}", chromosome.list={contigs}) +args <- c(args, {args_extract}) +seqz <- do.call(sequenza.extract, args=args) + +args <- list(sequenza.extract=seqz, chromosome.list={contigs}, mc.cores=1) +args <- c(args, {args_fit}) +CP <- do.call(sequenza.fit, args=args) + +sequenza.results(sequenza.extract=seqz, cp.table=CP, sample.id="{snakemake.params[sample_id]}", out.dir=dirname("{snakemake.output.done}")) + +__EOF + +pushd $(dirname {snakemake.output.done}) ; for f in $(ls) ; do md5sum $f > $f.md5 ; done ; popd + +touch {snakemake.output.done} +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/sequenza/run/environment.yaml b/snappy_wrappers/wrappers/sequenza/run/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/sequenza/run/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/sequenza/run/wrapper.py b/snappy_wrappers/wrappers/sequenza/run/wrapper.py new file mode 100644 index 000000000..13fa1f64d --- /dev/null +++ b/snappy_wrappers/wrappers/sequenza/run/wrapper.py @@ -0,0 +1,71 @@ +"""CUBI+Snakemake wrapper code for sequenza (sequenza-utils, pileups) +""" + +import os +import sys + +# The following is required for being able to import snappy_wrappers modules +# inside wrappers. These run in an "inner" snakemake process which uses its +# own conda environment which cannot see the snappy_pipeline installation. +base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) +sys.path.insert(0, base_dir) + +from snakemake import shell + +from snappy_wrappers.tools.genome_windows import yield_contigs + +__author__ = "Eric Blanc " + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["sequenza"] +genome = snakemake.config["static_data_config"]["reference"]["path"] +length = config["length"] + +f = open(genome + ".fai", "rt") +contigs = " ".join(yield_contigs(f, config.get("ignore_chroms"))) +f.close() + +extra_arguments = " ".join( + ["--{} {}".format(k, v) for k, v in config.get("extra_arguments", {}).items()] +) + +shell.executable("/bin/bash") + +shell( + r""" +set -x + +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +sequenza-utils bam2seqz \ + -gc {snakemake.input.gc} --fasta {genome} \ + -n {snakemake.input.normal_bam} --tumor {snakemake.input.tumor_bam} \ + -C {contigs} {extra_arguments} \ + | sequenza-utils seqz_binning -s - \ + -w {length} -o {snakemake.output.seqz} + +pushd $(dirname {snakemake.output.seqz}) ; f=$(basename {snakemake.output.seqz}) ; md5sum $f > $f.md5 ; popd +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py index ff622624c..527556a40 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py @@ -46,6 +46,7 @@ def minimal_config(): - cnvetti_on_target - cnvkit - copywriter + - sequenza cnvkit: path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed @@ -811,13 +812,134 @@ def test_copywriter_step_part_get_resource_usage_call(somatic_targeted_seq_cnv_c assert actual == expected, msg_error +# Tests for SequenzaStepPart ---------------------------------------------------------------------- + + +def test_sequenza_step_part_get_output_files_install(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_output_files() - action 'install'""" + expected = {"done": "work/R_packages/out/sequenza.done"} + actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("sequenza", "install") + assert actual == expected + + +def test_sequenza_step_part_get_log_file_install(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_log_file() - action 'install'""" + base_name = "work/R_packages/log/sequenza" + expected = get_expected_log_files_dict(base_out=base_name) + actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("sequenza", "install") + assert actual == expected + + +def test_sequenza_step_part_get_output_files_gcreference(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_output_files() - action 'gcreference'""" + expected = {"gc": "work/static_data/out/sequenza.50.wig.gz"} + actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("sequenza", "gcreference") + assert actual == expected + + +def test_sequenza_step_part_get_log_file_gcreference(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_log_file() - action 'gcreference'""" + base_name = "work/static_data/log/sequenza.50" + expected = get_expected_log_files_dict(base_out=base_name) + actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("sequenza", "gcreference") + assert actual == expected + + +def test_sequenza_step_part_get_input_files_run(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_input_files() - action 'run'""" + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + expected = { + "gc": "work/static_data/out/sequenza.50.wig.gz", + "normal_bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "normal_bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "tumor_bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", + "tumor_bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", + } + actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("sequenza", "run")(wildcards) + assert actual == expected + + +def test_sequenza_step_part_get_output_files_run(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_output_files() - action 'run'""" + expected = { + "seqz": "work/{mapper}.sequenza.{library_name}/out/{mapper}.sequenza.{library_name}.seqz.gz", + "seqz_md5": "work/{mapper}.sequenza.{library_name}/out/{mapper}.sequenza.{library_name}.seqz.gz.md5", + } + actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("sequenza", "run") + assert actual == expected + + +def test_sequenza_step_part_get_log_file_run(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_log_file() - action 'run'""" + base_name = "work/{mapper}.sequenza.{library_name}/log/{mapper}.sequenza.{library_name}.run" + expected = get_expected_log_files_dict(base_out=base_name) + actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("sequenza", "run") + assert actual == expected + + +def test_sequenza_step_part_get_input_files_report(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_input_files() - action 'report'""" + expected = { + "packages": "work/R_packages/out/sequenza.done", + "seqz": "work/{mapper}.sequenza.{library_name}/out/{mapper}.sequenza.{library_name}.seqz.gz", + } + actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("sequenza", "report") + assert actual == expected + + +def test_sequenza_step_part_get_output_files_report(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_output_files() - action 'report'""" + expected = {"done": "work/{mapper}.sequenza.{library_name}/report/.done"} + actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("sequenza", "report") + assert actual == expected + + +def test_sequenza_step_part_get_params_report(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_params() - action 'report'""" + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + expected = "P001-T1-DNA1-WGS1" + actual = somatic_targeted_seq_cnv_calling_workflow.get_params("sequenza", "report")(wildcards) + assert actual == expected + + +def test_sequenza_step_part_get_log_file_report(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_log_file() - action 'report'""" + base_name = "work/{mapper}.sequenza.{library_name}/log/{mapper}.sequenza.{library_name}.report" + expected = get_expected_log_files_dict(base_out=base_name) + actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("sequenza", "report") + assert actual == expected + + +def test_sequenza_step_part_get_resource_usage_call(somatic_targeted_seq_cnv_calling_workflow): + """Tests SequenzaStepPart.get_resource_usage()""" + # Define expected + expected_dicts = { + "run": {"threads": 4, "time": "24:00:00", "memory": "16G", "partition": "medium"}, + } + # Evaluate + for action, resources in expected_dicts.items(): + for resource, expected in resources.items(): + msg_error = f"Assertion error for resource '{resource}' in '{action}'." + actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( + "sequenza", action, resource + ) + assert actual == expected, msg_error + + # Tests for SomaticTargetedSeqCnvCallingWorkflow -------------------------------------------------- def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_calling_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["cnvetti_off_target", "cnvetti_on_target", "cnvkit", "copywriter", "link_out"] + expected = [ + "cnvetti_off_target", + "cnvetti_on_target", + "cnvkit", + "copywriter", + "link_out", + "sequenza", + ] actual = list(sorted(somatic_targeted_seq_cnv_calling_workflow.sub_steps.keys())) assert actual == expected @@ -928,6 +1050,16 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call "segments.txt.md5", ) ] + # sequenza + tpl = ( + "output/bwa.sequenza.P00{i}-T{t}-DNA1-WGS1/out/" + "bwa.sequenza.P00{i}-T{t}-DNA1-WGS1.seqz.gz{checksum}" + ) + expected += [ + tpl.format(i=i, t=t, checksum=checksum) + for i, t in ((1, 1), (2, 1), (2, 2)) + for checksum in ("", ".md5") + ] expected = list(sorted(expected)) actual = list(sorted(somatic_targeted_seq_cnv_calling_workflow.get_result_files())) # HACK TODO