Merge branch '414-defining-tumor-purityploidy' of github.com:bihealth…

…/snappy-pipeline into 414-defining-tumor-purityploidy
bihealth · Aug 30, 2023 · c6ba0a5 · c6ba0a5
2 parents 595ce3d + c1a215e
commit c6ba0a5
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 9 deletions.
diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/Snakefile b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/Snakefile
@@ -294,6 +294,8 @@ rule somatic_targeted_seq_cnv_calling_sequenza_report:
         **wf.get_input_files("sequenza", "report"),
     output:
         **wf.get_output_files("sequenza", "report"),
+    params:
+        sample_id=wf.get_params("sequenza", "report"),
     threads: wf.get_resource("sequenza", "report", "threads")
     resources:
         time=wf.get_resource("sequenza", "report", "time"),

diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py
@@ -121,7 +121,7 @@
 # Default configuration somatic_targeted_seq_cnv_calling
 step_config:
   somatic_targeted_seq_cnv_calling:
-    tools: ['cnvkit']  # REQUIRED - available: 'cnvkit', 'sequenza', 'cnvetti_on_target' and 'cnvetti_off_target'
+    tools: ['cnvkit']  # REQUIRED - available: 'cnvkit', 'sequenza', 'cnvetti_on_target', 'cnvetti_off_target' and 'copywriter' (deprecated)
     path_ngs_mapping: ../ngs_mapping  # REQUIRED
     cnvkit:
       path_target: REQUIRED             # Usually ../panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed
@@ -165,14 +165,25 @@
       smooth_bootstrap: False           # [segmetrics] Smooth bootstrap results
     sequenza:
       length: 50
-      extra_args: {} # Extra arguments for sequenza bam2seqz, valid values:
-      # hom: 0.9     # Threshold to select homozygous positions
-      # het: 0.25    # Threshold to select heterozygous positions
-      # qlimit: 20   # Minimum nucleotide quality score for inclusion in the counts
-      # qformat: "sanger" # Quality format, options are "sanger" or "illumina". This will add an offset of 33 or 64 respectively to the qlimit value
+      assembly: "hg19"     # Must be hg38 for GRCh38. See copynumber for complete list (augmented with hg38)
+      extra_args: {}       # Extra arguments for sequenza bam2seqz, valid values:
+      # hom: 0.9           # Threshold to select homozygous positions
+      # het: 0.25          # Threshold to select heterozygous positions
+      # qlimit: 20         # Minimum nucleotide quality score for inclusion in the counts
+      # qformat: "sanger"  # Quality format, options are "sanger" or "illumina". This will add an offset of 33 or 64 respectively to the qlimit value
       ignore_chroms: # patterns of chromosome names to ignore
         [X, Y, MT, NC_007605. hs37d5, chrEBV, '*_decoy', 'HLA-*', 'GL000220.*']  # Genome hs37d5
       # [chrX, chrY, chrM, '*_random', 'chrUn_*', chrEBV, '*_decoy', 'HPV*', CMV, HBV, KSHV, MCV, SV40, 'HCV-*', 'HIV-*', 'HTLV-*']  # Genome GRch38.d1.vd1
+      extra_args_extract:        # Valid arguments: see ?sequenza::sequenza.extract in R
+        gamma: 60                # scarHRD value
+        kmin: 50                 # scarHRD value
+      extra_args_fit:            # Valid arguments: see ?sequenza::sequenza.fit in R
+        N.ratio.filter: 10       # scarHRD value
+        N.BAF.filter: 1          # scarHRD value
+        segment.filter: 3000000  # scarHRD value
+        mufreq.treshold: 0.1     # scarHRD value
+        ratio.priority: False    # scarHRD value
+        ploidy: [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5]
     copywriter:
       path_target_regions: REQUIRED # REQUIRED
       bin_size: 20000 # TODO: make actually configurable
@@ -519,6 +530,13 @@ def get_output_files(self, action):
                 )
             )
 
+    def get_params(self, action):
+        self._validate_action(action)
+        return self._get_params_report
+
+    def _get_params_report(self, wildcards):
+        return wildcards["library_name"]
+
     @dictify
     def get_log_file(self, action):
         """Return dict of log files."""

diff --git a/snappy_wrappers/wrappers/sequenza/report/wrapper.py b/snappy_wrappers/wrappers/sequenza/report/wrapper.py
@@ -2,11 +2,48 @@
 """
 
 import os
+import sys
+
+# The following is required for being able to import snappy_wrappers modules
+# inside wrappers.  These run in an "inner" snakemake process which uses its
+# own conda environment which cannot see the snappy_pipeline installation.
+base_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+sys.path.insert(0, base_dir)
 
 from snakemake import shell
 
+from snappy_wrappers.tools.genome_windows import yield_contigs
+
 __author__ = "Eric Blanc <[email protected]>"
 
+
+def config_to_r(x):
+    if x is None:
+        return "NULL"
+    if isinstance(x, str):
+        return f'"{x}"'
+    if isinstance(x, bool):
+        return "TRUE" if x else "FALSE"
+    if isinstance(x, list):
+        return "c({})".format(", ".join([config_to_r(xx) for xx in x]))
+    if isinstance(x, dict):
+        return "list({})".format(
+            ", ".join(['"{}"={}'.format(k, config_to_r(v)) for k, v in x.items()])
+        )
+    return str(x)
+
+
+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]["sequenza"]
+genome = snakemake.config["static_data_config"]["reference"]["path"]
+
+f = open(genome + ".fai", "rt")
+contigs = config_to_r(list(yield_contigs(f, config.get("ignore_chroms"))))
+f.close()
+
+args_extract = config_to_r(dict(config["extra_args_extract"]))
+args_fit = config_to_r(dict(config["extra_args_fit"]))
+
 shell.executable("/bin/bash")
 
 shell(
@@ -35,13 +72,21 @@
 R --vanilla --slave << __EOF
 library(sequenza)
 
-seqz <- sequenza.extract("{snakemake.input.seqz}")
-CP <- sequenza.fit(seqz)
+args <- list(file="{snakemake.input.seqz}", assembly="{config[assembly]}", chromosome.list={contigs})
+args <- c(args, {args_extract})
+seqz <- do.call(sequenza.extract, args=args)
+
+args <- list(sequenza.extract=seqz, chromosome.list={contigs}, mc.cores=1)
+args <- c(args, {args_fit})
+CP <- do.call(sequenza.fit, args=args)
+
 sequenza.results(sequenza.extract=seqz, cp.table=CP, sample.id="{snakemake.params[sample_id]}", out.dir=dirname("{snakemake.output.done}"))
 
 __EOF
 
 pushd $(dirname {snakemake.output.done}) ; for f in $(ls) ; do md5sum $f > $f.md5 ; done ; popd
+
+touch {snakemake.output.done}
 """
 )
 

diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py
@@ -879,7 +879,6 @@ def test_sequenza_step_part_get_log_file_run(somatic_targeted_seq_cnv_calling_wo
 
 def test_sequenza_step_part_get_input_files_report(somatic_targeted_seq_cnv_calling_workflow):
     """Tests SequenzaStepPart.get_input_files() - action 'report'"""
-    # wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"})
     expected = {
         "packages": "work/R_packages/out/sequenza.done",
         "seqz": "work/{mapper}.sequenza.{library_name}/out/{mapper}.sequenza.{library_name}.seqz.gz",
@@ -895,6 +894,14 @@ def test_sequenza_step_part_get_output_files_report(somatic_targeted_seq_cnv_cal
     assert actual == expected
 
 
+def test_sequenza_step_part_get_params_report(somatic_targeted_seq_cnv_calling_workflow):
+    """Tests SequenzaStepPart.get_params() - action 'report'"""
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"})
+    expected = "P001-T1-DNA1-WGS1"
+    actual = somatic_targeted_seq_cnv_calling_workflow.get_params("sequenza", "report")(wildcards)
+    assert actual == expected
+
+
 def test_sequenza_step_part_get_log_file_report(somatic_targeted_seq_cnv_calling_workflow):
     """Tests SequenzaStepPart.get_log_file() - action 'report'"""
     base_name = "work/{mapper}.sequenza.{library_name}/log/{mapper}.sequenza.{library_name}.report"