Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: (#405) somatic variant qc #407

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions snappy_pipeline/apps/snappy_snake.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed, please make this part of the existing somatic_variant_checking step instead of adding a new one.

Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
somatic_variant_expression,
somatic_variant_filtration,
somatic_variant_signatures,
somatic_variant_qc,
somatic_wgs_cnv_calling,
somatic_wgs_sv_calling,
sv_calling_targeted,
Expand Down Expand Up @@ -99,6 +100,7 @@
"somatic_variant_expression": somatic_variant_expression,
"somatic_variant_filtration": somatic_variant_filtration,
"somatic_variant_signatures": somatic_variant_signatures,
"somatic_variant_qc": somatic_variant_qc,
"somatic_wgs_cnv_calling": somatic_wgs_cnv_calling,
"somatic_wgs_sv_calling": somatic_wgs_sv_calling,
"sv_calling_targeted": sv_calling_targeted,
Expand Down
64 changes: 64 additions & 0 deletions snappy_pipeline/workflows/somatic_variant_qc/Snakefile
giacuong171 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
"""CUBI Pipeline somatic_variant_qc step Snakefile"""

import os

from snappy_pipeline import expand_ref
from snappy_pipeline.workflows.somatic_variant_qc import (
    SomaticVariantQCWorkflow,
)

__author__ = "Gia Cuong Pham"
__email__ = "[email protected]"


# Configuration ===============================================================


configfile: "config.yaml"


# Expand "$ref" JSON pointers in configuration (also works for YAML)
config, lookup_paths, config_paths = expand_ref("config.yaml", config)

# WorkflowImpl Object Setup ===================================================
wf = SomaticVariantQCWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())


localrules:
    # Linking files from work/ to output/ should be done locally
    somatic_variant_qc_link_out_run,


rule all:
    input:
        wf.get_result_files(),


# Generic linking out ---------------------------------------------------------


rule somatic_variant_qc_link_out_run:
    input:
        wf.get_input_files("link_out", "run"),
    output:
        wf.get_output_files("link_out", "run"),
    run:
        shell(wf.get_shell_cmd("link_out", "run", wildcards))


# Run the somatic variant QC tool (one JSON per mapper/caller/tumor library) --


rule somatic_variant_qc:
    input:
        **wf.get_input_files("SVQC_step", "run"),
    output:
        **wf.get_output_files("SVQC_step", "run"),
    threads: wf.get_resource("SVQC_step", "run", "threads")
    resources:
        time=wf.get_resource("SVQC_step", "run", "time"),
        memory=wf.get_resource("SVQC_step", "run", "memory"),
        partition=wf.get_resource("SVQC_step", "run", "partition"),
        tmpdir=wf.get_resource("SVQC_step", "run", "tmpdir"),
    log:
        **wf.get_log_file("SVQC_step", "run"),
    wrapper:
        wf.wrapper_path("somatic_variants_qc")
223 changes: 223 additions & 0 deletions snappy_pipeline/workflows/somatic_variant_qc/__init__.py
giacuong171 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
from collections import OrderedDict
import os
import sys

from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
from snakemake.io import expand

from snappy_pipeline.base import UnsupportedActionException
from snappy_pipeline.utils import dictify, listify
from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart
from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage
from snappy_pipeline.workflows.somatic_variant_calling import (
SOMATIC_VARIANT_CALLERS_MATCHED,
SomaticVariantCallingWorkflow,
)

#: Extensions of files to create as main payload
EXT_VALUES = (".json", ".json.md5")

#: Names of the files to create for the extension
EXT_NAMES = ("json", "json_md5")

#: Default configuration for the somatic_variant_qc step
DEFAULT_CONFIG = r"""
step_config:
  somatic_variant_qc:
    path_somatic_variant_calling: ../somatic_variant_calling  # REQUIRED
    tools_ngs_mapping: []  # default to those configured for ngs_mapping
    tools_somatic_variant_calling: []  # default to those configured for somatic_variant_calling
    target_regions:  # REQUIRED
    padding: 0  # Used to count the number of variants outside of exome + padding
    repeated_regions: []  # regions excluded from counting (repeats, extreme GC, PAR-Y, ...)
"""


class SomaticVariantQCStepPart(BaseStepPart):
    """Step part computing QC metrics on somatic variant calls.

    Consumes the per-pair VCF files produced by the ``somatic_variant_calling``
    step and produces one JSON file with QC statistics per
    mapper/caller/tumor-library combination.
    """

    #: Step part name, referenced from the Snakefile.
    name = "SVQC_step"

    #: Only the "run" action is supported.
    actions = ("run",)

    def __init__(self, parent):
        super().__init__(parent)
        # Mapping from tumor DNA NGS library name to its sample pair.
        # NOTE(review): only the tumor VCF appears to be consumed by this
        # step part -- confirm whether the normal sample mapping is needed.
        self.tumor_ngs_library_to_sample_pair = OrderedDict()
        for sheet in self.parent.shortcut_sheets:
            # update function of OrderedDict
            self.tumor_ngs_library_to_sample_pair.update(
                sheet.all_sample_pairs_by_tumor_dna_ngs_library
            )
        # Build mapping from donor name to donor.
        self.donors = OrderedDict()
        for sheet in self.parent.shortcut_sheets:
            for donor in sheet.donors:
                self.donors[donor.name] = donor

    @dictify
    def get_input_files(self, action):
        """Return input VCF paths (resolved via the somatic_variant_calling
        sub workflow) for the given action.

        :param action: Action in the workflow; only ``"run"`` is valid.
        """
        self._validate_action(action)
        tpl = (
            "output/{mapper}.{var_caller}.{tumor_library}/out/"
            "{mapper}.{var_caller}.{tumor_library}"
        )
        # NOTE(review): statistics are collected on the full VCF as well;
        # some (future) callers may not produce a full VCF -- confirm.
        key_ext = {
            "full_vcf": ".full.vcf.gz",
            "full_vcf_tbi": ".full.vcf.gz.tbi",
            "passed_vcf": ".vcf.gz",
            "passed_vcf_tbi": ".vcf.gz.tbi",
        }
        variant_calling = self.parent.sub_workflows["somatic_variant_calling"]
        for key, ext in key_ext.items():
            yield key, variant_calling(tpl + ext)

    @dictify
    def get_output_files(self, action):
        """Return output file paths (JSON + md5) for the given action."""
        # Validate action
        self._validate_action(action)
        prefix = (
            "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/out/"
            "{mapper}.{var_caller}.variantsqc.{tumor_library}"
        )
        key_ext = {"json": ".json"}
        for key, ext in key_ext.items():
            yield key, prefix + ext
            yield key + "_md5", prefix + ext + ".md5"

    @dictify
    def _get_log_file(self, action):
        """Return log file paths (log + conda env dumps) for the given action."""
        assert action == "run"
        prefix = (
            "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/"
            "{mapper}.{var_caller}.variantsqc.{tumor_library}"
        )
        key_ext = (
            ("log", ".log"),
            ("conda_info", ".conda_info.txt"),
            ("conda_list", ".conda_list.txt"),
        )
        for key, ext in key_ext:
            yield key, prefix + ext

    def get_resource_usage(self, action):
        """Get Resource Usage

        :param action: Action (i.e., step) in the workflow, example: 'run'.
        :type action: str
        :return: Returns ResourceUsage for step.
        :raises UnsupportedActionException: if action not in class defined list of valid actions.
        """
        if action not in self.actions:
            actions_str = ", ".join(self.actions)
            error_message = f"Action '{action}' is not supported. Valid options: {actions_str}"
            raise UnsupportedActionException(error_message)
        mem_mb = 4 * 1024  # 4GB
        # NOTE(review): 4 threads requested -- confirm the wrapper is
        # actually multithreaded before keeping this above 1.
        return ResourceUsage(
            threads=4,
            time="1:00:00",  # 1 hour
            memory=f"{mem_mb}M",
        )


class SomaticVariantQCWorkflow(BaseStep):
    """Perform gathering variant information (somatic variant QC step)."""

    #: Workflow name.
    name = "somaticvariantqc"

    #: Shortcut sheet class to use for cancer sample sheets.
    sheet_shortcut_class = CancerCaseSheet

    #: Allow sample sheets with missing normal or tumor samples.
    sheet_shortcut_kwargs = {
        "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
    }

    @classmethod
    def default_config_yaml(cls):
        """Return default config YAML, to be overwritten by project-specific one."""
        return DEFAULT_CONFIG

    def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir):
        super().__init__(
            workflow,
            config,
            config_lookup_paths,
            config_paths,
            workdir,
            (SomaticVariantCallingWorkflow, NgsMappingWorkflow),
        )
        # Register sub step classes so the sub steps are available
        self.register_sub_step_classes((SomaticVariantQCStepPart, LinkOutStepPart))
        # Register sub workflows
        self.register_sub_workflow(
            "somatic_variant_calling",
            self.w_config["step_config"]["somatic_variant_qc"]["path_somatic_variant_calling"],
        )
        # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
        if not self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"]:
            self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"] = self.w_config[
                "step_config"
            ]["ngs_mapping"]["tools"]["dna"]
        if not self.w_config["step_config"]["somatic_variant_qc"]["tools_somatic_variant_calling"]:
            self.w_config["step_config"]["somatic_variant_qc"][
                "tools_somatic_variant_calling"
            ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"]

    @listify
    def get_result_files(self):
        """Yield all result file paths (JSON payload and log files) for all
        configured mapper/caller combinations and all tumor libraries.
        """
        callers = set(
            self.w_config["step_config"]["somatic_variant_qc"]["tools_somatic_variant_calling"]
        )
        name_pattern = "{mapper}.{caller}.variantsqc.{tumor_library.name}"
        # Main JSON payload (+ md5 checksums).
        yield from self._yield_result_files_matched(
            os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
            mapper=self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"],
            caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
            ext=EXT_VALUES,
        )
        # Log files (+ md5 checksums).
        yield from self._yield_result_files_matched(
            os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
            mapper=self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"],
            caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
            ext=(
                ".log",
                ".log.md5",
                ".conda_info.txt",
                ".conda_info.txt.md5",
                ".conda_list.txt",
                ".conda_list.txt.md5",
            ),
        )

    def _yield_result_files_matched(self, tpl, **kwargs):
        """Build output paths from path template and extension list.

        This function returns the results from the matched somatic variant callers such as
        Mutect.  Sample pairs missing either the tumor or the normal DNA NGS
        library are reported on stderr and skipped.
        """
        for sheet in filter(is_not_background, self.shortcut_sheets):
            for sample_pair in sheet.all_sample_pairs:
                if (
                    not sample_pair.tumor_sample.dna_ngs_library
                    or not sample_pair.normal_sample.dna_ngs_library
                ):
                    msg = (
                        "INFO: sample pair for cancer bio sample {} is missing primary "
                        "normal or primary cancer NGS library"
                    )
                    print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
                    continue
                yield from expand(
                    tpl,
                    tumor_library=[sample_pair.tumor_sample.dna_ngs_library],
                    **kwargs,
                )

    def check_config(self):
        """Check that the path to the NGS mapping is present"""
        self.ensure_w_config(
            ("step_config", "somatic_variant_qc", "path_somatic_variant_calling"),
            "Path to variant calling not configured but required for somatic variant qc",
        )

        self.ensure_w_config(
            ("step_config", "somatic_variant_qc", "target_regions"),
            "Path to target_regions file (bed format)"
            "not configured but required for somatic variant qc",
        )
3 changes: 2 additions & 1 deletion snappy_pipeline/workflows/tumor_mutational_burden/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ from snappy_pipeline.workflows.tumor_mutational_burden import (
TumorMutationalBurdenCalculationWorkflow,
)

__author__ = ""
__author__ = "Gia Cuong Pham"
__email__ = "[email protected]"


# Configuration ===============================================================
Expand Down
2 changes: 1 addition & 1 deletion snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is another missing feature/bug in the wrapper: you may want to test if the bed file is compressed or not, before awk. You may want to go along the lines of:

cat_cmd=zcat
gzip -t $bed_file || cat_cmd=cat
total_exom_length=$($cat_cmd $bed_file | \
    awk '{{dis+=$3-$2}} END {{print dis}}')

Note that you can't make a test, because snakemake runs scripts in strict mode, so any command returning a non-zero exit status will stop the wrapper.

Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@
"Number_snvs": $number_snvs,
"Number_indels": $number_indels,
"Total_regions_length": $total_exom_length
}
EOF
pushd $(dirname {snakemake.output.json})
md5sum $(basename {snakemake.output.json}) > $(basename {snakemake.output.json_md5})
}
"""
)

Expand Down
8 changes: 8 additions & 0 deletions snappy_wrappers/wrappers/somatic_variants_qc/environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
channels:
- conda-forge
- bioconda
dependencies:
- cyvcf2
- pip=20.0.2
- pip:
- pytabix
Loading