From ad07ebfc98e6a024b61724fa35151024eed9f9b7 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Sat, 24 Jun 2023 21:19:16 +0200
Subject: [PATCH 01/17] Adding wrapper for somatic variants QC

---
 .../somatic_variants_qc/enviroment.yaml       |   8 +
 .../somatic_variants_qc/summarize-vcf.py      | 265 ++++++++++++++++++
 .../wrappers/somatic_variants_qc/wrapper.py   |  40 +++
 3 files changed, 313 insertions(+)
 create mode 100644 snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml
 create mode 100644 snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py
 create mode 100644 snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py

diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml b/snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml
new file mode 100644
index 000000000..beb5b1f3f
--- /dev/null
+++ b/snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml
@@ -0,0 +1,8 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - cyvcf2
+  - pip=20.0.2
+  - pip:
+    - pytabix
diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py
new file mode 100644
index 000000000..fb8ad450f
--- /dev/null
+++ b/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py
@@ -0,0 +1,265 @@
+# import packages
+
+import json  # for writing out the json file
+from cyvcf2 import VCF  # for reading vcf file
+import tabix
+from optparse import OptionParser
+
+
+parser = OptionParser()
+parser.add_option("--rawvcf", help="raw vcf file")
+parser.add_option("--filtered-vcf", help="filtered vcf file")
+parser.add_option(
+    "--exom-bedfile",
+    help="exom regions bed file",
+)
+parser.add_option("--padding", help="it's padding", default=0)
+parser.add_option(
+    "--repeated-region-bedfile",
+    help="bed file contains repeated or difficult to map regions",
+)
+parser.add_option(
+    "--output",
+    help="json file contains information about vcf file",
+)
+(options, args) = parser.parse_args()
+
+# from pybedtools import Interval,BedTool
+
+# I will use for loop here
+# It could be better by using multi threading.
+# TODO:
+# - Number of variants called (from the `*.full.vcf.gz`file) [X] Get all variants from full file
+# - Number of variants that pass all filters [X] Get all variants from fitlered file
+# - Number of variants passing the filters that are inside & outside of the exome bed file (with padding) [X]
+# - Number of variants in repeated or other difficult to map regions [ ] Need a bed file containing repeated
+# - Number of SNVs & indels [X]
+# - Indel lengths statistics [X]
+# - Number of variants with more than 1 ALT allele (I _think_ that this value should be 0 for all variants passing the filters) [ ]
+# - Number of SNVs with minimal support (only one read supporting the variant), possibly separating inside & outside exome bed [X]
+# - Number of SNVs with limited support (5 reads or less supporting the variant), possibly separating inside & outside exome bed [X]
+# - Number of SNVs with strong support (at least 10 reads, VAF greater or equal to 10%) [X] VAF need to be joined
+# - Number of SNVs in each mutation class (C>A, C>G, C>T, T>A, T>C, T>G), for minimal, limited, average & strong support [X] not for minimal,limited,average yet
+# - Metrics on strand bias could also be collected (F1R2 & F2R1 vcf FORMAT) [ ] PAUSE
+# - the VAF distribution. Perhaps we can split the variants (both SNVs & indels) into those with very little support (<1%), subclonal (<10%), the main part (<40%), affected by CNV (>40%)[ ]
+
+
+# Adding A>C G>T grouping
+# Table for VAF
+# Genotype check > report different genotypes
+# Checking number multi allelic
+# Support alternative allel in the normal
+# #alt in normal + average ALT + max ALT
+# snappy pipeline for this
+
+##### SNAPPY PIPELINE
+## WRAPPER
+# - Checking the configuration file
+# - Checking input file
+## PIPELINE
+# - __init__ :
+
+
+def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0):
+    # print(f'variant chrom: {v_chrom} - start: {v_start} - end:{v_end}')
+    record = bed_intervals.query(v_chrom, v_start - padding, v_end + padding)
+    if record:
+        return True
+    else:
+        return False
+
+
+# def calculate_exoms_length(bed_intervals):
+#     contigs= ["chr" + str(i) for i in range(1, 23)]
+#     contigs.append("chrX")
+#     contigs.append("chrY")
+#     length = 0
+#     for contig in contigs:
+#         records = bed_intervals.query(contig,0,250000000)
+#         for record in records:
+#             length = length + int(record[2]) - int(record[1])
+#     return length
+
+
+def get_variant_type(ref, alt):
+    if (len(alt) == 1) and (len(ref) == len(alt)):
+        return "snp"
+    elif len(alt) != len(ref):
+        return "indels"
+
+
+def check_sp_read(variant):
+    if get_variant_type(variant.REF, variant.ALT[0]) == "snp":
+        dp = variant.INFO["DP"]
+        if dp == 1:
+            return "minimal"
+        elif (dp > 1) and (dp <= 5):
+            return "limited"
+        if dp >= 10:
+            return "strong"
+
+
+def assign_class_snvs(variant, mt_mat):
+    temp = str(variant.REF) + ">" + str(variant.ALT[0])
+    # Following result from plot-vcfstats with bcftools
+    match temp:
+        case "A>C" | "C>A":
+            mt_mat[0] += 1
+        case "A>G" | "G>A":
+            mt_mat[1] += 1
+        case "A>T" | "T>A":
+            mt_mat[2] += 1
+        case "C>G" | "G>C":
+            mt_mat[3] += 1
+        case "C>T" | "T>C":
+            mt_mat[4] += 1
+        case "G>T" | "T>G":
+            mt_mat[5] += 1
+    return mt_mat
+
+
+def process_vcf_file(vcf_file, bed_file="", filter_file=False, padding=0):
+    total_v_count = 0
+    if filter_file:
+        # VAF
+        vaf = []
+        # numb of variants in and out of exoms
+        v_inside_exom = 0
+        v_outside_exom = 0
+        # numb of snps and indels
+        n_snps = 0
+        n_indels = 0
+        indels_length = []
+        # support read informations
+        minimal_rp_snvs_exom = 0
+        minimal_rp_snvs_nexom = 0
+        limited_rp_snvs_exom = 0
+        limited_rp_snvs_nexom = 0
+        strong_rp_snvs_exom = 0
+        strong_rp_snvs_nexom = 0
+        # mutation classes
+        mt_classes = [0] * 6
+        for variant in vcf_file:
+            vaf.append(variant.format("AF")[1])
+            total_v_count += 1
+            # Counting variants in or out of exom and variants with different support reads
+            if "chrUn" in variant.CHROM:
+                v_outside_exom += 1
+                if check_sp_read(variant) == "minimal":
+                    minimal_rp_snvs_nexom += 1
+                elif check_sp_read(variant) == "limited":
+                    limited_rp_snvs_nexom += 1
+                else:
+                    strong_rp_snvs_nexom += 1
+            elif check_variant_in_bed(
+                variant.CHROM, variant.start, variant.end, bed_file, padding=0
+            ):
+                v_inside_exom += 1
+                if check_sp_read(variant) == "minimal":
+                    minimal_rp_snvs_exom += 1
+                elif check_sp_read(variant) == "limited":
+                    limited_rp_snvs_exom += 1
+                else:
+                    strong_rp_snvs_exom += 1
+
+            else:
+                v_outside_exom += 1
+                if check_sp_read(variant) == "minimal":
+                    minimal_rp_snvs_nexom += 1
+                elif check_sp_read(variant) == "limited":
+                    limited_rp_snvs_nexom += 1
+                else:
+                    strong_rp_snvs_nexom += 1
+            # get number of snvs and indels
+            # Need to check multi allelic. Users shouldn't input multi allelic vcf file.
+            if get_variant_type(variant.REF, variant.ALT[0]) == "snp":
+                n_snps += 1
+                mt_classes = assign_class_snvs(variant, mt_classes)
+            else:
+                # More for indels
+                n_indels += 1
+                indels_length.append(abs(len(variant.REF) - len(variant.ALT[0])))
+
+        return (
+            total_v_count,
+            v_inside_exom,
+            v_outside_exom,
+            n_snps,
+            n_indels,
+            indels_length,
+            minimal_rp_snvs_exom,
+            minimal_rp_snvs_nexom,
+            limited_rp_snvs_exom,
+            limited_rp_snvs_nexom,
+            strong_rp_snvs_exom,
+            strong_rp_snvs_nexom,
+            mt_classes,
+            vaf,
+        )
+    else:
+        for variant in vcf_file:
+            total_v_count += 1
+        return total_v_count
+
+
+def main():
+    classes = ["A>C", "A>G", "A>T", "C>G", "C>T", "G>T"]
+    path_full_vcf = options.rawvcf
+    path_filtered_vcf = options.filtered_vcf
+    path_bedfile = options.exom_bedfile
+    outpath = options.output
+    full_vcf = VCF(path_full_vcf)
+    filter_vcf = VCF(path_filtered_vcf)
+    # header = filter_vcf.raw_header
+    sample = filter_vcf.samples[1]
+    # read bed file
+    bed_intervals = tabix.open(path_bedfile)
+    full_v_count = process_vcf_file(full_vcf)
+    (
+        passed_v_count,
+        v_inside_exom,
+        v_outside_exom,
+        n_snps,
+        n_indels,
+        indels_length,
+        minimal_rp_snvs_exom,
+        minimal_rp_snvs_nexom,
+        limited_rp_snvs_exom,
+        limited_rp_snvs_nexom,
+        strong_rp_snvs_exom,
+        strong_rp_snvs_nexom,
+        mt_classes,
+        vaf,
+    ) = process_vcf_file(filter_vcf, bed_intervals, filter_file=True, padding=options.padding)
+
+    # After one for loop through vcf file. The VCF will automatically be closed.
+    # length_exoms = calculate_exoms_length(bed_intervals)
+    # for variant in filter_vcf:
+    #     print(variant.format("AF"))
+    summary = {
+        "sample": sample,
+        "number_full_variants": full_v_count,
+        "number_passed_variants": passed_v_count,
+        "number_snvs_inside_exom": v_inside_exom,
+        "number_snvs_outside_exom": v_outside_exom,
+        "number_snps": n_snps,
+        "number_indels": n_indels,
+        "indels_length": indels_length,
+        "minimal_rp_snvs_exom": minimal_rp_snvs_exom,
+        "minimal_rp_snvs_nexom": minimal_rp_snvs_nexom,
+        "limited_rp_snvs_exom": limited_rp_snvs_exom,
+        "limited_rp_snvs_nexom": limited_rp_snvs_nexom,
+        "strong_rp_snvs_exom": strong_rp_snvs_exom,
+        "strong_rp_snvs_nexom": strong_rp_snvs_nexom,
+        "mutation_classes": mt_classes,
+        "classes": classes,
+        "VAF": vaf,
+    }
+    json_object = json.dumps(summary, indent=4)
+
+    with open(outpath, "w") as outfile:
+        outfile.write(json_object)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py
new file mode 100644
index 000000000..197240cdf
--- /dev/null
+++ b/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+"""Wrapper for summarize information of vcf file after
+somatic variant calling step
+"""
+from snakemake.shell import shell
+
+__author__ = "Pham Gia Cuong"
+__email__ = "pham.gia-cuong@bih-charite.de"
+
+shell(
+    r"""
+# -----------------------------------------------------------------------------
+# Redirect stderr to log file by default and enable printing executed commands
+exec 2> >(tee -a "{snakemake.log.log}")
+set -x
+# -----------------------------------------------------------------------------
+
+# Write out information about conda installation
+conda list > {snakemake.log.conda_list}
+conda info > {snakemake.log.conda_info}
+
+python summarize-vcf.py --rawvcf {snakemake.input.raw_vcf} \
+    --filtered-vcf {snakemake.input.passed_vcf} \
+    --exom-bedfile {snakemake.config[step_config][somatic_variants_QC][target_regions]} \
+    --padding {snakemake.config[step_config][somatic_variants_QC][padding]} \
+    --output {snakemake.output.json}
+
+pushd $(dirname {snakemake.output.json})
+md5sum $(basename {snakemake.output.json}) > $(basename {snakemake.output.json_md5})
+    """
+)
+
+# Compute MD5 sums of logs
+shell(
+    r"""
+md5sum {snakemake.log.log} > {snakemake.log.log_md5}
+md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5}
+md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5}
+"""
+)

From 94cda2782217ad0948962358a3731772c36934ac Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Sun, 25 Jun 2023 20:36:27 +0200
Subject: [PATCH 02/17] Adding somatic variant qc pipeline

---
 snappy_pipeline/apps/snappy_snake.py          |   2 +
 .../workflows/somatic_variant_qc/Snakefile    |  64 +++++
 .../workflows/somatic_variant_qc/__init__.py  | 223 ++++++++++++++++++
 .../tumor_mutational_burden/Snakefile         |   3 +-
 .../wrappers/bcftools/TMB/wrapper.py          |   2 +-
 .../{enviroment.yaml => environment.yaml}     |   0
 .../somatic_variants_qc/summarize-vcf.py      |  84 +++----
 .../wrappers/somatic_variants_qc/wrapper.py   |   8 +-
 8 files changed, 341 insertions(+), 45 deletions(-)
 create mode 100644 snappy_pipeline/workflows/somatic_variant_qc/Snakefile
 create mode 100644 snappy_pipeline/workflows/somatic_variant_qc/__init__.py
 rename snappy_wrappers/wrappers/somatic_variants_qc/{enviroment.yaml => environment.yaml} (100%)

diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py
index c47f50b94..9348adb36 100644
--- a/snappy_pipeline/apps/snappy_snake.py
+++ b/snappy_pipeline/apps/snappy_snake.py
@@ -44,6 +44,7 @@
     somatic_variant_expression,
     somatic_variant_filtration,
     somatic_variant_signatures,
+    somatic_variant_qc,
     somatic_wgs_cnv_calling,
     somatic_wgs_sv_calling,
     sv_calling_targeted,
@@ -99,6 +100,7 @@
     "somatic_variant_expression": somatic_variant_expression,
     "somatic_variant_filtration": somatic_variant_filtration,
     "somatic_variant_signatures": somatic_variant_signatures,
+    "somatic_variant_qc": somatic_variant_qc,
     "somatic_wgs_cnv_calling": somatic_wgs_cnv_calling,
     "somatic_wgs_sv_calling": somatic_wgs_sv_calling,
     "sv_calling_targeted": sv_calling_targeted,
diff --git a/snappy_pipeline/workflows/somatic_variant_qc/Snakefile b/snappy_pipeline/workflows/somatic_variant_qc/Snakefile
new file mode 100644
index 000000000..2ac7ab432
--- /dev/null
+++ b/snappy_pipeline/workflows/somatic_variant_qc/Snakefile
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+"""CUBI Pipeline tumor mutational burden step Snakefile"""
+
+import os
+
+from snappy_pipeline import expand_ref
+from snappy_pipeline.workflows.somatic_variant_qc import (
+    SomaticVariantQCWorkflow,
+)
+
+__author__ = "Gia Cuong Pham"
+__email__ = "pham.gia-cuong@bih-charite.de"
+
+
+# Configuration ===============================================================
+
+
+configfile: "config.yaml"
+
+
+# Expand "$ref" JSON pointers in configuration (also works for YAML)
+config, lookup_paths, config_paths = expand_ref("config.yaml", config)
+
+# WorkflowImpl Object Setup ===================================================
+wf = SomaticVariantQCWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())
+
+
+localrules:
+    # Linking files from work/ to output/ should be done locally
+    somatic_variant_qc_link_out_run,
+
+
+rule all:
+    input:
+        wf.get_result_files(),
+
+
+# Generic linking out ---------------------------------------------------------
+
+
+rule somatic_variant_qc_link_out_run:
+    input:
+        wf.get_input_files("link_out", "run"),
+    output:
+        wf.get_output_files("link_out", "run"),
+    run:
+        shell(wf.get_shell_cmd("link_out", "run", wildcards))
+
+
+rule somatic_variant_qc:
+    input:
+        **wf.get_input_files("SVQC_step", "run"),
+    output:
+        **wf.get_output_files("SVQC_step", "run"),
+    threads: wf.get_resource("SVQC_step", "run", "threads")
+    resources:
+        time=wf.get_resource("SVQC_step", "run", "time"),
+        memory=wf.get_resource("SVQC_step", "run", "memory"),
+        partition=wf.get_resource("SVQC_step", "run", "partition"),
+        tmpdir=wf.get_resource("SVQC_step", "run", "tmpdir"),
+    log:
+        **wf.get_log_file("SVQC_step", "run"),
+    wrapper:
+        wf.wrapper_path("somatic_variants_qc")
diff --git a/snappy_pipeline/workflows/somatic_variant_qc/__init__.py b/snappy_pipeline/workflows/somatic_variant_qc/__init__.py
new file mode 100644
index 000000000..5ce9c4349
--- /dev/null
+++ b/snappy_pipeline/workflows/somatic_variant_qc/__init__.py
@@ -0,0 +1,223 @@
+from collections import OrderedDict
+import os
+import sys
+
+from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
+from snakemake.io import expand
+
+from snappy_pipeline.base import UnsupportedActionException
+from snappy_pipeline.utils import dictify, listify
+from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart
+from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage
+from snappy_pipeline.workflows.somatic_variant_calling import (
+    SOMATIC_VARIANT_CALLERS_MATCHED,
+    SomaticVariantCallingWorkflow,
+)
+
+#: Extensions of files to create as main payload
+EXT_VALUES = (".json", ".json.md5")
+
+#: Names of the files to create for the extension
+EXT_NAMES = ("json", "json_md5")
+
+#: Default configuration for the tmb calculation step
+DEFAULT_CONFIG = r"""
+step_config:
+    somatic_variant_qc:
+        path_somatic_variant_calling: ../somatic_variant_calling   # REQUIRED
+        tools_ngs_mapping: []      # default to those configured for ngs_mapping
+        tools_somatic_variant_calling: []  # default to those configured for somatic_variant_calling
+        target_regions: # REQUIRED
+        padding: 0  #Used for count the number of variants outside of exom + padding
+        repeated_regions: [] #need to confirm
+"""
+
+
+class SomaticVariantQCStepPart(BaseStepPart):
+    """"""
+
+    name = "SVQC_step"
+
+    actions = ("run",)
+
+    def __init__(self, parent):
+        super().__init__(parent)
+        self.tumor_ngs_library_to_sample_pair = OrderedDict()
+        for sheet in self.parent.shortcut_sheets:
+            # update function of OrderedDict
+            self.tumor_ngs_library_to_sample_pair.update(
+                sheet.all_sample_pairs_by_tumor_dna_ngs_library
+            )
+        # Build mapping from donor name to donor.
+        self.donors = OrderedDict()
+        for sheet in self.parent.shortcut_sheets:
+            for donor in sheet.donors:
+                self.donors[donor.name] = donor
+
+    @dictify
+    def get_input_files(self, action):
+        self._validate_action(action)
+        tpl = (
+            "output/{mapper}.{var_caller}.{tumor_library}/out/"
+            "{mapper}.{var_caller}.{tumor_library}"
+        )
+        key_ext = {
+            "full_vcf": ".full.vcf.gz",
+            "full_vcf_tbi": ".full.vcf.gz.tbi",
+            "passed_vcf": ".vcf.gz",
+            "passed_vcf_tbi": ".vcf.gz.tbi",
+        }
+        variant_calling = self.parent.sub_workflows["somatic_variant_calling"]
+        for key, ext in key_ext.items():
+            yield key, variant_calling(tpl + ext)
+
+    @dictify
+    def get_output_files(self, action):
+        # Validate action
+        self._validate_action(action)
+        prefix = (
+            "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/out/"
+            "{mapper}.{var_caller}.variantsqc.{tumor_library}"
+        )
+        key_ext = {"json": ".json"}
+        for key, ext in key_ext.items():
+            yield key, prefix + ext
+            yield key + "_md5", prefix + ext + ".md5"
+
+    @dictify
+    def _get_log_file(self, action):
+        assert action == "run"
+        prefix = (
+            "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/"
+            "{mapper}.{var_caller}.variantsqc.{tumor_library}"
+        )
+
+        key_ext = (
+            ("log", ".log"),
+            ("conda_info", ".conda_info.txt"),
+            ("conda_list", ".conda_list.txt"),
+        )
+        for key, ext in key_ext:
+            yield key, prefix + ext
+
+    def get_resource_usage(self, action):
+        """Get Resource Usage
+        :param action: Action (i.e., step) in the workflow, example: 'run'.
+        :type action: str
+        :return: Returns ResourceUsage for step.
+        :raises UnsupportedActionException: if action not in class defined list of valid actions.
+        """
+        if action not in self.actions:
+            actions_str = ", ".join(self.actions)
+            error_message = f"Action '{action}' is not supported. Valid options: {actions_str}"
+            raise UnsupportedActionException(error_message)
+        mem_mb = 4 * 1024  # 4GB
+        return ResourceUsage(
+            threads=4,
+            time="1:00:00",  # 1 hour
+            memory=f"{mem_mb}M",
+        )
+
+
+class SomaticVariantQCWorkflow(BaseStep):
+    """Perform gathering variant information"""
+
+    name = "somaticvariantqc"
+    sheet_shortcut_class = CancerCaseSheet
+    sheet_shortcut_kwargs = {
+        "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
+    }
+
+    @classmethod
+    def default_config_yaml(cls):
+        """Return default config YAML, to be overwritten by project-specific one."""
+        return DEFAULT_CONFIG
+
+    def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir):
+        super().__init__(
+            workflow,
+            config,
+            config_lookup_paths,
+            config_paths,
+            workdir,
+            (SomaticVariantCallingWorkflow, NgsMappingWorkflow),
+        )
+        # Register sub step classes so the sub steps are available
+        self.register_sub_step_classes((SomaticVariantQCStepPart, LinkOutStepPart))
+        # Register sub workflows
+        self.register_sub_workflow(
+            "somatic_variant_calling",
+            self.w_config["step_config"]["somatic_variant_qc"]["path_somatic_variant_calling"],
+        )
+        # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
+        if not self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"]:
+            self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"] = self.w_config[
+                "step_config"
+            ]["ngs_mapping"]["tools"]["dna"]
+        if not self.w_config["step_config"]["somatic_variant_qc"]["tools_somatic_variant_calling"]:
+            self.w_config["step_config"]["somatic_variant_qc"][
+                "tools_somatic_variant_calling"
+            ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"]
+
+    @listify
+    def get_result_files(self):
+        callers = set(
+            self.w_config["step_config"]["somatic_variant_qc"]["tools_somatic_variant_calling"]
+        )
+        name_pattern = "{mapper}.{caller}.variantsqc.{tumor_library.name}"
+        yield from self._yield_result_files_matched(
+            os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
+            mapper=self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"],
+            caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
+            ext=EXT_VALUES,
+        )
+        yield from self._yield_result_files_matched(
+            os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
+            mapper=self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"],
+            caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
+            ext=(
+                ".log",
+                ".log.md5",
+                ".conda_info.txt",
+                ".conda_info.txt.md5",
+                ".conda_list.txt",
+                ".conda_list.txt.md5",
+            ),
+        )
+
+    def _yield_result_files_matched(self, tpl, **kwargs):
+        """Build output paths from path template and extension list.
+
+        This function returns the results from the matched somatic variant callers such as
+        Mutect.
+        """
+        for sheet in filter(is_not_background, self.shortcut_sheets):
+            for sample_pair in sheet.all_sample_pairs:
+                if (
+                    not sample_pair.tumor_sample.dna_ngs_library
+                    or not sample_pair.normal_sample.dna_ngs_library
+                ):
+                    msg = (
+                        "INFO: sample pair for cancer bio sample {} has is missing primary"
+                        "normal or primary cancer NGS library"
+                    )
+                    print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
+                    continue
+                yield from expand(
+                    tpl,
+                    tumor_library=[sample_pair.tumor_sample.dna_ngs_library],
+                    **kwargs,
+                )
+
+    def check_config(self):
+        """Check that the path to the NGS mapping is present"""
+        self.ensure_w_config(
+            ("step_config", "somatic_variant_qc", "path_somatic_variant_calling"),
+            "Path to variant calling not configured but required for somatic variant qc",
+        )
+
+        self.ensure_w_config(
+            ("step_config", "somatic_variant_qc", "target_regions"),
+            "Path to target_regions file (bed format)"
+            "not configured but required for somatic variant qc",
+        )
diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile b/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile
index 27da7d775..3b3a2ee2b 100644
--- a/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile
+++ b/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile
@@ -8,7 +8,8 @@ from snappy_pipeline.workflows.tumor_mutational_burden import (
     TumorMutationalBurdenCalculationWorkflow,
 )
 
-__author__ = ""
+__author__ = "Gia Cuong Pham"
+__email__ = "pham.gia-cuong@bih-charite.de"
 
 
 # Configuration ===============================================================
diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
index 6bd7a88ba..e06829437 100644
--- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
+++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
@@ -46,10 +46,10 @@
 "Number_snvs": $number_snvs,
 "Number_indels": $number_indels,
 "Total_regions_length": $total_exom_length
+}
 EOF
 pushd $(dirname {snakemake.output.json})
 md5sum $(basename {snakemake.output.json}) > $(basename {snakemake.output.json_md5})
-}
 """
 )
 
diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml b/snappy_wrappers/wrappers/somatic_variants_qc/environment.yaml
similarity index 100%
rename from snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml
rename to snappy_wrappers/wrappers/somatic_variants_qc/environment.yaml
diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py
index fb8ad450f..19a3e9087 100644
--- a/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py
+++ b/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py
@@ -59,6 +59,10 @@
 ## PIPELINE
 # - __init__ :
 
+CONTIG = ["chr" + str(i) for i in range(1, 22)]
+CONTIG.append("chrX")
+CONTIG.append("chrY")
+
 
 def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0):
     # print(f'variant chrom: {v_chrom} - start: {v_start} - end:{v_end}')
@@ -70,7 +74,7 @@ def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0):
 
 
 # def calculate_exoms_length(bed_intervals):
-#     contigs= ["chr" + str(i) for i in range(1, 23)]
+#     contigs= ["chr" + str(i) for i in range(1, 22)]
 #     contigs.append("chrX")
 #     contigs.append("chrY")
 #     length = 0
@@ -140,45 +144,46 @@ def process_vcf_file(vcf_file, bed_file="", filter_file=False, padding=0):
         # mutation classes
         mt_classes = [0] * 6
         for variant in vcf_file:
-            vaf.append(variant.format("AF")[1])
-            total_v_count += 1
-            # Counting variants in or out of exom and variants with different support reads
-            if "chrUn" in variant.CHROM:
-                v_outside_exom += 1
-                if check_sp_read(variant) == "minimal":
-                    minimal_rp_snvs_nexom += 1
-                elif check_sp_read(variant) == "limited":
-                    limited_rp_snvs_nexom += 1
-                else:
-                    strong_rp_snvs_nexom += 1
-            elif check_variant_in_bed(
-                variant.CHROM, variant.start, variant.end, bed_file, padding=0
-            ):
-                v_inside_exom += 1
-                if check_sp_read(variant) == "minimal":
-                    minimal_rp_snvs_exom += 1
-                elif check_sp_read(variant) == "limited":
-                    limited_rp_snvs_exom += 1
-                else:
-                    strong_rp_snvs_exom += 1
+            if variant.CHROM in CONTIG:
+                vaf.append(float(variant.format("AF")[1][0]))
+                total_v_count += 1
+                # Counting variants in or out of exom and variants with different support reads
+                if "chrUn" in variant.CHROM:
+                    v_outside_exom += 1
+                    if check_sp_read(variant) == "minimal":
+                        minimal_rp_snvs_nexom += 1
+                    elif check_sp_read(variant) == "limited":
+                        limited_rp_snvs_nexom += 1
+                    else:
+                        strong_rp_snvs_nexom += 1
+                elif check_variant_in_bed(
+                    variant.CHROM, variant.start, variant.end, bed_file, padding=0
+                ):
+                    v_inside_exom += 1
+                    if check_sp_read(variant) == "minimal":
+                        minimal_rp_snvs_exom += 1
+                    elif check_sp_read(variant) == "limited":
+                        limited_rp_snvs_exom += 1
+                    else:
+                        strong_rp_snvs_exom += 1
 
-            else:
-                v_outside_exom += 1
-                if check_sp_read(variant) == "minimal":
-                    minimal_rp_snvs_nexom += 1
-                elif check_sp_read(variant) == "limited":
-                    limited_rp_snvs_nexom += 1
                 else:
-                    strong_rp_snvs_nexom += 1
-            # get number of snvs and indels
-            # Need to check multi allelic. Users shouldn't input multi allelic vcf file.
-            if get_variant_type(variant.REF, variant.ALT[0]) == "snp":
-                n_snps += 1
-                mt_classes = assign_class_snvs(variant, mt_classes)
-            else:
-                # More for indels
-                n_indels += 1
-                indels_length.append(abs(len(variant.REF) - len(variant.ALT[0])))
+                    v_outside_exom += 1
+                    if check_sp_read(variant) == "minimal":
+                        minimal_rp_snvs_nexom += 1
+                    elif check_sp_read(variant) == "limited":
+                        limited_rp_snvs_nexom += 1
+                    else:
+                        strong_rp_snvs_nexom += 1
+                # get number of snvs and indels
+                # Need to check multi allelic. Users shouldn't input multi allelic vcf file.
+                if get_variant_type(variant.REF, variant.ALT[0]) == "snp":
+                    n_snps += 1
+                    mt_classes = assign_class_snvs(variant, mt_classes)
+                else:
+                    # More for indels
+                    n_indels += 1
+                    indels_length.append(abs(len(variant.REF) - len(variant.ALT[0])))
 
         return (
             total_v_count,
@@ -231,7 +236,6 @@ def main():
         mt_classes,
         vaf,
     ) = process_vcf_file(filter_vcf, bed_intervals, filter_file=True, padding=options.padding)
-
     # After one for loop through vcf file. The VCF will automatically be closed.
     # length_exoms = calculate_exoms_length(bed_intervals)
     # for variant in filter_vcf:
@@ -255,7 +259,7 @@ def main():
         "classes": classes,
         "VAF": vaf,
     }
-    json_object = json.dumps(summary, indent=4)
+    json_object = json.dumps(summary)
 
     with open(outpath, "w") as outfile:
         outfile.write(json_object)
diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py
index 197240cdf..81dd9fa87 100644
--- a/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py
+++ b/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py
@@ -2,11 +2,13 @@
 """Wrapper for summarize information of vcf file after
 somatic variant calling step
 """
+import os
 from snakemake.shell import shell
 
 __author__ = "Pham Gia Cuong"
 __email__ = "pham.gia-cuong@bih-charite.de"
 
+base_dir = os.path.dirname(os.path.realpath(__file__))
 shell(
     r"""
 # -----------------------------------------------------------------------------
@@ -19,10 +21,10 @@
 conda list > {snakemake.log.conda_list}
 conda info > {snakemake.log.conda_info}
 
-python summarize-vcf.py --rawvcf {snakemake.input.raw_vcf} \
+python {base_dir}/summarize-vcf.py --rawvcf {snakemake.input.full_vcf} \
     --filtered-vcf {snakemake.input.passed_vcf} \
-    --exom-bedfile {snakemake.config[step_config][somatic_variants_QC][target_regions]} \
-    --padding {snakemake.config[step_config][somatic_variants_QC][padding]} \
+    --exom-bedfile {snakemake.config[step_config][somatic_variant_qc][target_regions]} \
+    --padding {snakemake.config[step_config][somatic_variant_qc][padding]} \
     --output {snakemake.output.json}
 
 pushd $(dirname {snakemake.output.json})

From bd7b4113a93a63242502ead8d05544658a201f6b Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Fri, 30 Jun 2023 20:33:03 +0200
Subject: [PATCH 03/17] Update somatic_variant_checking

---
 snappy_pipeline/apps/snappy_snake.py          |   2 -
 .../somatic_variant_checking/Snakefile        |  64 +++++
 .../somatic_variant_checking/__init__.py      | 259 +++++++++++++----
 .../wrappers/bcftools/TMB/wrapper.py          |   4 +
 .../environment.yaml                          |   0
 .../summarize-vcf.py                          | 254 +++++++++++++++++
 .../wrapper.py                                |   7 +-
 .../somatic_variants_qc/summarize-vcf.py      | 269 ------------------
 8 files changed, 538 insertions(+), 321 deletions(-)
 create mode 100644 snappy_pipeline/workflows/somatic_variant_checking/Snakefile
 rename snappy_wrappers/wrappers/{somatic_variants_qc => somatic_variants_checking}/environment.yaml (100%)
 create mode 100644 snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
 rename snappy_wrappers/wrappers/{somatic_variants_qc => somatic_variants_checking}/wrapper.py (76%)
 delete mode 100644 snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py

diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py
index 9348adb36..c47f50b94 100644
--- a/snappy_pipeline/apps/snappy_snake.py
+++ b/snappy_pipeline/apps/snappy_snake.py
@@ -44,7 +44,6 @@
     somatic_variant_expression,
     somatic_variant_filtration,
     somatic_variant_signatures,
-    somatic_variant_qc,
     somatic_wgs_cnv_calling,
     somatic_wgs_sv_calling,
     sv_calling_targeted,
@@ -100,7 +99,6 @@
     "somatic_variant_expression": somatic_variant_expression,
     "somatic_variant_filtration": somatic_variant_filtration,
     "somatic_variant_signatures": somatic_variant_signatures,
-    "somatic_variant_qc": somatic_variant_qc,
     "somatic_wgs_cnv_calling": somatic_wgs_cnv_calling,
     "somatic_wgs_sv_calling": somatic_wgs_sv_calling,
     "sv_calling_targeted": sv_calling_targeted,
diff --git a/snappy_pipeline/workflows/somatic_variant_checking/Snakefile b/snappy_pipeline/workflows/somatic_variant_checking/Snakefile
new file mode 100644
index 000000000..b73514711
--- /dev/null
+++ b/snappy_pipeline/workflows/somatic_variant_checking/Snakefile
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+"""CUBI Pipeline tumor mutational burden step Snakefile"""
+
+import os
+
+from snappy_pipeline import expand_ref
+from snappy_pipeline.workflows.somatic_variant_checking import (
+    SomaticVariantQCWorkflow,
+)
+
+__author__ = "Gia Cuong Pham"
+__email__ = "pham.gia-cuong@bih-charite.de"
+
+
+# Configuration ===============================================================
+
+
+configfile: "config.yaml"
+
+
+# Expand "$ref" JSON pointers in configuration (also works for YAML)
+config, lookup_paths, config_paths = expand_ref("config.yaml", config)
+
+# WorkflowImpl Object Setup ===================================================
+wf = SomaticVariantQCWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())
+
+
+localrules:
+    # Linking files from work/ to output/ should be done locally
+    somatic_variant_qc_link_out_run,
+
+
+rule all:
+    input:
+        wf.get_result_files(),
+
+
+# Generic linking out ---------------------------------------------------------
+
+
+rule somatic_variant_qc_link_out_run:
+    input:
+        wf.get_input_files("link_out", "run"),
+    output:
+        wf.get_output_files("link_out", "run"),
+    run:
+        shell(wf.get_shell_cmd("link_out", "run", wildcards))
+
+
+rule somatic_variant_qc:
+    input:
+        **wf.get_input_files("SVQC_step", "run"),
+    output:
+        **wf.get_output_files("SVQC_step", "run"),
+    threads: wf.get_resource("SVQC_step", "run", "threads")
+    resources:
+        time=wf.get_resource("SVQC_step", "run", "time"),
+        memory=wf.get_resource("SVQC_step", "run", "memory"),
+        partition=wf.get_resource("SVQC_step", "run", "partition"),
+        tmpdir=wf.get_resource("SVQC_step", "run", "tmpdir"),
+    log:
+        **wf.get_log_file("SVQC_step", "run"),
+    wrapper:
+        wf.wrapper_path("somatic_variants_checking")
diff --git a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
index aea5ebf23..5c1367333 100644
--- a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
+++ b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
@@ -1,56 +1,219 @@
-# -*- coding: utf-8 -*-
-"""Implementation of the germline ``somatic_variant_checking`` step
+import os
+import sys
 
-The ``somatic_variant_checking`` step takes as the input the results of the
-``somatic_variant_annotation`` step.  It then executes various tools computing statistics on the
-result files and consistency checks with the pedigrees.
+from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
+from snakemake.io import expand
 
-.. note::
+from snappy_pipeline.base import UnsupportedActionException
+from snappy_pipeline.utils import dictify, listify
+from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart
+from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage
+from snappy_pipeline.workflows.somatic_variant_calling import (
+    SOMATIC_VARIANT_CALLERS_MATCHED,
+    SomaticVariantCallingWorkflow,
+)
 
-    Status: not implemented yet
+#: Extensions of files to create as main payload
+EXT_VALUES = (".json", ".json.md5")
 
-==========
-Step Input
-==========
+#: Names of the files to create for the extension
+EXT_NAMES = ("json", "json_md5")
 
-The variant calling step uses Snakemake sub workflows for using the result of the
-``somatic_variant_annotation`` step.
-
-===========
-Step Output
-===========
-
-.. note:: TODO
-
-====================
-Global Configuration
-====================
-
-.. note:: TODO
-
-=====================
-Default Configuration
-=====================
-
-The default configuration is as follows.
-
-.. include:: DEFAULT_CONFIG_somatic_variant_checking.rst
-
-=======
-Reports
-=======
-
-Currently, no reports are generated.
-"""
-
-__author__ = "Manuel Holtgrewe <manuel.holtgrewe@bih-charite.de>"
-
-#: Available tools for checking variants
-VARIANT_CHECKERS = ("bcftools_stats", "peddy")
-
-#: Default configuration for the somatic_gene_fusion_calling step
+#: Default configuration for the tmb calculation step
 DEFAULT_CONFIG = r"""
 step_config:
-  somatic_variant_checking:
-    path_somatic_variant_calling: ../somatic_variant_calling  # REQUIRED
+    somatic_variant_checking:
+        path_somatic_variant_calling: ../somatic_variant_calling   # REQUIRED
+        tools_ngs_mapping: []      # default to those configured for ngs_mapping
+        tools_somatic_variant_calling: []  # default to those configured for somatic_variant_calling
+        target_regions: # REQUIRED
+        padding: 0  #Used for count the number of variants outside of exom + padding
+        ignore_regions: []
+        minimal_support_read: 1
+        limited_support_read: 5
 """
+
+
+class SomaticVariantQCStepPart(BaseStepPart):
+    """"""
+
+    name = "SVQC_step"
+
+    actions = ("run",)
+
+    def __init__(self, parent):
+        super().__init__(parent)
+
+    @dictify
+    def get_input_files(self, action):
+        self._validate_action(action)
+        tpl = (
+            "output/{mapper}.{var_caller}.{tumor_library}/out/"
+            "{mapper}.{var_caller}.{tumor_library}"
+        )
+        key_ext = {
+            "full_vcf": ".full.vcf.gz",
+            "full_vcf_tbi": ".full.vcf.gz.tbi",
+            "passed_vcf": ".vcf.gz",
+            "passed_vcf_tbi": ".vcf.gz.tbi",
+        }
+        variant_calling = self.parent.sub_workflows["somatic_variant_calling"]
+        for key, ext in key_ext.items():
+            yield key, variant_calling(tpl + ext)
+
+    @dictify
+    def get_output_files(self, action):
+        # Validate action
+        self._validate_action(action)
+        prefix = (
+            "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/out/"
+            "{mapper}.{var_caller}.variantsqc.{tumor_library}"
+        )
+        key_ext = {"json": ".json"}
+        for key, ext in key_ext.items():
+            yield key, prefix + ext
+            yield key + "_md5", prefix + ext + ".md5"
+
+    @dictify
+    def _get_log_file(self, action):
+        assert action == "run"
+        prefix = (
+            "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/"
+            "{mapper}.{var_caller}.variantsqc.{tumor_library}"
+        )
+
+        key_ext = (
+            ("log", ".log"),
+            ("conda_info", ".conda_info.txt"),
+            ("conda_list", ".conda_list.txt"),
+        )
+        for key, ext in key_ext:
+            yield key, prefix + ext
+
+    def get_resource_usage(self, action):
+        """Get Resource Usage
+        :param action: Action (i.e., step) in the workflow, example: 'run'.
+        :type action: str
+        :return: Returns ResourceUsage for step.
+        :raises UnsupportedActionException: if action not in class defined list of valid actions.
+        """
+        if action not in self.actions:
+            actions_str = ", ".join(self.actions)
+            error_message = f"Action '{action}' is not supported. Valid options: {actions_str}"
+            raise UnsupportedActionException(error_message)
+        mem_mb = 4 * 1024  # 4GB
+        return ResourceUsage(
+            threads=2,
+            time="1:00:00",  # 1 hour
+            memory=f"{mem_mb}M",
+        )
+
+
+class SomaticVariantQCWorkflow(BaseStep):
+    """Perform gathering variant information"""
+
+    name = "somaticvariantchecking"
+    sheet_shortcut_class = CancerCaseSheet
+    sheet_shortcut_kwargs = {
+        "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
+    }
+
+    @classmethod
+    def default_config_yaml(cls):
+        """Return default config YAML, to be overwritten by project-specific one."""
+        return DEFAULT_CONFIG
+
+    def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir):
+        super().__init__(
+            workflow,
+            config,
+            config_lookup_paths,
+            config_paths,
+            workdir,
+            (SomaticVariantCallingWorkflow, NgsMappingWorkflow),
+        )
+        # Register sub step classes so the sub steps are available
+        self.register_sub_step_classes((SomaticVariantQCStepPart, LinkOutStepPart))
+        # Register sub workflows
+        self.register_sub_workflow(
+            "somatic_variant_calling",
+            self.w_config["step_config"]["somatic_variant_checking"][
+                "path_somatic_variant_calling"
+            ],
+        )
+        # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
+        if not self.w_config["step_config"]["somatic_variant_checking"]["tools_ngs_mapping"]:
+            self.w_config["step_config"]["somatic_variant_checking"][
+                "tools_ngs_mapping"
+            ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]
+        if not self.w_config["step_config"]["somatic_variant_checking"][
+            "tools_somatic_variant_calling"
+        ]:
+            self.w_config["step_config"]["somatic_variant_checking"][
+                "tools_somatic_variant_calling"
+            ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"]
+
+    @listify
+    def get_result_files(self):
+        callers = set(
+            self.w_config["step_config"]["somatic_variant_checking"][
+                "tools_somatic_variant_calling"
+            ]
+        )
+        name_pattern = "{mapper}.{caller}.variantsqc.{tumor_library.name}"
+        yield from self._yield_result_files_matched(
+            os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
+            mapper=self.w_config["step_config"]["somatic_variant_checking"]["tools_ngs_mapping"],
+            caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
+            ext=EXT_VALUES,
+        )
+        yield from self._yield_result_files_matched(
+            os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
+            mapper=self.w_config["step_config"]["somatic_variant_checking"]["tools_ngs_mapping"],
+            caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
+            ext=(
+                ".log",
+                ".log.md5",
+                ".conda_info.txt",
+                ".conda_info.txt.md5",
+                ".conda_list.txt",
+                ".conda_list.txt.md5",
+            ),
+        )
+
+    def _yield_result_files_matched(self, tpl, **kwargs):
+        """Build output paths from path template and extension list.
+
+        This function returns the results from the matched somatic variant callers such as
+        Mutect.
+        """
+        for sheet in filter(is_not_background, self.shortcut_sheets):
+            for sample_pair in sheet.all_sample_pairs:
+                if (
+                    not sample_pair.tumor_sample.dna_ngs_library
+                    or not sample_pair.normal_sample.dna_ngs_library
+                ):
+                    msg = (
+                        "INFO: sample pair for cancer bio sample {} has is missing primary"
+                        "normal or primary cancer NGS library"
+                    )
+                    print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
+                    continue
+                yield from expand(
+                    tpl,
+                    tumor_library=[sample_pair.tumor_sample.dna_ngs_library],
+                    **kwargs,
+                )
+
+    def check_config(self):
+        """Check that the path to the NGS mapping is present"""
+        self.ensure_w_config(
+            ("step_config", "somatic_variant_checking", "path_somatic_variant_calling"),
+            "Path to variant calling not configured but required for somatic variant qc",
+        )
+
+        self.ensure_w_config(
+            ("step_config", "somatic_variant_checking", "target_regions"),
+            "Path to target_regions file (bed format)"
+            "not configured but required for somatic variant qc",
+        )
diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
index e06829437..1408cff2a 100644
--- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
+++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
@@ -26,6 +26,10 @@
 name_vcf=$(basename {snakemake.input.vcf})
 vcf_md5=$(md5sum {snakemake.input.vcf} | awk '{{print $1}}')
 
+cat_cmd=zcat    #in case that bed file has gzip format
+gzip -t $bed_file || cat_cmd=cat
+total_exom_length=$($cat_cmd $bed_file | \
+    awk '{{dis+=$3-$2}} END {{print dis}}')
 total_exom_length=$(zcat $bed_file | \
     awk '{{dis+=$3-$2}} END {{print dis}}') #TMB_rounded=`printf "%.3f" $TMB`
 
diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/environment.yaml b/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml
similarity index 100%
rename from snappy_wrappers/wrappers/somatic_variants_qc/environment.yaml
rename to snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
new file mode 100644
index 000000000..09b8f9bfa
--- /dev/null
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
@@ -0,0 +1,254 @@
+# import packages
+
+import json  # for writing out the json file
+from cyvcf2 import VCF  # for reading vcf file
+import gzip
+import tabix
+import argparse
+
+parser = argparse.ArgumentParser(description="Gathering information of variants")
+parser.add_argument(
+    "--rawvcf",
+    help="raw vcf file",
+)
+parser.add_argument(
+    "--filtered-vcf",
+    help="filtered vcf file",
+)
+parser.add_argument(
+    "--exom-bedfile",
+    help="exom regions bed file",
+)
+parser.add_argument("--padding", help="it's padding", default=0)
+parser.add_argument(
+    "--ignore-regions",
+    help="bed file contains repeated or difficult to map regions",
+)
+parser.add_argument(
+    "-m",
+    "--minimal",
+    type=int,
+    default=1,
+    help="threshold for defining a variant that has minimal support reads",
+)
+parser.add_argument(
+    "-l",
+    "--limited",
+    type=int,
+    default=5,
+    help="threshold for defining a variant that has limited support reads",
+)
+parser.add_argument(
+    "--output",
+    help="json file contains information about vcf file",
+)
+args = parser.parse_args()
+
+
+def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0):
+    try:
+        tmp = []
+        records = bed_intervals.query(v_chrom, v_start - padding, v_end + padding)
+        for record in records:
+            tmp.append(record)
+        if len(tmp) >= 1:
+            return True
+        else:
+            return False
+    except OSError:
+        print("You should index the bed file with tabix first!")
+
+
+def get_contigs_from_bed_file(bedfile):
+    contigs = []
+    try:
+        with gzip.open(bedfile, "r") as intervals:
+            for interval in intervals:
+                line = interval.decode()
+                tmp = line.split("\t")[0]
+                if tmp not in contigs:
+                    contigs.append(tmp)
+        return contigs
+    except OSError:
+        print("Error reading the GZIP file.")
+
+
+def get_variant_type(ref, alt):
+    if (len(alt) == 1) and (len(ref) == len(alt)):
+        return "snp"
+    elif len(alt) != len(ref):
+        return "indels"
+
+
+def check_sp_read(variant, minimal, limited):
+    dp = variant.INFO["DP"]
+    if dp <= minimal:
+        return "minimal"
+    elif (dp > minimal) and (dp <= limited):
+        return "limited"
+    elif dp >= limited:
+        return "strong"
+
+
+def assign_class_snvs(variant, mt_mat):
+    temp = str(variant.REF) + ">" + str(variant.ALT[0])
+    # Following result from plot-vcfstats with bcftools
+    if temp in ["A>C", "T>G"]:
+        mt_mat[0] += 1
+    elif temp in ["A>G", "T>C"]:
+        mt_mat[1] += 1
+    elif temp in ["A>T", "T>A"]:
+        mt_mat[2] += 1
+    elif temp in ["C>G", "G>C"]:
+        mt_mat[3] += 1
+    elif temp in ["C>T", "G>A"]:
+        mt_mat[4] += 1
+    elif temp in ["G>T", "C>A"]:
+        mt_mat[5] += 1
+    return mt_mat
+
+
+def process_vcf_file(
+    vcf_file,
+    contigs=[],
+    hard_contigs=[],
+    minimal=1,
+    limited=5,
+    bed_file="",
+    hard_file="",
+    filter_file=False,
+    padding=0,
+):
+    infor = {
+        "total_v_count": 0,
+        "v_inside_exom": 0,
+        "v_outside_exom": 0,
+        "v_in_hard_regions": 0,
+        "n_snps": 0,
+        "n_indels": 0,
+        "minimal_rp_snvs_exom": 0,
+        "minimal_rp_snvs_nexom": 0,
+        "limited_rp_snvs_exom": 0,
+        "limited_rp_snvs_nexom": 0,
+        "strong_rp_snvs_exom": 0,
+        "strong_rp_snvs_nexom": 0,
+        "indels_length": [],
+        "mt_classes": [0] * 6,
+        "vaf": [],
+    }
+    if filter_file:
+        for variant in vcf_file:
+            infor["total_v_count"] += 1
+            # Gathering information of variants in comparison to interested regions
+            if variant.CHROM in contigs:
+                if len(variant.ALT) > 1:
+                    raise ValueError(
+                        "Multiallelic variants are not supported"
+                    )  # checking for multiallelic
+                infor["vaf"].append(float(variant.format("AF")[1][0]))
+                # Counting variants in or out of exom and variants with different support reads
+                if check_variant_in_bed(
+                    variant.CHROM, variant.start, variant.end, bed_file, padding=0
+                ):
+                    infor["v_inside_exom"] += 1
+                    infor[str(check_sp_read(variant, minimal, limited)) + "_rp_snvs_exom"] += 1
+                else:
+                    infor["v_outside_exom"] += 1
+                    infor[str(check_sp_read(variant, minimal, limited)) + "_rp_snvs_nexom"] += 1
+
+                # Need to check multi allelic. Users shouldn't input multi allelic vcf file.
+                if get_variant_type(variant.REF, variant.ALT[0]) == "snp":
+                    infor["n_snps"] += 1
+                    infor["mt_classes"] = assign_class_snvs(variant, infor["mt_classes"])
+                else:
+                    # More for indels
+                    infor["n_indels"] += 1
+                    infor["indels_length"].append(abs(len(variant.REF) - len(variant.ALT[0])))
+            # Gathering information of variants in comparison to hard mapped regions
+
+            if variant.CHROM in hard_contigs:
+                if check_variant_in_bed(
+                    variant.CHROM, variant.start, variant.end, hard_file, padding=0
+                ):
+                    infor["v_in_hard_regions"] += 1
+        return infor
+    else:
+        total_v_count = 0
+        for variant in vcf_file:
+            total_v_count += 1
+        return total_v_count
+
+
+def main():
+    classes = ["T>G", "T>C", "T>A", "C>G", "C>T", "C>A"]
+    # reading input
+    full_vcf = VCF(args.rawvcf)
+    filter_vcf = VCF(args.filtered_vcf)
+    sample = filter_vcf.samples[1]
+    ######################
+    bed_intervals = tabix.open(args.exom_bedfile)
+    contigs = get_contigs_from_bed_file(args.exom_bedfile)
+    full_v_count = process_vcf_file(full_vcf)
+    # read bed file
+    if args.ignore_regions:
+        path_hard_regions = args.ignore_regions
+        hard_intervals = tabix.open(path_hard_regions)
+        hard_contigs = get_contigs_from_bed_file(path_hard_regions)
+        infor = process_vcf_file(
+            filter_vcf,
+            contigs,
+            hard_contigs,
+            args.minimal,
+            args.limited,
+            bed_intervals,
+            hard_intervals,
+            filter_file=True,
+            padding=args.padding,
+        )
+    else:
+        infor = process_vcf_file(
+            filter_vcf,
+            contigs,
+            [],
+            args.minimal,
+            args.limited,
+            bed_intervals,
+            "",
+            filter_file=True,
+            padding=args.padding,
+        )
+
+    summary = {
+        "sample": sample,
+        "number_full_variants": full_v_count,
+        "number_passed_variants": infor["total_v_count"],
+        "number_snvs_inside_exom": infor["v_inside_exom"],
+        "number_snvs_outside_exom": infor["v_outside_exom"],
+        "number_snvs_in_hard_regions": infor["v_in_hard_regions"],
+        "number_snps": infor["n_snps"],
+        "number_indels": infor["n_indels"],
+        "minimal_rp_snvs_exom": infor["minimal_rp_snvs_exom"],
+        "minimal_rp_snvs_nexom": infor["minimal_rp_snvs_nexom"],
+        "limited_rp_snvs_exom": infor["limited_rp_snvs_exom"],
+        "limited_rp_snvs_nexom": infor["limited_rp_snvs_nexom"],
+        "strong_rp_snvs_exom": infor["strong_rp_snvs_exom"],
+        "strong_rp_snvs_nexom": infor["strong_rp_snvs_nexom"],
+        classes[0]: infor["mt_classes"][0],
+        classes[1]: infor["mt_classes"][1],
+        classes[2]: infor["mt_classes"][2],
+        classes[3]: infor["mt_classes"][3],
+        classes[4]: infor["mt_classes"][4],
+        classes[5]: infor["mt_classes"][5],
+        "indels_length": infor["indels_length"],
+        "VAF": infor["vaf"],
+    }
+    if not args.ignore_regions:
+        del summary["number_snvs_in_hard_regions"]
+    json_object = json.dumps(summary)
+
+    with open(args.output, "w") as outfile:
+        outfile.write(json_object)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
similarity index 76%
rename from snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py
rename to snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
index 81dd9fa87..c7b0239cc 100644
--- a/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
@@ -23,8 +23,11 @@
 
 python {base_dir}/summarize-vcf.py --rawvcf {snakemake.input.full_vcf} \
     --filtered-vcf {snakemake.input.passed_vcf} \
-    --exom-bedfile {snakemake.config[step_config][somatic_variant_qc][target_regions]} \
-    --padding {snakemake.config[step_config][somatic_variant_qc][padding]} \
+    --exom-bedfile {snakemake.config[step_config][somatic_variant_checking][target_regions]} \
+    --padding {snakemake.config[step_config][somatic_variant_checking][padding]} \
+    --minimal {snakemake.config[step_config][somatic_variant_checking][minimal_support_read]} \
+    --limited {snakemake.config[step_config][somatic_variant_checking][limited_support_read]} \
+    --ignore-regions {snakemake.config[step_config][somatic_variant_checking][ignore_regions]} \
     --output {snakemake.output.json}
 
 pushd $(dirname {snakemake.output.json})
diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py
deleted file mode 100644
index 19a3e9087..000000000
--- a/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py
+++ /dev/null
@@ -1,269 +0,0 @@
-# import packages
-
-import json  # for writing out the json file
-from cyvcf2 import VCF  # for reading vcf file
-import tabix
-from optparse import OptionParser
-
-
-parser = OptionParser()
-parser.add_option("--rawvcf", help="raw vcf file")
-parser.add_option("--filtered-vcf", help="filtered vcf file")
-parser.add_option(
-    "--exom-bedfile",
-    help="exom regions bed file",
-)
-parser.add_option("--padding", help="it's padding", default=0)
-parser.add_option(
-    "--repeated-region-bedfile",
-    help="bed file contains repeated or difficult to map regions",
-)
-parser.add_option(
-    "--output",
-    help="json file contains information about vcf file",
-)
-(options, args) = parser.parse_args()
-
-# from pybedtools import Interval,BedTool
-
-# I will use for loop here
-# It could be better by using multi threading.
-# TODO:
-# - Number of variants called (from the `*.full.vcf.gz`file) [X] Get all variants from full file
-# - Number of variants that pass all filters [X] Get all variants from fitlered file
-# - Number of variants passing the filters that are inside & outside of the exome bed file (with padding) [X]
-# - Number of variants in repeated or other difficult to map regions [ ] Need a bed file containing repeated
-# - Number of SNVs & indels [X]
-# - Indel lengths statistics [X]
-# - Number of variants with more than 1 ALT allele (I _think_ that this value should be 0 for all variants passing the filters) [ ]
-# - Number of SNVs with minimal support (only one read supporting the variant), possibly separating inside & outside exome bed [X]
-# - Number of SNVs with limited support (5 reads or less supporting the variant), possibly separating inside & outside exome bed [X]
-# - Number of SNVs with strong support (at least 10 reads, VAF greater or equal to 10%) [X] VAF need to be joined
-# - Number of SNVs in each mutation class (C>A, C>G, C>T, T>A, T>C, T>G), for minimal, limited, average & strong support [X] not for minimal,limited,average yet
-# - Metrics on strand bias could also be collected (F1R2 & F2R1 vcf FORMAT) [ ] PAUSE
-# - the VAF distribution. Perhaps we can split the variants (both SNVs & indels) into those with very little support (<1%), subclonal (<10%), the main part (<40%), affected by CNV (>40%)[ ]
-
-
-# Adding A>C G>T grouping
-# Table for VAF
-# Genotype check > report different genotypes
-# Checking number multi allelic
-# Support alternative allel in the normal
-# #alt in normal + average ALT + max ALT
-# snappy pipeline for this
-
-##### SNAPPY PIPELINE
-## WRAPPER
-# - Checking the configuration file
-# - Checking input file
-## PIPELINE
-# - __init__ :
-
-CONTIG = ["chr" + str(i) for i in range(1, 22)]
-CONTIG.append("chrX")
-CONTIG.append("chrY")
-
-
-def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0):
-    # print(f'variant chrom: {v_chrom} - start: {v_start} - end:{v_end}')
-    record = bed_intervals.query(v_chrom, v_start - padding, v_end + padding)
-    if record:
-        return True
-    else:
-        return False
-
-
-# def calculate_exoms_length(bed_intervals):
-#     contigs= ["chr" + str(i) for i in range(1, 22)]
-#     contigs.append("chrX")
-#     contigs.append("chrY")
-#     length = 0
-#     for contig in contigs:
-#         records = bed_intervals.query(contig,0,250000000)
-#         for record in records:
-#             length = length + int(record[2]) - int(record[1])
-#     return length
-
-
-def get_variant_type(ref, alt):
-    if (len(alt) == 1) and (len(ref) == len(alt)):
-        return "snp"
-    elif len(alt) != len(ref):
-        return "indels"
-
-
-def check_sp_read(variant):
-    if get_variant_type(variant.REF, variant.ALT[0]) == "snp":
-        dp = variant.INFO["DP"]
-        if dp == 1:
-            return "minimal"
-        elif (dp > 1) and (dp <= 5):
-            return "limited"
-        if dp >= 10:
-            return "strong"
-
-
-def assign_class_snvs(variant, mt_mat):
-    temp = str(variant.REF) + ">" + str(variant.ALT[0])
-    # Following result from plot-vcfstats with bcftools
-    match temp:
-        case "A>C" | "C>A":
-            mt_mat[0] += 1
-        case "A>G" | "G>A":
-            mt_mat[1] += 1
-        case "A>T" | "T>A":
-            mt_mat[2] += 1
-        case "C>G" | "G>C":
-            mt_mat[3] += 1
-        case "C>T" | "T>C":
-            mt_mat[4] += 1
-        case "G>T" | "T>G":
-            mt_mat[5] += 1
-    return mt_mat
-
-
-def process_vcf_file(vcf_file, bed_file="", filter_file=False, padding=0):
-    total_v_count = 0
-    if filter_file:
-        # VAF
-        vaf = []
-        # numb of variants in and out of exoms
-        v_inside_exom = 0
-        v_outside_exom = 0
-        # numb of snps and indels
-        n_snps = 0
-        n_indels = 0
-        indels_length = []
-        # support read informations
-        minimal_rp_snvs_exom = 0
-        minimal_rp_snvs_nexom = 0
-        limited_rp_snvs_exom = 0
-        limited_rp_snvs_nexom = 0
-        strong_rp_snvs_exom = 0
-        strong_rp_snvs_nexom = 0
-        # mutation classes
-        mt_classes = [0] * 6
-        for variant in vcf_file:
-            if variant.CHROM in CONTIG:
-                vaf.append(float(variant.format("AF")[1][0]))
-                total_v_count += 1
-                # Counting variants in or out of exom and variants with different support reads
-                if "chrUn" in variant.CHROM:
-                    v_outside_exom += 1
-                    if check_sp_read(variant) == "minimal":
-                        minimal_rp_snvs_nexom += 1
-                    elif check_sp_read(variant) == "limited":
-                        limited_rp_snvs_nexom += 1
-                    else:
-                        strong_rp_snvs_nexom += 1
-                elif check_variant_in_bed(
-                    variant.CHROM, variant.start, variant.end, bed_file, padding=0
-                ):
-                    v_inside_exom += 1
-                    if check_sp_read(variant) == "minimal":
-                        minimal_rp_snvs_exom += 1
-                    elif check_sp_read(variant) == "limited":
-                        limited_rp_snvs_exom += 1
-                    else:
-                        strong_rp_snvs_exom += 1
-
-                else:
-                    v_outside_exom += 1
-                    if check_sp_read(variant) == "minimal":
-                        minimal_rp_snvs_nexom += 1
-                    elif check_sp_read(variant) == "limited":
-                        limited_rp_snvs_nexom += 1
-                    else:
-                        strong_rp_snvs_nexom += 1
-                # get number of snvs and indels
-                # Need to check multi allelic. Users shouldn't input multi allelic vcf file.
-                if get_variant_type(variant.REF, variant.ALT[0]) == "snp":
-                    n_snps += 1
-                    mt_classes = assign_class_snvs(variant, mt_classes)
-                else:
-                    # More for indels
-                    n_indels += 1
-                    indels_length.append(abs(len(variant.REF) - len(variant.ALT[0])))
-
-        return (
-            total_v_count,
-            v_inside_exom,
-            v_outside_exom,
-            n_snps,
-            n_indels,
-            indels_length,
-            minimal_rp_snvs_exom,
-            minimal_rp_snvs_nexom,
-            limited_rp_snvs_exom,
-            limited_rp_snvs_nexom,
-            strong_rp_snvs_exom,
-            strong_rp_snvs_nexom,
-            mt_classes,
-            vaf,
-        )
-    else:
-        for variant in vcf_file:
-            total_v_count += 1
-        return total_v_count
-
-
-def main():
-    classes = ["A>C", "A>G", "A>T", "C>G", "C>T", "G>T"]
-    path_full_vcf = options.rawvcf
-    path_filtered_vcf = options.filtered_vcf
-    path_bedfile = options.exom_bedfile
-    outpath = options.output
-    full_vcf = VCF(path_full_vcf)
-    filter_vcf = VCF(path_filtered_vcf)
-    # header = filter_vcf.raw_header
-    sample = filter_vcf.samples[1]
-    # read bed file
-    bed_intervals = tabix.open(path_bedfile)
-    full_v_count = process_vcf_file(full_vcf)
-    (
-        passed_v_count,
-        v_inside_exom,
-        v_outside_exom,
-        n_snps,
-        n_indels,
-        indels_length,
-        minimal_rp_snvs_exom,
-        minimal_rp_snvs_nexom,
-        limited_rp_snvs_exom,
-        limited_rp_snvs_nexom,
-        strong_rp_snvs_exom,
-        strong_rp_snvs_nexom,
-        mt_classes,
-        vaf,
-    ) = process_vcf_file(filter_vcf, bed_intervals, filter_file=True, padding=options.padding)
-    # After one for loop through vcf file. The VCF will automatically be closed.
-    # length_exoms = calculate_exoms_length(bed_intervals)
-    # for variant in filter_vcf:
-    #     print(variant.format("AF"))
-    summary = {
-        "sample": sample,
-        "number_full_variants": full_v_count,
-        "number_passed_variants": passed_v_count,
-        "number_snvs_inside_exom": v_inside_exom,
-        "number_snvs_outside_exom": v_outside_exom,
-        "number_snps": n_snps,
-        "number_indels": n_indels,
-        "indels_length": indels_length,
-        "minimal_rp_snvs_exom": minimal_rp_snvs_exom,
-        "minimal_rp_snvs_nexom": minimal_rp_snvs_nexom,
-        "limited_rp_snvs_exom": limited_rp_snvs_exom,
-        "limited_rp_snvs_nexom": limited_rp_snvs_nexom,
-        "strong_rp_snvs_exom": strong_rp_snvs_exom,
-        "strong_rp_snvs_nexom": strong_rp_snvs_nexom,
-        "mutation_classes": mt_classes,
-        "classes": classes,
-        "VAF": vaf,
-    }
-    json_object = json.dumps(summary)
-
-    with open(outpath, "w") as outfile:
-        outfile.write(json_object)
-
-
-if __name__ == "__main__":
-    main()

From de363703351bcf851edcb7839a4d4634e73cbf85 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Mon, 3 Jul 2023 20:48:04 +0200
Subject: [PATCH 04/17] adding test for somatic_variant_checking

---
 .../summarize-vcf.py                          |   3 +-
 ...test_workflows_somatic_variant_checking.py | 187 ++++++++++++++++++
 2 files changed, 188 insertions(+), 2 deletions(-)
 create mode 100644 tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py

diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
index 09b8f9bfa..051c41921 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
@@ -1,7 +1,6 @@
 # import packages
-
-import json  # for writing out the json file
 from cyvcf2 import VCF  # for reading vcf file
+import json  # for writing out the json file
 import gzip
 import tabix
 import argparse
diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py
new file mode 100644
index 000000000..378a2fdfe
--- /dev/null
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py
@@ -0,0 +1,187 @@
+# -*- coding: utf-8 -*-
+"""Tests for the somatic_variant_calling workflow module code"""
+
+import textwrap
+
+import pytest
+import ruamel.yaml as ruamel_yaml
+
+from snappy_pipeline.workflows.somatic_variant_checking import (
+    SomaticVariantQCWorkflow,
+)
+
+from .common import get_expected_log_files_dict, get_expected_output_json_files_dict
+from .conftest import patch_module_fs
+
+
+@pytest.fixture(scope="module")  # otherwise: performance issues
+def minimal_config():
+    """Return YAML parsing result for configuration"""
+    yaml = ruamel_yaml.YAML()
+    return yaml.load(
+        textwrap.dedent(
+            r"""
+        static_data_config:
+          reference:
+            path: /path/to/ref.fa
+          cosmic:
+            path: /path/to/cosmic.vcf.gz
+          dbsnp:
+            path: /path/to/dbsnp.vcf.gz
+
+        step_config:
+          ngs_mapping:
+            tools:
+              dna: ['bwa']
+            compute_coverage_bed: true
+            path_target_regions: /path/to/regions.bed
+            bwa:
+              path_index: /path/to/bwa/index.fa
+
+          somatic_variant_calling:
+            tools:
+            - mutect2
+            - scalpel
+            scalpel:
+              path_target_regions: /path/to/target/regions.bed
+
+          somatic_variant_checking:
+            path_somatic_variant_calling: ../somatic_variant_calling   
+            tools_ngs_mapping: []      
+            tools_somatic_variant_calling: []  
+            target_regions: /path/to/regions.bed
+
+        data_sets:
+          first_batch:
+            file: sheet.tsv
+            search_patterns:
+            - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'}
+            search_paths: ['/path']
+            type: matched_cancer
+            naming_scheme: only_secondary_id
+        """
+        ).lstrip()
+    )
+
+
+@pytest.fixture
+def somatic_variant_checking_workflow(
+    dummy_workflow,
+    minimal_config,
+    config_lookup_paths,
+    work_dir,
+    config_paths,
+    cancer_sheet_fake_fs,
+    mocker,
+):
+    """Return SomaticVariantCheckingWorkflow object pre-configured with cancer sheet"""
+    # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker)
+    # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we
+    # can obtain paths from the function as if we really had a NGSMappingPipelineStep there
+    dummy_workflow.globals = {
+        "ngs_mapping": lambda x: "NGS_MAPPING/" + x,
+        "somatic_variant_calling": lambda x: "SOMATIC_VARIANT_CALLING/" + x,
+    }
+    # Construct the workflow object
+    return SomaticVariantQCWorkflow(
+        dummy_workflow,
+        minimal_config,
+        config_lookup_paths,
+        config_paths,
+        work_dir,
+    )
+
+# Tests for SomaticVariantCheckingStepPart -----------------------------------------------------
+
+
+def test_somatic_variant_checking_step_part_get_input_files(somatic_variant_checking_workflow):
+    """Test SomaticVariantCheckingStepPart.get_input_files()"""
+    base_out = (
+        "SOMATIC_VARIANT_CALLING/output/{mapper}.{var_caller}.{tumor_library}/out/"
+        "{mapper}.{var_caller}.{tumor_library}"
+    )
+    expected = {
+        "full_vcf": base_out + ".full.vcf.gz",
+        "full_vcf_tbi": base_out + ".full.vcf.gz.tbi",
+        "passed_vcf": base_out + ".vcf.gz",
+        "passed_vcf_tbi": base_out + ".vcf.gz.tbi",
+    }
+    actual = somatic_variant_checking_workflow.get_input_files("SVQC_step", "run")
+    assert actual == expected
+
+
+def test_somatic_variant_checking_step_part_get_output_files(somatic_variant_checking_workflow):
+    """Tests SomaticVariantCheckingStepPart.get_output_files()"""
+    base_out = (
+        "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/out/"
+        "{mapper}.{var_caller}.variantsqc.{tumor_library}"
+    )
+    expected = get_expected_output_json_files_dict(base_out=base_out)
+    actual = somatic_variant_checking_workflow.get_output_files("SVQC_step", "run")
+    assert actual == expected
+
+
+def test_somatic_variant_checking_step_part_get_log_files(somatic_variant_checking_workflow):
+    """Tests SomaticVariantCheckingStepPart.get_log_files()"""
+    base_out = (
+        "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/"
+        "{mapper}.{var_caller}.variantsqc.{tumor_library}"
+    )
+    expected = get_expected_log_files_dict(base_out=base_out)
+    actual = somatic_variant_checking_workflow.get_log_file("SVQC_step", "run")
+    assert actual == expected
+  
+
+def test_somatic_variant_checking_step_part_get_resource_usage(
+    somatic_variant_checking_workflow,
+):
+    """Tests SomaticVariantCheckingStepPart.get_resource_usage()"""
+    # Define expected
+    expected_dict = {"threads": 2, "time": "1:00:00", "memory": "4096M"}
+    # Evaluate
+    for resource, expected in expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}'."
+        actual = somatic_variant_checking_workflow.get_resource("SVQC_step", "run", resource)
+        assert actual == expected, msg_error
+
+
+# Tests for SomaticVariantCheckingWorkflow -------------------------------------------------------
+
+
+def test_somatic_variant_checking_workflow(somatic_variant_checking_workflow):
+    """Test simple functionality of the workflow"""
+    # Check created sub steps
+    expected = ["SVQC_step","link_out"]
+    actual = list(sorted(somatic_variant_checking_workflow.sub_steps.keys()))
+    assert actual == expected
+
+    # Check result file construction
+    tpl = (
+        "output/{mapper}.{var_caller}.variantsqc.P00{i}-T{t}-DNA1-WGS1/{dir_}/"
+        "{mapper}.{var_caller}.variantsqc.P00{i}-T{t}-DNA1-WGS1.{ext}"
+    )
+    expected = [
+        tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext, dir_="out")
+        for i, t in ((1, 1), (2, 1), (2, 2))
+        for ext in ("json", "json.md5")
+        for mapper in ("bwa",)
+        for var_caller in ("mutect2", "scalpel")
+    ]
+    expected += [
+        tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext, dir_="log")
+        for i, t in ((1, 1), (2, 1), (2, 2))
+        for ext in (
+            "conda_info.txt",
+            "conda_list.txt",
+            "log",
+            "conda_info.txt.md5",
+            "conda_list.txt.md5",
+            "log.md5",
+        )
+        for mapper in ("bwa",)
+        for var_caller in ("mutect2", "scalpel")
+    ]
+    expected = list(sorted(expected))
+    actual = list(sorted(somatic_variant_checking_workflow.get_result_files()))
+    assert expected == actual

From 09d808877a18b8a8c566cc1708aed10cc8611aaf Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Mon, 3 Jul 2023 20:56:24 +0200
Subject: [PATCH 05/17] format checking

---
 .../test_workflows_somatic_variant_checking.py        | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py
index 378a2fdfe..a1c539bd6 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py
@@ -46,9 +46,9 @@ def minimal_config():
               path_target_regions: /path/to/target/regions.bed
 
           somatic_variant_checking:
-            path_somatic_variant_calling: ../somatic_variant_calling   
-            tools_ngs_mapping: []      
-            tools_somatic_variant_calling: []  
+            path_somatic_variant_calling: ../somatic_variant_calling
+            tools_ngs_mapping: []
+            tools_somatic_variant_calling: []
             target_regions: /path/to/regions.bed
 
         data_sets:
@@ -92,6 +92,7 @@ def somatic_variant_checking_workflow(
         work_dir,
     )
 
+
 # Tests for SomaticVariantCheckingStepPart -----------------------------------------------------
 
 
@@ -131,7 +132,7 @@ def test_somatic_variant_checking_step_part_get_log_files(somatic_variant_checki
     expected = get_expected_log_files_dict(base_out=base_out)
     actual = somatic_variant_checking_workflow.get_log_file("SVQC_step", "run")
     assert actual == expected
-  
+
 
 def test_somatic_variant_checking_step_part_get_resource_usage(
     somatic_variant_checking_workflow,
@@ -152,7 +153,7 @@ def test_somatic_variant_checking_step_part_get_resource_usage(
 def test_somatic_variant_checking_workflow(somatic_variant_checking_workflow):
     """Test simple functionality of the workflow"""
     # Check created sub steps
-    expected = ["SVQC_step","link_out"]
+    expected = ["SVQC_step", "link_out"]
     actual = list(sorted(somatic_variant_checking_workflow.sub_steps.keys()))
     assert actual == expected
 

From 73416b04584ec2cdd15f20287742ff7ed5ef7a3d Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Mon, 3 Jul 2023 21:02:16 +0200
Subject: [PATCH 06/17] fixing format

---
 .../wrappers/somatic_variants_checking/summarize-vcf.py    | 7 ++++---
 .../wrappers/somatic_variants_checking/wrapper.py          | 1 +
 .../workflows/test_workflows_somatic_variant_checking.py   | 4 +---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
index 051c41921..38d634d1a 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
@@ -1,9 +1,10 @@
 # import packages
-from cyvcf2 import VCF  # for reading vcf file
-import json  # for writing out the json file
+import argparse
 import gzip
+import json  # for writing out the json file
+
+from cyvcf2 import VCF  # for reading vcf file
 import tabix
-import argparse
 
 parser = argparse.ArgumentParser(description="Gathering information of variants")
 parser.add_argument(
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
index c7b0239cc..13295af50 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
@@ -3,6 +3,7 @@
 somatic variant calling step
 """
 import os
+
 from snakemake.shell import shell
 
 __author__ = "Pham Gia Cuong"
diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py
index a1c539bd6..2882fbd2b 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py
@@ -6,9 +6,7 @@
 import pytest
 import ruamel.yaml as ruamel_yaml
 
-from snappy_pipeline.workflows.somatic_variant_checking import (
-    SomaticVariantQCWorkflow,
-)
+from snappy_pipeline.workflows.somatic_variant_checking import SomaticVariantQCWorkflow
 
 from .common import get_expected_log_files_dict, get_expected_output_json_files_dict
 from .conftest import patch_module_fs

From c30f22e4affa1c4f34ea8d7c76cbcf38d654615c Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Sat, 8 Jul 2023 23:37:10 +0200
Subject: [PATCH 07/17] Finalizing somatic variant checking

---
 .../somatic_variant_checking/__init__.py      |  14 +-
 .../workflows/somatic_variant_qc/Snakefile    |  64 -----
 .../workflows/somatic_variant_qc/__init__.py  | 223 ------------------
 .../tumor_mutational_burden/__init__.py       |   6 +-
 .../wrappers/bcftools/TMB/wrapper.py          |  24 +-
 .../environment.yaml                          |   4 +-
 .../summarize-vcf.py                          | 125 +++++-----
 .../somatic_variants_checking/wrapper.py      |   1 +
 8 files changed, 90 insertions(+), 371 deletions(-)
 delete mode 100644 snappy_pipeline/workflows/somatic_variant_qc/Snakefile
 delete mode 100644 snappy_pipeline/workflows/somatic_variant_qc/__init__.py

diff --git a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
index 5c1367333..88254465f 100644
--- a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
+++ b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
@@ -27,10 +27,11 @@
         tools_ngs_mapping: []      # default to those configured for ngs_mapping
         tools_somatic_variant_calling: []  # default to those configured for somatic_variant_calling
         target_regions: # REQUIRED
-        padding: 0  #Used for count the number of variants outside of exom + padding
-        ignore_regions: []
-        minimal_support_read: 1
-        limited_support_read: 5
+        padding: 0  # Used for count the number of variants outside of exom + padding
+        AF_ID: 'AF' # REQUIRED ID of allele frequency field used in vcf file
+        ignore_regions: "" # hard mapping regions
+        minimal_support_read: 1 # threshold for defining a variant that has minimal support reads
+        limited_support_read: 5 # threshold for defining a variant that has limited support reads
 """
 
 
@@ -189,10 +190,7 @@ def _yield_result_files_matched(self, tpl, **kwargs):
         """
         for sheet in filter(is_not_background, self.shortcut_sheets):
             for sample_pair in sheet.all_sample_pairs:
-                if (
-                    not sample_pair.tumor_sample.dna_ngs_library
-                    or not sample_pair.normal_sample.dna_ngs_library
-                ):
+                if not sample_pair.tumor_sample.dna_ngs_library:
                     msg = (
                         "INFO: sample pair for cancer bio sample {} has is missing primary"
                         "normal or primary cancer NGS library"
diff --git a/snappy_pipeline/workflows/somatic_variant_qc/Snakefile b/snappy_pipeline/workflows/somatic_variant_qc/Snakefile
deleted file mode 100644
index 2ac7ab432..000000000
--- a/snappy_pipeline/workflows/somatic_variant_qc/Snakefile
+++ /dev/null
@@ -1,64 +0,0 @@
-# -*- coding: utf-8 -*-
-"""CUBI Pipeline tumor mutational burden step Snakefile"""
-
-import os
-
-from snappy_pipeline import expand_ref
-from snappy_pipeline.workflows.somatic_variant_qc import (
-    SomaticVariantQCWorkflow,
-)
-
-__author__ = "Gia Cuong Pham"
-__email__ = "pham.gia-cuong@bih-charite.de"
-
-
-# Configuration ===============================================================
-
-
-configfile: "config.yaml"
-
-
-# Expand "$ref" JSON pointers in configuration (also works for YAML)
-config, lookup_paths, config_paths = expand_ref("config.yaml", config)
-
-# WorkflowImpl Object Setup ===================================================
-wf = SomaticVariantQCWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())
-
-
-localrules:
-    # Linking files from work/ to output/ should be done locally
-    somatic_variant_qc_link_out_run,
-
-
-rule all:
-    input:
-        wf.get_result_files(),
-
-
-# Generic linking out ---------------------------------------------------------
-
-
-rule somatic_variant_qc_link_out_run:
-    input:
-        wf.get_input_files("link_out", "run"),
-    output:
-        wf.get_output_files("link_out", "run"),
-    run:
-        shell(wf.get_shell_cmd("link_out", "run", wildcards))
-
-
-rule somatic_variant_qc:
-    input:
-        **wf.get_input_files("SVQC_step", "run"),
-    output:
-        **wf.get_output_files("SVQC_step", "run"),
-    threads: wf.get_resource("SVQC_step", "run", "threads")
-    resources:
-        time=wf.get_resource("SVQC_step", "run", "time"),
-        memory=wf.get_resource("SVQC_step", "run", "memory"),
-        partition=wf.get_resource("SVQC_step", "run", "partition"),
-        tmpdir=wf.get_resource("SVQC_step", "run", "tmpdir"),
-    log:
-        **wf.get_log_file("SVQC_step", "run"),
-    wrapper:
-        wf.wrapper_path("somatic_variants_qc")
diff --git a/snappy_pipeline/workflows/somatic_variant_qc/__init__.py b/snappy_pipeline/workflows/somatic_variant_qc/__init__.py
deleted file mode 100644
index 5ce9c4349..000000000
--- a/snappy_pipeline/workflows/somatic_variant_qc/__init__.py
+++ /dev/null
@@ -1,223 +0,0 @@
-from collections import OrderedDict
-import os
-import sys
-
-from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
-from snakemake.io import expand
-
-from snappy_pipeline.base import UnsupportedActionException
-from snappy_pipeline.utils import dictify, listify
-from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart
-from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage
-from snappy_pipeline.workflows.somatic_variant_calling import (
-    SOMATIC_VARIANT_CALLERS_MATCHED,
-    SomaticVariantCallingWorkflow,
-)
-
-#: Extensions of files to create as main payload
-EXT_VALUES = (".json", ".json.md5")
-
-#: Names of the files to create for the extension
-EXT_NAMES = ("json", "json_md5")
-
-#: Default configuration for the tmb calculation step
-DEFAULT_CONFIG = r"""
-step_config:
-    somatic_variant_qc:
-        path_somatic_variant_calling: ../somatic_variant_calling   # REQUIRED
-        tools_ngs_mapping: []      # default to those configured for ngs_mapping
-        tools_somatic_variant_calling: []  # default to those configured for somatic_variant_calling
-        target_regions: # REQUIRED
-        padding: 0  #Used for count the number of variants outside of exom + padding
-        repeated_regions: [] #need to confirm
-"""
-
-
-class SomaticVariantQCStepPart(BaseStepPart):
-    """"""
-
-    name = "SVQC_step"
-
-    actions = ("run",)
-
-    def __init__(self, parent):
-        super().__init__(parent)
-        self.tumor_ngs_library_to_sample_pair = OrderedDict()
-        for sheet in self.parent.shortcut_sheets:
-            # update function of OrderedDict
-            self.tumor_ngs_library_to_sample_pair.update(
-                sheet.all_sample_pairs_by_tumor_dna_ngs_library
-            )
-        # Build mapping from donor name to donor.
-        self.donors = OrderedDict()
-        for sheet in self.parent.shortcut_sheets:
-            for donor in sheet.donors:
-                self.donors[donor.name] = donor
-
-    @dictify
-    def get_input_files(self, action):
-        self._validate_action(action)
-        tpl = (
-            "output/{mapper}.{var_caller}.{tumor_library}/out/"
-            "{mapper}.{var_caller}.{tumor_library}"
-        )
-        key_ext = {
-            "full_vcf": ".full.vcf.gz",
-            "full_vcf_tbi": ".full.vcf.gz.tbi",
-            "passed_vcf": ".vcf.gz",
-            "passed_vcf_tbi": ".vcf.gz.tbi",
-        }
-        variant_calling = self.parent.sub_workflows["somatic_variant_calling"]
-        for key, ext in key_ext.items():
-            yield key, variant_calling(tpl + ext)
-
-    @dictify
-    def get_output_files(self, action):
-        # Validate action
-        self._validate_action(action)
-        prefix = (
-            "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/out/"
-            "{mapper}.{var_caller}.variantsqc.{tumor_library}"
-        )
-        key_ext = {"json": ".json"}
-        for key, ext in key_ext.items():
-            yield key, prefix + ext
-            yield key + "_md5", prefix + ext + ".md5"
-
-    @dictify
-    def _get_log_file(self, action):
-        assert action == "run"
-        prefix = (
-            "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/"
-            "{mapper}.{var_caller}.variantsqc.{tumor_library}"
-        )
-
-        key_ext = (
-            ("log", ".log"),
-            ("conda_info", ".conda_info.txt"),
-            ("conda_list", ".conda_list.txt"),
-        )
-        for key, ext in key_ext:
-            yield key, prefix + ext
-
-    def get_resource_usage(self, action):
-        """Get Resource Usage
-        :param action: Action (i.e., step) in the workflow, example: 'run'.
-        :type action: str
-        :return: Returns ResourceUsage for step.
-        :raises UnsupportedActionException: if action not in class defined list of valid actions.
-        """
-        if action not in self.actions:
-            actions_str = ", ".join(self.actions)
-            error_message = f"Action '{action}' is not supported. Valid options: {actions_str}"
-            raise UnsupportedActionException(error_message)
-        mem_mb = 4 * 1024  # 4GB
-        return ResourceUsage(
-            threads=4,
-            time="1:00:00",  # 1 hour
-            memory=f"{mem_mb}M",
-        )
-
-
-class SomaticVariantQCWorkflow(BaseStep):
-    """Perform gathering variant information"""
-
-    name = "somaticvariantqc"
-    sheet_shortcut_class = CancerCaseSheet
-    sheet_shortcut_kwargs = {
-        "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
-    }
-
-    @classmethod
-    def default_config_yaml(cls):
-        """Return default config YAML, to be overwritten by project-specific one."""
-        return DEFAULT_CONFIG
-
-    def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir):
-        super().__init__(
-            workflow,
-            config,
-            config_lookup_paths,
-            config_paths,
-            workdir,
-            (SomaticVariantCallingWorkflow, NgsMappingWorkflow),
-        )
-        # Register sub step classes so the sub steps are available
-        self.register_sub_step_classes((SomaticVariantQCStepPart, LinkOutStepPart))
-        # Register sub workflows
-        self.register_sub_workflow(
-            "somatic_variant_calling",
-            self.w_config["step_config"]["somatic_variant_qc"]["path_somatic_variant_calling"],
-        )
-        # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
-        if not self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"]:
-            self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"] = self.w_config[
-                "step_config"
-            ]["ngs_mapping"]["tools"]["dna"]
-        if not self.w_config["step_config"]["somatic_variant_qc"]["tools_somatic_variant_calling"]:
-            self.w_config["step_config"]["somatic_variant_qc"][
-                "tools_somatic_variant_calling"
-            ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"]
-
-    @listify
-    def get_result_files(self):
-        callers = set(
-            self.w_config["step_config"]["somatic_variant_qc"]["tools_somatic_variant_calling"]
-        )
-        name_pattern = "{mapper}.{caller}.variantsqc.{tumor_library.name}"
-        yield from self._yield_result_files_matched(
-            os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
-            mapper=self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"],
-            caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
-            ext=EXT_VALUES,
-        )
-        yield from self._yield_result_files_matched(
-            os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
-            mapper=self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"],
-            caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
-            ext=(
-                ".log",
-                ".log.md5",
-                ".conda_info.txt",
-                ".conda_info.txt.md5",
-                ".conda_list.txt",
-                ".conda_list.txt.md5",
-            ),
-        )
-
-    def _yield_result_files_matched(self, tpl, **kwargs):
-        """Build output paths from path template and extension list.
-
-        This function returns the results from the matched somatic variant callers such as
-        Mutect.
-        """
-        for sheet in filter(is_not_background, self.shortcut_sheets):
-            for sample_pair in sheet.all_sample_pairs:
-                if (
-                    not sample_pair.tumor_sample.dna_ngs_library
-                    or not sample_pair.normal_sample.dna_ngs_library
-                ):
-                    msg = (
-                        "INFO: sample pair for cancer bio sample {} has is missing primary"
-                        "normal or primary cancer NGS library"
-                    )
-                    print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
-                    continue
-                yield from expand(
-                    tpl,
-                    tumor_library=[sample_pair.tumor_sample.dna_ngs_library],
-                    **kwargs,
-                )
-
-    def check_config(self):
-        """Check that the path to the NGS mapping is present"""
-        self.ensure_w_config(
-            ("step_config", "somatic_variant_qc", "path_somatic_variant_calling"),
-            "Path to variant calling not configured but required for somatic variant qc",
-        )
-
-        self.ensure_w_config(
-            ("step_config", "somatic_variant_qc", "target_regions"),
-            "Path to target_regions file (bed format)"
-            "not configured but required for somatic variant qc",
-        )
diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
index c7176d8fa..5b0d893c8 100644
--- a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
+++ b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
@@ -7,8 +7,8 @@
 
 from snappy_pipeline.base import UnsupportedActionException
 from snappy_pipeline.utils import dictify, listify
-from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart
-from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage
+from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart,ResourceUsage
+from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow
 from snappy_pipeline.workflows.somatic_variant_calling import (
     SOMATIC_VARIANT_CALLERS_MATCHED,
     SomaticVariantCallingWorkflow,
@@ -134,7 +134,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
             config_lookup_paths,
             config_paths,
             workdir,
-            (SomaticVariantCallingWorkflow, NgsMappingWorkflow),
+            (SomaticVariantCallingWorkflow, NgsMappingWorkflow), 
         )
         # Register sub step classes so the sub steps are available
         self.register_sub_step_classes((TumorMutationalBurdenCalculationStepPart, LinkOutStepPart))
diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
index 1408cff2a..70213e6a7 100644
--- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
+++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
@@ -39,18 +39,18 @@
 
 TMB=`echo "1000000*($number_variants/$total_exom_length)" | bc -l `
 cat << EOF > {snakemake.output.json}
-{
-"Library_name": {snakemake.wildcards.tumor_library},
-"VCF_file": $name_vcf,
-"VCF_md5": $vcf_md5,
-"BED_file": $bed_file_name,
-"BED_md5": $bed_md5,
-"TMB": $TMB,
-"Number_variants": $number_variants,
-"Number_snvs": $number_snvs,
-"Number_indels": $number_indels,
-"Total_regions_length": $total_exom_length
-}
+{{
+  "Library_name": "{snakemake.wildcards.tumor_library}",
+  "VCF_file": "$name_vcf",
+  "VCF_md5": "$vcf_md5",
+  "BED_file": "$bed_file_name",
+  "BED_md5": "$bed_md5",
+  "TMB": "$TMB",
+  "Number_variants": "$number_variants",
+  "Number_snvs": "$number_snvs",
+  "Number_indels": "$number_indels",
+  "Total_regions_length": "$total_exom_length"
+}}
 EOF
 pushd $(dirname {snakemake.output.json})
 md5sum $(basename {snakemake.output.json}) > $(basename {snakemake.output.json_md5})
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml b/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml
index beb5b1f3f..80b4fa59b 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml
@@ -3,6 +3,4 @@ channels:
   - bioconda
 dependencies:
   - cyvcf2
-  - pip=20.0.2
-  - pip:
-    - pytabix
+  - pytabix
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
index 38d634d1a..cc150a2d8 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
@@ -6,49 +6,11 @@
 from cyvcf2 import VCF  # for reading vcf file
 import tabix
 
-parser = argparse.ArgumentParser(description="Gathering information of variants")
-parser.add_argument(
-    "--rawvcf",
-    help="raw vcf file",
-)
-parser.add_argument(
-    "--filtered-vcf",
-    help="filtered vcf file",
-)
-parser.add_argument(
-    "--exom-bedfile",
-    help="exom regions bed file",
-)
-parser.add_argument("--padding", help="it's padding", default=0)
-parser.add_argument(
-    "--ignore-regions",
-    help="bed file contains repeated or difficult to map regions",
-)
-parser.add_argument(
-    "-m",
-    "--minimal",
-    type=int,
-    default=1,
-    help="threshold for defining a variant that has minimal support reads",
-)
-parser.add_argument(
-    "-l",
-    "--limited",
-    type=int,
-    default=5,
-    help="threshold for defining a variant that has limited support reads",
-)
-parser.add_argument(
-    "--output",
-    help="json file contains information about vcf file",
-)
-args = parser.parse_args()
-
 
 def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0):
     try:
         tmp = []
-        records = bed_intervals.query(v_chrom, v_start - padding, v_end + padding)
+        records = bed_intervals.query(v_chrom, v_start - int(padding), v_end + int(padding))
         for record in records:
             tmp.append(record)
         if len(tmp) >= 1:
@@ -81,7 +43,7 @@ def get_variant_type(ref, alt):
 
 
 def check_sp_read(variant, minimal, limited):
-    dp = variant.INFO["DP"]
+    dp = variant.format("AD")[1][0]
     if dp <= minimal:
         return "minimal"
     elif (dp > minimal) and (dp <= limited):
@@ -118,6 +80,7 @@ def process_vcf_file(
     hard_file="",
     filter_file=False,
     padding=0,
+    id_af="AF",
 ):
     infor = {
         "total_v_count": 0,
@@ -145,10 +108,10 @@ def process_vcf_file(
                     raise ValueError(
                         "Multiallelic variants are not supported"
                     )  # checking for multiallelic
-                infor["vaf"].append(float(variant.format("AF")[1][0]))
+                infor["vaf"].append(float(variant.format(id_af)[1][0]))
                 # Counting variants in or out of exom and variants with different support reads
                 if check_variant_in_bed(
-                    variant.CHROM, variant.start, variant.end, bed_file, padding=0
+                    variant.CHROM, variant.start, variant.end, bed_file, padding
                 ):
                     infor["v_inside_exom"] += 1
                     infor[str(check_sp_read(variant, minimal, limited)) + "_rp_snvs_exom"] += 1
@@ -168,7 +131,7 @@ def process_vcf_file(
 
             if variant.CHROM in hard_contigs:
                 if check_variant_in_bed(
-                    variant.CHROM, variant.start, variant.end, hard_file, padding=0
+                    variant.CHROM, variant.start, variant.end, hard_file, padding
                 ):
                     infor["v_in_hard_regions"] += 1
         return infor
@@ -179,6 +142,47 @@ def process_vcf_file(
         return total_v_count
 
 
+parser = argparse.ArgumentParser(description="Gathering information of variants")
+parser.add_argument(
+    "--rawvcf",
+    help="vcf file containing all called somatic variants, before filtration",
+    required=True,
+)
+parser.add_argument(
+    "--filtered-vcf", help="vcf file containing somatic variants passing all filters", required=True
+)
+parser.add_argument("--exom-bedfile", help="exom regions bed file", required=True)
+parser.add_argument("--padding", type=int, help="Size of padding around exom regions", default=0)
+parser.add_argument(
+    "--ignore-regions",
+    help="bed file contains repeated or difficult to map regions",
+)
+parser.add_argument(
+    "--AF",
+    default="AF",
+    help="ID of allele frequency in the vcf file",
+)
+parser.add_argument(
+    "-m",
+    "--minimal",
+    type=int,
+    default=1,
+    help="threshold for defining a variant that has minimal support reads",
+)
+parser.add_argument(
+    "-l",
+    "--limited",
+    type=int,
+    default=5,
+    help="threshold for defining a variant that has limited support reads",
+)
+parser.add_argument(
+    "--output",
+    help="json file contains information about vcf file, if not given, the tool will print it out",
+)
+args = parser.parse_args()
+
+
 def main():
     classes = ["T>G", "T>C", "T>A", "C>G", "C>T", "C>A"]
     # reading input
@@ -204,6 +208,7 @@ def main():
             hard_intervals,
             filter_file=True,
             padding=args.padding,
+            id_af=args.AF,
         )
     else:
         infor = process_vcf_file(
@@ -216,23 +221,24 @@ def main():
             "",
             filter_file=True,
             padding=args.padding,
+            id_af=args.AF,
         )
 
     summary = {
         "sample": sample,
-        "number_full_variants": full_v_count,
-        "number_passed_variants": infor["total_v_count"],
-        "number_snvs_inside_exom": infor["v_inside_exom"],
-        "number_snvs_outside_exom": infor["v_outside_exom"],
-        "number_snvs_in_hard_regions": infor["v_in_hard_regions"],
-        "number_snps": infor["n_snps"],
+        "total_variants_number": full_v_count,
+        "number_variants_passing_filters": infor["total_v_count"],
+        "number_variants_in_enrichment_regions": infor["v_inside_exom"],
+        "number_variants_outside_enrichment_regions": infor["v_outside_exom"],
+        "number_variants_in_masked_regions": infor["v_in_hard_regions"],
+        "number_snvs": infor["n_snps"],
         "number_indels": infor["n_indels"],
-        "minimal_rp_snvs_exom": infor["minimal_rp_snvs_exom"],
-        "minimal_rp_snvs_nexom": infor["minimal_rp_snvs_nexom"],
-        "limited_rp_snvs_exom": infor["limited_rp_snvs_exom"],
-        "limited_rp_snvs_nexom": infor["limited_rp_snvs_nexom"],
-        "strong_rp_snvs_exom": infor["strong_rp_snvs_exom"],
-        "strong_rp_snvs_nexom": infor["strong_rp_snvs_nexom"],
+        "number_variants_in_enriched_with_minimal_support": infor["minimal_rp_snvs_exom"],
+        "number_variants_outside_enriched_with_minimal_support": infor["minimal_rp_snvs_nexom"],
+        "number_variants_in_enriched_with_limited_support": infor["limited_rp_snvs_exom"],
+        "number_variants_outside_enriched_with_limited_support": infor["limited_rp_snvs_nexom"],
+        "number_variants_in_enriched_with_strong_support": infor["strong_rp_snvs_exom"],
+        "number_variants_outside_enriched_with_strong_support": infor["strong_rp_snvs_nexom"],
         classes[0]: infor["mt_classes"][0],
         classes[1]: infor["mt_classes"][1],
         classes[2]: infor["mt_classes"][2],
@@ -245,9 +251,12 @@ def main():
     if not args.ignore_regions:
         del summary["number_snvs_in_hard_regions"]
     json_object = json.dumps(summary)
-
-    with open(args.output, "w") as outfile:
-        outfile.write(json_object)
+    if args.output:
+        with open(args.output, "w") as outfile:
+            outfile.write(json_object)
+        outfile.close()
+    else:
+        print(json_object)
 
 
 if __name__ == "__main__":
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
index 13295af50..72c686cbf 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
@@ -29,6 +29,7 @@
     --minimal {snakemake.config[step_config][somatic_variant_checking][minimal_support_read]} \
     --limited {snakemake.config[step_config][somatic_variant_checking][limited_support_read]} \
     --ignore-regions {snakemake.config[step_config][somatic_variant_checking][ignore_regions]} \
+    --AF {snakemake.config[step_config][somatic_variant_checking][AF_ID]} \
     --output {snakemake.output.json}
 
 pushd $(dirname {snakemake.output.json})

From 956904f15c127b8913c6cf15495207c5bcfbf482 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Sun, 9 Jul 2023 00:09:44 +0200
Subject: [PATCH 08/17] finalize somatic variants checking

---
 .../workflows/somatic_variant_checking/__init__.py       | 9 +++------
 .../wrappers/somatic_variants_checking/summarize-vcf.py  | 4 +++-
 .../wrappers/somatic_variants_checking/wrapper.py        | 6 +++---
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
index 88254465f..369d52a28 100644
--- a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
+++ b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
@@ -76,8 +76,8 @@ def get_output_files(self, action):
             yield key + "_md5", prefix + ext + ".md5"
 
     @dictify
-    def _get_log_file(self, action):
-        assert action == "run"
+    def _get_log_file(self, action): 
+        self._validate_action(action=action)
         prefix = (
             "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/"
             "{mapper}.{var_caller}.variantsqc.{tumor_library}"
@@ -98,10 +98,7 @@ def get_resource_usage(self, action):
         :return: Returns ResourceUsage for step.
         :raises UnsupportedActionException: if action not in class defined list of valid actions.
         """
-        if action not in self.actions:
-            actions_str = ", ".join(self.actions)
-            error_message = f"Action '{action}' is not supported. Valid options: {actions_str}"
-            raise UnsupportedActionException(error_message)
+        self._validate_action(action=action)
         mem_mb = 4 * 1024  # 4GB
         return ResourceUsage(
             threads=2,
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
index cc150a2d8..dc0bdaf5f 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
@@ -128,7 +128,9 @@ def process_vcf_file(
                     infor["n_indels"] += 1
                     infor["indels_length"].append(abs(len(variant.REF) - len(variant.ALT[0])))
             # Gathering information of variants in comparison to hard mapped regions
-
+            else:
+                infor["v_outside_exom"] += 1
+                
             if variant.CHROM in hard_contigs:
                 if check_variant_in_bed(
                     variant.CHROM, variant.start, variant.end, hard_file, padding
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
index 72c686cbf..856a254ea 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
@@ -40,8 +40,8 @@
 # Compute MD5 sums of logs
 shell(
     r"""
-md5sum {snakemake.log.log} > {snakemake.log.log_md5}
-md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5}
-md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5}
+md5sum $(basename {snakemake.log.log}) > {snakemake.log.log_md5}
+md5sum $(basename {snakemake.log.conda_list}) > {snakemake.log.conda_list_md5}
+md5sum $(basename {snakemake.log.conda_info}) > {snakemake.log.conda_info_md5}
 """
 )

From 4ce11597c8f849a0938c17bb3a09239ffded14a2 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Sun, 9 Jul 2023 00:33:59 +0200
Subject: [PATCH 09/17] Fix typo

---
 .../workflows/somatic_variant_checking/__init__.py   |  3 +--
 .../workflows/tumor_mutational_burden/__init__.py    | 12 ++++--------
 .../somatic_variants_checking/summarize-vcf.py       |  1 -
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
index 369d52a28..ba2f6e703 100644
--- a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
+++ b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
@@ -4,7 +4,6 @@
 from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
 from snakemake.io import expand
 
-from snappy_pipeline.base import UnsupportedActionException
 from snappy_pipeline.utils import dictify, listify
 from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart
 from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage
@@ -76,7 +75,7 @@ def get_output_files(self, action):
             yield key + "_md5", prefix + ext + ".md5"
 
     @dictify
-    def _get_log_file(self, action): 
+    def _get_log_file(self, action):
         self._validate_action(action=action)
         prefix = (
             "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/"
diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
index 5b0d893c8..6d094f12a 100644
--- a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
+++ b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
@@ -5,9 +5,8 @@
 from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
 from snakemake.io import expand
 
-from snappy_pipeline.base import UnsupportedActionException
 from snappy_pipeline.utils import dictify, listify
-from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart,ResourceUsage
+from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart, ResourceUsage
 from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow
 from snappy_pipeline.workflows.somatic_variant_calling import (
     SOMATIC_VARIANT_CALLERS_MATCHED,
@@ -80,7 +79,7 @@ def get_output_files(self, action):
 
     @dictify
     def _get_log_file(self, action):
-        assert action == "run"
+        self._validate_action(action)
         prefix = (
             "work/{mapper}.{var_caller}.tmb.{tumor_library}/log/"
             "{mapper}.{var_caller}.tmb.{tumor_library}"
@@ -101,10 +100,7 @@ def get_resource_usage(self, action):
         :return: Returns ResourceUsage for step.
         :raises UnsupportedActionException: if action not in class defined list of valid actions.
         """
-        if action not in self.actions:
-            actions_str = ", ".join(self.actions)
-            error_message = f"Action '{action}' is not supported. Valid options: {actions_str}"
-            raise UnsupportedActionException(error_message)
+        self._validate_action(action)
         mem_mb = 4 * 1024  # 4GB
         return ResourceUsage(
             threads=2,
@@ -134,7 +130,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
             config_lookup_paths,
             config_paths,
             workdir,
-            (SomaticVariantCallingWorkflow, NgsMappingWorkflow), 
+            (SomaticVariantCallingWorkflow, NgsMappingWorkflow),
         )
         # Register sub step classes so the sub steps are available
         self.register_sub_step_classes((TumorMutationalBurdenCalculationStepPart, LinkOutStepPart))
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
index dc0bdaf5f..484c621a9 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
@@ -130,7 +130,6 @@ def process_vcf_file(
             # Gathering information of variants in comparison to hard mapped regions
             else:
                 infor["v_outside_exom"] += 1
-                
             if variant.CHROM in hard_contigs:
                 if check_variant_in_bed(
                     variant.CHROM, variant.start, variant.end, hard_file, padding

From 1f5b0e0b4f6ddcb43a6137d515661eb84f4c2991 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Sun, 9 Jul 2023 00:37:19 +0200
Subject: [PATCH 10/17] fixing typo

---
 .../workflows/tumor_mutational_burden/__init__.py          | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
index 6d094f12a..0384b8cdb 100644
--- a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
+++ b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
@@ -6,7 +6,12 @@
 from snakemake.io import expand
 
 from snappy_pipeline.utils import dictify, listify
-from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart, ResourceUsage
+from snappy_pipeline.workflows.abstract import (
+    BaseStep,
+    BaseStepPart,
+    LinkOutStepPart,
+    ResourceUsage,
+)
 from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow
 from snappy_pipeline.workflows.somatic_variant_calling import (
     SOMATIC_VARIANT_CALLERS_MATCHED,

From e99ef3c32df71299055dfd55f9ca42762b9fbd8d Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Thu, 13 Jul 2023 16:31:45 +0200
Subject: [PATCH 11/17] fixing small bugs

---
 .../workflows/somatic_variant_checking/__init__.py     |  2 +-
 snappy_wrappers/wrappers/bcftools/TMB/wrapper.py       | 10 +++++-----
 snappy_wrappers/wrappers/mantis/run/wrapper.py         |  2 +-
 .../somatic_variants_checking/summarize-vcf.py         |  8 ++++----
 .../wrappers/somatic_variants_checking/wrapper.py      | 10 ++++++----
 5 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
index ba2f6e703..d682274af 100644
--- a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
+++ b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py
@@ -27,7 +27,7 @@
         tools_somatic_variant_calling: []  # default to those configured for somatic_variant_calling
         target_regions: # REQUIRED
         padding: 0  # Used for count the number of variants outside of exom + padding
-        AF_ID: 'AF' # REQUIRED ID of allele frequency field used in vcf file
+        variant_allele_frequency_id: 'AF' # REQUIRED ID of allele frequency field used in vcf file
         ignore_regions: "" # hard mapping regions
         minimal_support_read: 1 # threshold for defining a variant that has minimal support reads
         limited_support_read: 5 # threshold for defining a variant that has limited support reads
diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
index 70213e6a7..04f48f398 100644
--- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
+++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
@@ -45,11 +45,11 @@
   "VCF_md5": "$vcf_md5",
   "BED_file": "$bed_file_name",
   "BED_md5": "$bed_md5",
-  "TMB": "$TMB",
-  "Number_variants": "$number_variants",
-  "Number_snvs": "$number_snvs",
-  "Number_indels": "$number_indels",
-  "Total_regions_length": "$total_exom_length"
+  "TMB": $TMB,
+  "Number_variants": $number_variants,
+  "Number_snvs": $number_snvs,
+  "Number_indels": $number_indels,
+  "Total_regions_length": $total_exom_length
 }}
 EOF
 pushd $(dirname {snakemake.output.json})
diff --git a/snappy_wrappers/wrappers/mantis/run/wrapper.py b/snappy_wrappers/wrappers/mantis/run/wrapper.py
index 82fa7b8b4..3f2bf4e74 100644
--- a/snappy_wrappers/wrappers/mantis/run/wrapper.py
+++ b/snappy_wrappers/wrappers/mantis/run/wrapper.py
@@ -33,7 +33,7 @@
 # but should also work for reasonable deep WGS according to them.
 # https://github.com/OSU-SRLab/MANTIS/issues/25
 
-python /fast/groups/cubi/projects/biotools/Mantis/MANTIS/mantis.py \
+python /fast/groups/cubi/work/projects/biotools/Mantis/MANTIS/mantis.py \
     -t {snakemake.input.tumor_bam}  \
     -n {snakemake.input.normal_bam} \
     --genome {snakemake.config[static_data_config][reference][path]} \
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
index 484c621a9..cc33c658f 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
@@ -43,7 +43,7 @@ def get_variant_type(ref, alt):
 
 
 def check_sp_read(variant, minimal, limited):
-    dp = variant.format("AD")[1][0]
+    dp = variant.format("AD")[1][1]
     if dp <= minimal:
         return "minimal"
     elif (dp > minimal) and (dp <= limited):
@@ -159,7 +159,7 @@ def process_vcf_file(
     help="bed file contains repeated or difficult to map regions",
 )
 parser.add_argument(
-    "--AF",
+    "--variant-allele-frequency-id",
     default="AF",
     help="ID of allele frequency in the vcf file",
 )
@@ -209,7 +209,7 @@ def main():
             hard_intervals,
             filter_file=True,
             padding=args.padding,
-            id_af=args.AF,
+            id_af=args.variant_allele_frequency_id,
         )
     else:
         infor = process_vcf_file(
@@ -222,7 +222,7 @@ def main():
             "",
             filter_file=True,
             padding=args.padding,
-            id_af=args.AF,
+            id_af=args.variant_allele_frequency_id,
         )
 
     summary = {
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
index 856a254ea..6279dba4a 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
@@ -29,19 +29,21 @@
     --minimal {snakemake.config[step_config][somatic_variant_checking][minimal_support_read]} \
     --limited {snakemake.config[step_config][somatic_variant_checking][limited_support_read]} \
     --ignore-regions {snakemake.config[step_config][somatic_variant_checking][ignore_regions]} \
-    --AF {snakemake.config[step_config][somatic_variant_checking][AF_ID]} \
+    --variant-allele-frequency-id {snakemake.config[step_config][somatic_variant_checking][variant_allele_frequency_id]} \
     --output {snakemake.output.json}
 
 pushd $(dirname {snakemake.output.json})
 md5sum $(basename {snakemake.output.json}) > $(basename {snakemake.output.json_md5})
+popd
     """
 )
 
 # Compute MD5 sums of logs
 shell(
     r"""
-md5sum $(basename {snakemake.log.log}) > {snakemake.log.log_md5}
-md5sum $(basename {snakemake.log.conda_list}) > {snakemake.log.conda_list_md5}
-md5sum $(basename {snakemake.log.conda_info}) > {snakemake.log.conda_info_md5}
+pushd $(dirname {snakemake.log.log})
+md5sum $(basename {snakemake.log.log}) > $(basename {snakemake.log.log_md5})
+md5sum $(basename {snakemake.log.conda_list}) > $(basename {snakemake.log.conda_list_md5})
+md5sum $(basename {snakemake.log.conda_info}) > $(basename {snakemake.log.conda_info_md5})
 """
 )

From 62b6e6f539765e5c7541763786360fe1628b1c77 Mon Sep 17 00:00:00 2001
From: ericblanc20 <eric.blanc@bihealth.de>
Date: Tue, 25 Jul 2023 14:51:37 +0200
Subject: [PATCH 12/17] fix: json format doesn't accept floats starting with .

- [Unintuitive JSON parsing](https://nullprogram.com/blog/2019/12/28/)
- [Dollar-parenthesis to be preferred to backticks for POSIX-compliance](https://stackoverflow.com/questions/9405478/command-substitution-backticks-or-dollar-sign-paren-enclosed)
---
 snappy_wrappers/wrappers/bcftools/TMB/wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
index 04f48f398..7dc086f9e 100644
--- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
+++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
@@ -37,7 +37,7 @@
 number_indels=$(bcftools view -R $bed_file -v indels --threads 2 -H {snakemake.input.vcf}| wc -l)
 number_variants=$(bcftools view -R $bed_file --threads 2 -H {snakemake.input.vcf}| wc -l)
 
-TMB=`echo "1000000*($number_variants/$total_exom_length)" | bc -l `
+TMB=$(echo "1000000*($number_variants/$total_exom_length)" | bc -l | xargs printf "%.17g")
 cat << EOF > {snakemake.output.json}
 {{
   "Library_name": "{snakemake.wildcards.tumor_library}",

From 5d5246a62bc331cac2efc4ed88883aa6190f47b6 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Thu, 27 Jul 2023 14:22:55 +0200
Subject: [PATCH 13/17] Finalizing somatic variant checking

---
 .../wrappers/bcftools/TMB/wrapper.py          |  2 +-
 .../summarize-vcf.py                          | 44 +++++++++++++------
 .../somatic_variants_checking/wrapper.py      |  1 +
 3 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
index 04f48f398..7dc086f9e 100644
--- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
+++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py
@@ -37,7 +37,7 @@
 number_indels=$(bcftools view -R $bed_file -v indels --threads 2 -H {snakemake.input.vcf}| wc -l)
 number_variants=$(bcftools view -R $bed_file --threads 2 -H {snakemake.input.vcf}| wc -l)
 
-TMB=`echo "1000000*($number_variants/$total_exom_length)" | bc -l `
+TMB=$(echo "1000000*($number_variants/$total_exom_length)" | bc -l | xargs printf "%.17g")
 cat << EOF > {snakemake.output.json}
 {{
   "Library_name": "{snakemake.wildcards.tumor_library}",
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
index cc33c658f..7c57a6602 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
@@ -36,14 +36,17 @@ def get_contigs_from_bed_file(bedfile):
 
 
 def get_variant_type(ref, alt):
-    if (len(alt) == 1) and (len(ref) == len(alt)):
-        return "snp"
-    elif len(alt) != len(ref):
-        return "indels"
+    if len(ref) == len(alt):
+        if len(alt) == 1:
+            return "SNV"
+        else:
+            return "ONV"
+    elif len(alt) < len(ref):
+        return "indel"
 
 
-def check_sp_read(variant, minimal, limited):
-    dp = variant.format("AD")[1][1]
+def check_sp_read(variant, pos_sample, minimal, limited):
+    dp = variant.format("AD")[1][pos_sample]
     if dp <= minimal:
         return "minimal"
     elif (dp > minimal) and (dp <= limited):
@@ -72,6 +75,7 @@ def assign_class_snvs(variant, mt_mat):
 
 def process_vcf_file(
     vcf_file,
+    pos_sample=1,
     contigs=[],
     hard_contigs=[],
     minimal=1,
@@ -88,6 +92,7 @@ def process_vcf_file(
         "v_outside_exom": 0,
         "v_in_hard_regions": 0,
         "n_snps": 0,
+        "n_onvs": 0,
         "n_indels": 0,
         "minimal_rp_snvs_exom": 0,
         "minimal_rp_snvs_nexom": 0,
@@ -114,19 +119,25 @@ def process_vcf_file(
                     variant.CHROM, variant.start, variant.end, bed_file, padding
                 ):
                     infor["v_inside_exom"] += 1
-                    infor[str(check_sp_read(variant, minimal, limited)) + "_rp_snvs_exom"] += 1
+                    infor[
+                        str(check_sp_read(variant, pos_sample, minimal, limited)) + "_rp_snvs_exom"
+                    ] += 1
                 else:
                     infor["v_outside_exom"] += 1
-                    infor[str(check_sp_read(variant, minimal, limited)) + "_rp_snvs_nexom"] += 1
+                    infor[
+                        str(check_sp_read(variant, pos_sample, minimal, limited)) + "_rp_snvs_nexom"
+                    ] += 1
 
                 # Need to check multi allelic. Users shouldn't input multi allelic vcf file.
-                if get_variant_type(variant.REF, variant.ALT[0]) == "snp":
+                if get_variant_type(variant.REF, variant.ALT[0]) == "SNV":
                     infor["n_snps"] += 1
                     infor["mt_classes"] = assign_class_snvs(variant, infor["mt_classes"])
-                else:
+                elif get_variant_type(variant.REF, variant.ALT[0]) == "indel":
                     # More for indels
                     infor["n_indels"] += 1
                     infor["indels_length"].append(abs(len(variant.REF) - len(variant.ALT[0])))
+                elif get_variant_type(variant.REF, variant.ALT[0]) == "ONV":
+                    infor["n_onvs"] += 1
             # Gathering information of variants in comparison to hard mapped regions
             else:
                 infor["v_outside_exom"] += 1
@@ -152,6 +163,10 @@ def process_vcf_file(
 parser.add_argument(
     "--filtered-vcf", help="vcf file containing somatic variants passing all filters", required=True
 )
+parser.add_argument(
+    "--sample",
+    help="Name of tumor sample",
+)
 parser.add_argument("--exom-bedfile", help="exom regions bed file", required=True)
 parser.add_argument("--padding", type=int, help="Size of padding around exom regions", default=0)
 parser.add_argument(
@@ -189,7 +204,7 @@ def main():
     # reading input
     full_vcf = VCF(args.rawvcf)
     filter_vcf = VCF(args.filtered_vcf)
-    sample = filter_vcf.samples[1]
+    pos_sample = filter_vcf.samples.index(args.sample)
     ######################
     bed_intervals = tabix.open(args.exom_bedfile)
     contigs = get_contigs_from_bed_file(args.exom_bedfile)
@@ -201,6 +216,7 @@ def main():
         hard_contigs = get_contigs_from_bed_file(path_hard_regions)
         infor = process_vcf_file(
             filter_vcf,
+            pos_sample,
             contigs,
             hard_contigs,
             args.minimal,
@@ -214,6 +230,7 @@ def main():
     else:
         infor = process_vcf_file(
             filter_vcf,
+            pos_sample,
             contigs,
             [],
             args.minimal,
@@ -226,13 +243,14 @@ def main():
         )
 
     summary = {
-        "sample": sample,
+        "sample": args.sample,
         "total_variants_number": full_v_count,
         "number_variants_passing_filters": infor["total_v_count"],
         "number_variants_in_enrichment_regions": infor["v_inside_exom"],
         "number_variants_outside_enrichment_regions": infor["v_outside_exom"],
         "number_variants_in_masked_regions": infor["v_in_hard_regions"],
         "number_snvs": infor["n_snps"],
+        "number_onvs": infor["n_onvs"],
         "number_indels": infor["n_indels"],
         "number_variants_in_enriched_with_minimal_support": infor["minimal_rp_snvs_exom"],
         "number_variants_outside_enriched_with_minimal_support": infor["minimal_rp_snvs_nexom"],
@@ -250,7 +268,7 @@ def main():
         "VAF": infor["vaf"],
     }
     if not args.ignore_regions:
-        del summary["number_snvs_in_hard_regions"]
+        del summary["number_variants_in_masked_regions"]
     json_object = json.dumps(summary)
     if args.output:
         with open(args.output, "w") as outfile:
diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
index 6279dba4a..909fe6ac1 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py
@@ -24,6 +24,7 @@
 
 python {base_dir}/summarize-vcf.py --rawvcf {snakemake.input.full_vcf} \
     --filtered-vcf {snakemake.input.passed_vcf} \
+    --sample {snakemake.wildcards.tumor_library} \
     --exom-bedfile {snakemake.config[step_config][somatic_variant_checking][target_regions]} \
     --padding {snakemake.config[step_config][somatic_variant_checking][padding]} \
     --minimal {snakemake.config[step_config][somatic_variant_checking][minimal_support_read]} \

From 4750e75900eab7f066b5dba156138a57cc727253 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Thu, 27 Jul 2023 14:32:25 +0200
Subject: [PATCH 14/17] update mantis enviroment

---
 snappy_wrappers/wrappers/mantis/run/environment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/snappy_wrappers/wrappers/mantis/run/environment.yaml b/snappy_wrappers/wrappers/mantis/run/environment.yaml
index 8426cdf70..062fb89c3 100644
--- a/snappy_wrappers/wrappers/mantis/run/environment.yaml
+++ b/snappy_wrappers/wrappers/mantis/run/environment.yaml
@@ -3,5 +3,5 @@ channels:
 - anaconda
 - bioconda
 dependencies:
-- numpy ==1.6.2
+- numpy ==1.7.2
 - pysam ==0.8.3

From e9cb75b3cea118e761039cc394cb078b21b341a5 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Wed, 2 Aug 2023 12:02:57 +0200
Subject: [PATCH 15/17] finializing somatic_variant_checking

---
 .../summarize-vcf.py                          | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
index 7c57a6602..f3a257c51 100644
--- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
+++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py
@@ -36,13 +36,17 @@ def get_contigs_from_bed_file(bedfile):
 
 
 def get_variant_type(ref, alt):
+    ref = ref.replace("-", "")
+    alt = alt.replace("-", "")
+    variant_type = "unknown"
     if len(ref) == len(alt):
-        if len(alt) == 1:
-            return "SNV"
+        if len(ref) == 1:
+            variant_type = "snv"
         else:
-            return "ONV"
-    elif len(alt) < len(ref):
-        return "indel"
+            variant_type = "onv"
+    else:
+        variant_type = "indel"
+    return variant_type
 
 
 def check_sp_read(variant, pos_sample, minimal, limited):
@@ -70,7 +74,6 @@ def assign_class_snvs(variant, mt_mat):
         mt_mat[4] += 1
     elif temp in ["G>T", "C>A"]:
         mt_mat[5] += 1
-    return mt_mat
 
 
 def process_vcf_file(
@@ -129,14 +132,14 @@ def process_vcf_file(
                     ] += 1
 
                 # Need to check multi allelic. Users shouldn't input multi allelic vcf file.
-                if get_variant_type(variant.REF, variant.ALT[0]) == "SNV":
+                if get_variant_type(variant.REF, variant.ALT[0]) == "snv":
                     infor["n_snps"] += 1
-                    infor["mt_classes"] = assign_class_snvs(variant, infor["mt_classes"])
+                    assign_class_snvs(variant, infor["mt_classes"])
                 elif get_variant_type(variant.REF, variant.ALT[0]) == "indel":
                     # More for indels
                     infor["n_indels"] += 1
                     infor["indels_length"].append(abs(len(variant.REF) - len(variant.ALT[0])))
-                elif get_variant_type(variant.REF, variant.ALT[0]) == "ONV":
+                elif get_variant_type(variant.REF, variant.ALT[0]) == "onv":
                     infor["n_onvs"] += 1
             # Gathering information of variants in comparison to hard mapped regions
             else:

From e9dbc5e52111daba1a873e410d6b1af68e4bd174 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Thu, 24 Aug 2023 09:42:58 +0200
Subject: [PATCH 16/17] fixing vep environment

---
 snappy_wrappers/wrappers/vep/run/environment.yaml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/snappy_wrappers/wrappers/vep/run/environment.yaml b/snappy_wrappers/wrappers/vep/run/environment.yaml
index 48f171edf..eeb0710b4 100644
--- a/snappy_wrappers/wrappers/vep/run/environment.yaml
+++ b/snappy_wrappers/wrappers/vep/run/environment.yaml
@@ -1,8 +1,4 @@
 channels:
-    - conda-forge
     - bioconda
 dependencies:
-    - python
     - ensembl-vep=102
-    - htslib
-

From fdbdfff537b2d585d464219be4b7d8cf352f7ba1 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Mon, 20 Nov 2023 10:24:26 +0100
Subject: [PATCH 17/17] make isort happy

---
 snappy_pipeline/workflows/tumor_mutational_burden/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
index 6eb901d8d..e18d6cfd1 100644
--- a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
+++ b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py
@@ -12,7 +12,6 @@
     ANNOTATION_TOOLS,
     SomaticVariantAnnotationWorkflow,
 )
-
 from snappy_pipeline.workflows.somatic_variant_calling import (
     SOMATIC_VARIANT_CALLERS_MATCHED,
     SomaticVariantCallingWorkflow,