From 72717b80642ac4a2ffc3d1afd5aa438539376bec Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Thu, 4 Apr 2024 11:07:11 +0200
Subject: [PATCH 01/23] Adding somatic neoepitope preparation workflow

---
 .../somatic_neoepitope_prediction/Snakefile   |  81 ++++
 .../somatic_neoepitope_prediction/__init__.py | 313 +++++++++++++-
 .../wrappers/pvactools/combining/comb_rna.py  | 392 ++++++++++++++++++
 .../pvactools/combining/environment.yaml      |  10 +
 .../wrappers/pvactools/combining/wrapper.py   |  85 ++++
 .../wrappers/pvactools/environment.yaml       |   0
 .../wrappers/pvactools/pvacseq/wrapper.py     |   0
 .../wrappers/stringtie/environment.yaml       |   5 +
 snappy_wrappers/wrappers/stringtie/wrapper.py |  47 +++
 ...workflows_somatic_neoepitope_prediction.py | 218 ++++++++++
 10 files changed, 1149 insertions(+), 2 deletions(-)
 create mode 100644 snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
 create mode 100644 snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
 create mode 100644 snappy_wrappers/wrappers/pvactools/combining/environment.yaml
 create mode 100644 snappy_wrappers/wrappers/pvactools/combining/wrapper.py
 create mode 100644 snappy_wrappers/wrappers/pvactools/environment.yaml
 create mode 100644 snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py
 create mode 100644 snappy_wrappers/wrappers/stringtie/environment.yaml
 create mode 100644 snappy_wrappers/wrappers/stringtie/wrapper.py
 create mode 100644 tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
new file mode 100644
index 000000000..0d1870dd9
--- /dev/null
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+"""CUBI Pipeline tumor mutational burden step Snakefile"""
+
+import os
+
+from snappy_pipeline import expand_ref
+from snappy_pipeline.workflows.somatic_neoepitope_prediction import (
+    SomaticNeoepitopePredictionWorkflow,
+)
+
+__author__ = "Pham Gia Cuong"
+
+
+# Configuration ===============================================================
+
+
+configfile: "config.yaml"
+
+
+# Expand "$ref" JSON pointers in configuration (also works for YAML)
+config, lookup_paths, config_paths = expand_ref("config.yaml", config)
+
+# WorkflowImpl Object Setup ===================================================
+wf = SomaticNeoepitopePredictionWorkflow(
+    workflow, config, lookup_paths, config_paths, os.getcwd()
+)
+
+
+localrules:
+    # Linking files from work/ to output/ should be done locally
+    neoepitope_preparation_link_out_run,
+
+
+rule all:
+    input:
+        wf.get_result_files(),
+
+
+# Generic linking out ---------------------------------------------------------
+
+rule neoepitope_preparation_link_out_run:
+    input:
+        wf.get_input_files("link_out", "run"),
+    output:
+        wf.get_output_files("link_out", "run"),
+    run:
+        shell(wf.get_shell_cmd("link_out", "run", wildcards))
+
+# rule rna_pileup:
+#     input:
+#         **wf.get_input_files("rna_pileup","run")
+#     output:
+#         **wf.get_output_files("rna_pileup","run")
+#     log:
+#         **wf.get_log_file("rna_pileup", "run"),
+#     threads:
+#         wf.get_resource("rna_pileup", "run", "threads"),
+#     resources:
+#         time=wf.get_resource("rna_pileup", "run", "time"),
+#         memory=wf.get_resource("rna_pileup", "run", "memory"),
+#         partition=wf.get_resource("rna_pileup", "run", "partition"),
+#         tmpdir=wf.get_resource("rna_pileup", "run", "tmpdir"),
+#     wrapper:
+#         wf.wrapper_path("pvactools/mpileup")
+
+
+rule neoepitope_preparation:
+    input:
+        **wf.get_input_files("neoepitope_preparation", "run"),
+    output:
+        **wf.get_output_files("neoepitope_preparation", "run"),
+    log:
+        **wf.get_log_file("neoepitope_preparation", "run"),
+    threads:wf.get_resource("neoepitope_preparation", "run", "threads"),
+    resources:
+        time=wf.get_resource("neoepitope_preparation", "run", "time"),
+        memory=wf.get_resource("neoepitope_preparation", "run", "memory"),
+        partition=wf.get_resource("neoepitope_preparation", "run", "partition"),
+        tmpdir=wf.get_resource("neoepitope_preparation", "run", "tmpdir"),
+    wrapper:
+        wf.wrapper_path("pvactools/combining")
diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
index 7486c2e22..69afb23e9 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
@@ -31,11 +31,320 @@
 
 """
 
-__author__ = "Manuel Holtgrewe <manuel.holtgrewe@bih-charite.de>"
+from collections import OrderedDict
+import os
+import sys
+
+from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
+from snakemake.io import expand
+
+from snappy_pipeline.utils import dictify, listify
+from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart
+from snappy_pipeline.workflows.ngs_mapping import (
+    READ_MAPPERS_RNA,
+    NgsMappingWorkflow,
+    ResourceUsage
+)
+from snappy_pipeline.workflows.somatic_variant_calling import (
+    SOMATIC_VARIANT_CALLERS_MATCHED,
+    SomaticVariantCallingWorkflow,
+)
+from snappy_pipeline.workflows.somatic_variant_annotation import (
+    ANNOTATION_TOOLS,
+    SomaticVariantAnnotationWorkflow
+)
+
+
+__author__ = "Pham Gia Cuong"
+__email__ = "pham.gia-cuong@bih-charite.de"
+
+#: Extensions of files to create as main payload
+EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5")
 
 #: Default configuration for the somatic_gene_fusion_calling step
 DEFAULT_CONFIG = r"""
 step_config:
   somatic_neoepitope_prediction:
-    path_somatic_variant_calling: REQUIRED  # REQUIRED
+    path_somatic_variant_annotation: ../somatic_variant_annotation  # REQUIRED
+    path_rna_ngs_mapping: ../ngs_mapping
+    tools_somatic_variant_annotation: []
+    tools_rna_mapping: [] # deafult to those configured for ngs_mapping
+    tools_ngs_mapping: [] # deafult to those configured for ngs_mapping
+    tools_somatic_variant_calling: [] # deafult to those configured for somatic_variant_calling
+    max_depth: "4000"
+    preparation:
+      format: 'star' # REQUIRED - The file format of the expression file to process. (stringtie,kallisto,cufflinks,custom)
+      # Use `custom` to process file formats not explicitly supported.
+      # The `custom` option requires the use of the --id-column and --expression-column arguments.
+      path_features: ''
+      mode: 'gene'  # REQUIRED
+      id-column: '' # The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format.
+      expression-column: 'fr' # REQUIRED
+                              # The column header in the expression_file for
+                              # the column containing expression data.
+                              # Required when using the `custom` and `star` format. For `star`, there are
+                              # only 3 options [unstranded,rf,fr]
+      ignore-ensembl-id-version: True #Ignore the ensemble id version
 """
+
+
+class NeoepitopePreparationStepPart(BaseStepPart):
+    """
+    Preparation VCF file for pvactool
+    """
+    #: Step name
+    name = "neoepitope_preparation"
+    actions = ("run",)
+
+    def __init__(self, parent):
+        super().__init__(parent)
+        # Build shortcut from cancer bio sample name to matched cancer sample
+        self.tumor_ngs_library_to_sample_pair = OrderedDict()
+        for sheet in self.parent.shortcut_sheets:
+            self.tumor_ngs_library_to_sample_pair.update(
+                sheet.all_sample_pairs_by_tumor_dna_ngs_library
+            )
+        # Build mapping from donor name to donor.
+        self.donors = OrderedDict()
+        for sheet in self.parent.shortcut_sheets:
+            for donor in sheet.donors:
+                self.donors[donor.name] = donor
+
+    @dictify
+    def get_input_files(self, action):
+        """Return path to somatic variant annotated file"""
+        # It only works with vep now.
+        # Validate action
+        self._validate_action(action)
+        tpl = (
+            "output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/"
+            "{mapper}.{var_caller}.{anno_caller}.{tumor_library}"
+        )
+        #Need to change for work on many different tools
+        key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"}
+        variant_annotation = self.parent.sub_workflows["somatic_variant_annotation"]
+        ngs_mapping = self.parent.sub_workflows['ngs_mapping']
+        for key, ext in key_ext.items():
+            yield key, variant_annotation(tpl + ext)
+        #Getting appropriate rna library
+        for sheet in filter(is_not_background, self.parent.sheets):
+            for donor in sheet.bio_entities.values():
+                for biosample in donor.bio_samples.values():
+                    if biosample.extra_infos["isTumor"]:
+                        for test_sample in biosample.test_samples.values():
+                            if test_sample.extra_infos["extractionType"] == "RNA":
+                                for lib in test_sample.ngs_libraries.values():
+                                    rna_library = lib.name
+                                    rna_tpl = (
+                                        "output/{mapper}.{library_name}/out/{mapper}.{library_name}"
+                                    ).format(
+                                        mapper="star",
+                                        library_name=rna_library,
+                                    )
+                                    ext = {"expression","bam","bai"}
+                                    yield 'expression',ngs_mapping(rna_tpl + ".GeneCounts.tab")
+                                    yield 'bam',ngs_mapping(rna_tpl + ".bam")
+                                    yield 'bai',ngs_mapping(rna_tpl + ".bam.bai")
+
+
+    @dictify
+    def get_output_files(self, action):
+        """Return output files """
+        # Validate action
+        # Need to add step for adding RAD also
+        self._validate_action(action)
+        if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene":
+            prefix = (
+                "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/"
+                "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
+            )
+        elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript":
+            prefix = (
+                "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/out/"
+                "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}"
+            )
+        key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"}
+        for key, ext in key_ext.items():
+            yield key, prefix + ext
+            yield key + "_md5", prefix + ext + ".md5"
+
+    @dictify
+    def _get_log_file(self, action):
+        """Return mapping of log files."""
+        # Validate action
+        self._validate_action(action)
+        if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene":
+            prefix = (
+                "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/"
+                "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
+            )
+        elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript":
+            prefix = (
+                "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/"
+                "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}"
+            )
+        key_ext = (
+            ("log", ".log"),
+            ("conda_info", ".conda_info.txt"),
+            ("conda_list", ".conda_list.txt"),
+        )
+        for key, ext in key_ext:
+            yield key, prefix + ext
+
+    def get_resource_usage(self, action):
+        self._validate_action(action)
+        mem_mb = 6 * 1024  # 6GB
+        return ResourceUsage(
+            threads=2,
+            time="1:00:00",  # 1 hour
+            memory=f"{mem_mb}M",
+        )
+
+    @listify
+    def get_result_files(self):
+        callers = set(
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_somatic_variant_calling"]
+        )
+        anno_callers = set(
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "tools_somatic_variant_annotation"
+            ]
+        )
+        if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene":
+            name_pattern = "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library.name}"
+        elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript":
+            name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}"
+
+        yield from self._yield_result_files_matched(
+            os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
+            mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"],
+            var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
+            anno_caller=anno_callers,
+            ext=EXT_VALUES
+        )
+        yield from self._yield_result_files_matched(
+            os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
+            mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"],
+            var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
+            anno_caller=anno_callers,
+            ext=(
+                ".log",
+                ".log.md5",
+                ".conda_info.txt",
+                ".conda_info.txt.md5",
+                ".conda_list.txt",
+                ".conda_list.txt.md5",
+            ),
+        )
+
+    def _yield_result_files_matched(self, tpl, **kwargs):
+        """Build output paths from path template and extension list.
+
+        This function returns the results from the matched somatic variant callers such as
+        Mutect.
+        """
+        for sheet in filter(is_not_background, self.parent.shortcut_sheets):
+            for sample_pair in sheet.all_sample_pairs:
+                if (
+                    not sample_pair.tumor_sample.dna_ngs_library
+                    or not sample_pair.normal_sample.dna_ngs_library
+                    or not sample_pair.tumor_sample.rna_ngs_library
+                ):
+                    msg = (
+                        "INFO: sample pair for cancer bio sample {} has is missing primary"
+                        "normal or primary cancer NGS library"
+                    )
+                    print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
+                    continue
+                yield from expand(
+                    tpl,
+                    tumor_library=[sample_pair.tumor_sample.dna_ngs_library],
+                    **kwargs,
+                )
+
+
+
+class SomaticNeoepitopePredictionWorkflow(BaseStep):
+    """Perform neoepitope prediction workflow"""
+
+    name = "neoepitope_prediction"
+    sheet_shortcut_class = CancerCaseSheet
+    sheet_shortcut_kwargs = {
+        "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
+    }
+
+    @classmethod
+    def default_config_yaml(cls):
+        """Return default config YAML, to be overwritten by project-specific one."""
+        return DEFAULT_CONFIG
+
+    def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir):
+        super().__init__(
+            workflow,
+            config,
+            config_lookup_paths,
+            config_paths,
+            workdir,
+            (SomaticVariantCallingWorkflow, SomaticVariantAnnotationWorkflow, NgsMappingWorkflow),
+        )
+        # Register sub step classes so the sub steps are available
+        self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart))
+        self.register_sub_workflow(
+            "somatic_variant_annotation",
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_somatic_variant_annotation"],
+        )
+        self.register_sub_workflow(
+            "ngs_mapping",
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"]
+        )
+        # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "tools_ngs_mapping"
+            ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
+            "tools_somatic_variant_calling"
+        ]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "tools_somatic_variant_calling"
+            ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"]
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
+            "tools_somatic_variant_annotation"
+        ]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "tools_somatic_variant_annotation"
+            ] = ["vep"]
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
+            "tools_rna_mapping"
+        ]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+            "tools_rna_mapping"
+            ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"]
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
+            "path_features"
+        ]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
+            "path_features"
+        ] = self.w_config["static_data_config"]["features"]["path"]
+
+    def get_result_files(self):
+        for sub_step in self.sub_steps.values():
+            if sub_step.name not in (LinkOutStepPart.name,):
+                yield from sub_step.get_result_files()
+
+    def check_config(self):
+        """Check that the path to the NGS mapping is present"""
+        self.ensure_w_config(
+            ("step_config", "somatic_neoepitope_prediction", "path_somatic_variant_annotation"),
+            "Path to variant (directory of vcf files) not configured but required for somatic neoepitope prediction",
+        )
+
+        self.ensure_w_config(
+            ("step_config", "somatic_neoepitope_prediction", "preparation","mode"),
+            "The mode is required for adding gene expression data to somatic variant annotated file",
+        )
+
+        self.ensure_w_config(
+            ("step_config", "somatic_neoepitope_prediction", "preparation","format"),
+            "Format is required for adding ",
+        )
diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
new file mode 100644
index 000000000..e941e12d2
--- /dev/null
+++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
@@ -0,0 +1,392 @@
+import argparse
+import sys
+import vcfpy
+import pyranges as pr, pandas as pd
+import numpy as np
+import re
+from collections import OrderedDict
+import logging
+
+########################## Reading vcf files ##########################
+def create_vcf_reader(args):
+    vcf_reader = vcfpy.Reader.from_path(args.input_vcf)
+    if args.rna_vcf :
+        rna_vcf_reader = vcfpy.Reader.from_path(args.rna_vcf)
+    else : rna_vcf_reader=None
+    #expression part
+    is_multi_sample = len(vcf_reader.header.samples.names) > 1
+    if is_multi_sample and args.sample_name is None:
+        vcf_reader.close()
+        raise Exception("ERROR: VCF {} contains more than one sample. Please use the -s option to specify which sample to annotate.".format(args.input_vcf))
+    elif is_multi_sample and args.sample_name not in vcf_reader.header.samples.names:
+        vcf_reader.close()
+        raise Exception("ERROR: VCF {} does not contain a sample column for sample {}.".format(args.input_vcf, args.sample_name))
+    if 'CSQ' not in vcf_reader.header.info_ids():
+        vcf_reader.close()
+        raise Exception("ERROR: VCF {} is not VEP-annotated. Please annotate the VCF with VEP before running this tool.".format(args.input_vcf))
+    if args.mode == 'gene' and 'GX' in vcf_reader.header.format_ids():
+        vcf_reader.close()
+        raise Exception("ERROR: VCF {} is already gene expression annotated. GX format header already exists.".format(args.input_vcf))
+    elif args.mode == 'transcript' and 'TX' in vcf_reader.header.format_ids():
+        vcf_reader.close()
+        raise Exception("ERROR: VCF {} is already transcript expression annotated. TX format header already exists.".format(args.input_vcf))
+    #snv part
+
+    return vcf_reader, rna_vcf_reader, is_multi_sample
+########################## Adding extra fields into Format ##########################
+def create_vcf_writer(args, vcf_reader):
+    (head, sep, tail) = args.input_vcf.rpartition('.vcf')
+    new_header = vcf_reader.header.copy()
+    if args.mode == 'gene':
+        new_header.add_format_line(vcfpy.OrderedDict([('ID', 'GX'), ('Number', '.'), ('Type', 'String'), ('Description', 'Gene Expressions')]))
+        output_file = ('').join([head, '.gx.vcf', tail])
+    elif args.mode == 'transcript':
+        new_header.add_format_line(vcfpy.OrderedDict([('ID', 'TX'), ('Number', '.'), ('Type', 'String'), ('Description', 'Transcript Expressions')]))
+        output_file = ('').join([head, '.tx.vcf', tail])
+    if (args.rna_vcf) :
+        new_header.add_format_line(vcfpy.OrderedDict([
+        ('ID','RAD'),('Number', '.'), ('Type', 'String'),('Description', 'Allelic depths from mRNA sequencing data of the same sample')
+        ]))
+        new_header.add_format_line(vcfpy.OrderedDict([
+        ('ID','ROT'),('Number', '.'), ('Type', 'String'),('Description', 'Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample')
+        ]))
+        (head,sep,tail) = output_file.rpartition('.vcf')
+        output_file = ('').join([head, '.pileup.vcf', tail])
+    if args.output_vcf:
+        output_file = args.output_vcf
+    return vcfpy.Writer.from_path(output_file, new_header)
+
+########################## Auto assign gene id column based on different tools ##########################
+def resolve_id_column(args):
+    if args.format == 'cufflinks':
+        return 'tracking_id'
+    elif args.format == 'kallisto':
+        if args.mode == 'gene':
+            return 'gene'
+        elif args.mode == 'transcript':
+            return 'target_id'
+    elif args.format == 'stringtie':
+        if args.mode == 'gene':
+            return 'Gene ID'
+    elif args.format == 'star':
+        return "gene_id"                  # NEED change
+    elif args.format == 'custom':
+        if args.id_column is None:
+            raise Exception("ERROR: `--id-column` option is required when using the `custom` format")
+        else:
+            return args.id_column
+########################## Auto assign expression column based on different tools ##########################
+def resolve_expression_column(args):
+    if args.format == 'cufflinks':
+        return 'FPKM'
+    elif args.format == 'kallisto':
+        if args.mode == 'gene':
+            return 'abundance'
+        elif args.mode == 'transcript':
+            return 'tpm'
+    elif args.format == 'stringtie':
+        return 'TPM'
+    elif args.format == 'star':
+        if args.expression_column is None:
+            raise Exception("ERROR: `--expression-column` option is required when using the `star` format")
+        else:
+            return 'TPM_' + args.expression_column
+    elif (args.format == 'custom'):
+        if args.expression_column is None:
+            raise Exception("ERROR: `--expression-column` option is required when using the `custom` format")
+        else:
+            return args.expression_column
+
+def to_array(dictionary):
+    array = []
+    for key, value in dictionary.items():
+        array.append("{}|{}".format(key, value))
+    return sorted(array)
+
+def resolve_stringtie_id_column(args, headers):
+    if 'gene_id' in headers:
+        return 'gene_id'
+    else:
+        return 'transcript_id'
+
+#Star TPM calculation
+#If there is genecount file, GENECODE file is required to calculate TPM
+##########################Star TPM Calculation##########################
+def genecount_reader(genecount_path):
+    col_name=["gene_id","unstranded","rf","fr"]
+    star_genecount = pd.read_csv(genecount_path,
+                            skiprows=4,
+                            sep="\t",
+                            names=col_name,
+                            header=None,
+                            ).sort_values(by=["gene_id"])
+    #star_genecount["Ensembl_gene_identifier"] = star_genecount.Ensembl_gene_identifier.map(lambda x: x.split(".")[0])
+    star_genecount.unstranded=star_genecount.unstranded.astype(int,copy=None)
+    star_genecount.rf=star_genecount.rf.astype(int,copy=None)
+    star_genecount.fr=star_genecount.fr.astype(int,copy=None)
+    #star_genecount.set_index("gene_id")
+    return star_genecount
+
+def genecode_reader(genecode_path,star_genecount,expression_column):
+    gc = pr.read_gtf(genecode_path,as_df=True)
+    gc = gc[gc["gene_id"].isin(star_genecount["gene_id"])]
+    gc = gc[(gc.Feature=="gene")]
+    exon = gc[["Chromosome","Start","End","gene_id","gene_name"]]
+    # Convert columns to proper types.
+    exon.Start = exon.Start.astype(int)
+    exon.End = exon.End.astype(int)
+    # Sort in place.
+    exon.sort_values(by=['Chromosome','Start','End'], inplace=True)
+    # Group the rows by the Ensembl gene identifier (with version numbers.)
+    groups = exon.groupby('gene_id')
+
+    lengths = groups.apply(count_bp)
+    # Create a new DataFrame with gene lengths and EnsemblID.
+    ensembl_no_version = lengths.index
+    ldf = pd.DataFrame({'length': lengths.values,
+                        'gene_id': ensembl_no_version})
+    star_tpm = pd.merge(ldf,star_genecount, how='inner',on='gene_id')
+    star_tpm["TPM_" + expression_column]= calculate_TPM(star_tpm,expression_column)
+    return star_tpm
+
+def calculate_TPM(df,strand):
+    effLen = (df["length"])/1000
+    rate = df[strand]/effLen
+    denom = sum(rate)/(10**6)
+    return rate/denom
+
+def count_bp(df):
+    """Given a DataFrame with the exon coordinates from Gencode for a single
+    gene, return the total number of coding bases in that gene.
+    Example:
+        >>> import numpy as np
+        >>> n = 3
+        >>> r = lambda x: np.random.sample(x) * 10
+        >>> d = pd.DataFrame([np.sort([a,b]) for a,b in zip(r(n), r(n))], columns=['start','end']).astype(int)
+        >>> d
+           start  end
+        0      6    9
+        1      3    4
+        2      4    9
+        >>> count_bp(d)
+        7
+    Here is a visual representation of the 3 exons and the way they are added:
+          123456789  Length
+        0      ----       4
+        1   --            2
+        2    ------       6
+            =======       7
+    """
+    start = df.Start.min()
+    end = df.End.max()
+    bp = [False] * (end - start + 1)
+    for i in range(df.shape[0]):
+        s = df.iloc[i]['Start'] - start
+        e = df.iloc[i]['End'] - start + 1
+        bp[s:e] = [True] * (e - s)
+    return sum(bp)
+
+##########################Passing expression file##########################
+def parse_expression_file(args, vcf_reader, vcf_writer):
+    if args.format == 'stringtie' and args.mode == 'transcript':
+        df_all = pr.read_gtf(args.expression_file)
+        df = df_all[df_all["feature"] == "transcript"]
+        id_column = resolve_stringtie_id_column(args, df.columns.values)
+    elif args.format == "star" :
+        id_column = resolve_id_column(args)
+        star_genecount = genecount_reader(args.expression_file)
+        df = genecode_reader(args.genecode,star_genecount,args.expression_column)
+    else:
+        id_column = resolve_id_column(args)
+        df = pd.read_csv(args.expression_file, sep='\t')
+    if args.ignore_ensembl_id_version:
+        df['transcript_without_version'] = df[id_column].apply(lambda x: re.sub(r'\.[0-9]+$', '', x))
+    expression_column = resolve_expression_column(args)
+    if expression_column not in df.columns.values:
+        vcf_reader.close()
+        vcf_writer.close()
+        raise Exception("ERROR: expression_column header {} does not exist in expression_file {}".format(expression_column, args.expression_file))
+    if id_column not in df.columns.values:
+        vcf_reader.close()
+        vcf_writer.close()
+        raise Exception("ERROR: id_column header {} does not exist in expression_file {}".format(id_column, args.expression_file))
+    return df, id_column, expression_column
+
+##########################Adding RNA AD ##########################
+def add_AD_rna_file(entry,is_multi_sample, sample_name,rna_vcf):
+    RAD_temp=[]
+    ROT = []
+    for rna_record in rna_vcf.fetch(entry.CHROM,entry.affected_start,entry.affected_end):
+        rna_alt= [alt.value for alt in rna_record.ALT]
+        rna_AD = [c.data.get('AD') for c in rna_record.calls][0]
+        ROT_temp=sum(rna_AD) - rna_AD[0]
+        if not rna_alt:
+            continue
+        else:
+            rna_AD = [c.data.get('AD') for c in rna_record.calls][0]
+            for dna_alt in entry.ALT:
+                if dna_alt.value in rna_alt:
+                    index=rna_record.ALT.index(dna_alt)
+                    RAD_temp.append(rna_AD[0])   #AD of REF
+                    RAD_temp.append(rna_AD[index+1]) #AD of ALT
+                    ROT_temp = ROT_temp - rna_AD[index+1]
+        ROT.append(ROT_temp)
+        if is_multi_sample:
+            entry.FORMAT += ['RAD']
+            entry.call_for_sample[sample_name].data['RAD'] = RAD_temp
+            entry.FORMAT += ['ROT']
+            entry.call_for_sample[sample_name].data['ROT'] = ROT
+        else:
+            entry.add_format('RAD', RAD_temp)
+            entry.add_format('ROT', ROT)
+    return entry
+
+##########################Adding Expression file ##########################
+def add_expressions(entry, is_multi_sample, sample_name, df, items, tag, id_column, expression_column, ignore_ensembl_id_version, missing_expressions_count, entry_count):
+    expressions = {}
+    for item in items:
+        entry_count += 1
+        if ignore_ensembl_id_version:
+            subset = df.loc[df['transcript_without_version'] == re.sub(r'\.[0-9]+$', '', item)]
+        else:
+            subset = df.loc[df[id_column] == item]
+        if len(subset) > 0:
+            expressions[item] = subset[expression_column].sum()
+        else:
+            missing_expressions_count += 1
+    if is_multi_sample:
+        entry.FORMAT += [tag]
+        entry.call_for_sample[sample_name].data[tag] = to_array(expressions)
+    else:
+        entry.add_format(tag, to_array(expressions))
+    return (entry, missing_expressions_count, entry_count)
+
+########################## Define tool parameters ##########################
+def define_parser():
+    parser = argparse.ArgumentParser(
+        "comb_rna.py",
+        description = "A tool that will add the data from several expression tools' output files" +
+                      "and allelic depths from mRNA sequencing data for snvs" +
+                      "to the VCF INFO column. Supported tools are StringTie, Kallisto, Star" +
+                      "and Cufflinks. There also is a ``custom`` option to annotate with data " +
+                      "from any tab-delimited file."
+    )
+
+    parser.add_argument(
+        "--input_vcf",
+        help="A VEP-annotated VCF file"
+    )
+    parser.add_argument(
+        "--expression_file",
+        help="A TSV file containing expression estimates"
+    )
+    parser.add_argument(
+        "--genecode",
+        help="A genecode file for calculate TPM from star gene count"
+    )
+    parser.add_argument(
+        "--rna-vcf",
+        help="A VCF file of RNA-seq data",
+    )
+    parser.add_argument(
+        "--format",
+        choices=['kallisto','stringtie','cufflinks','star','custom'],
+        help="The file format of the expression file to process. "
+            +"Use `custom` to process file formats not explicitly supported. "
+            +"The `custom` option requires the use of the --id-column and --expression-column arguments."
+    )
+    parser.add_argument(
+        "--mode",
+        choices=['gene', 'transcript'],
+        help="The type of expression data in the expression_file"
+    )
+    parser.add_argument(
+        "-i", "--id-column",
+        help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format."
+    )
+    parser.add_argument(
+        "-e", "--expression-column",
+        help="The column header in the expression_file for the column containing expression data. Required when using the `custom` and `star` format."
+    )
+    parser.add_argument(
+        "-s", "--sample-name",
+        help="If the input_vcf contains multiple samples, the name of the sample to annotate."
+    )
+    parser.add_argument(
+        "-o", "--output-vcf",
+        help="Path to write the output VCF file. If not provided, the output VCF file will be "
+            +"written next to the input VCF file with a .tx.vcf or .gx.vcf file ending."
+    )
+    parser.add_argument(
+        "--ignore-ensembl-id-version",
+        help='Assumes that the final period and number denotes the Ensembl ID version and ignores it (i.e. for "ENST00001234.3" - ignores the ".3").',
+        action="store_true"
+    )
+
+    return parser
+
+def main(args_input = sys.argv[1:]):
+    parser = define_parser()
+    args = parser.parse_args(args_input)
+    CONTIGS = ['chr{}'.format(i) for i in range(1, 23)] + ['chrX', 'chrY']
+    if args.format == 'custom':
+        if args.id_column is None:
+            raise Exception("--id-column is not set. This is required when using the `custom` format.")
+        if args.expression_column is None:
+            raise Exception("--expression-column is not set. This is required when using the `custom` format.")
+    if args.format == 'star':
+        if args.genecode is None:
+            raise Exception("--genecode is not set. This is required when using the `star` format")
+        exp_col = ["unstranded","rf","fr"]
+        if args.expression_column not in exp_col:
+            raise Exception("""
+                            --expression-column is not found. Please choose :
+                            unstranded - Assumes a unstranded library.
+                            rf - Assumes a stranded library fr-firststrand.
+                            fr - Assumes a stranded library fr-secondstrand.
+                            """)
+    (vcf_reader,rna_vcf_reader, is_multi_sample) = create_vcf_reader(args)
+    format_pattern = re.compile('Format: (.*)')
+    csq_format = format_pattern.search(vcf_reader.header.get_info_field_info('CSQ').description).group(1).split('|')
+
+    vcf_writer = create_vcf_writer(args, vcf_reader)
+    (df, id_column, expression_column) = parse_expression_file(args, vcf_reader, vcf_writer)
+    missing_expressions_count = 0
+    entry_count = 0
+    for entry in vcf_reader:
+        #Add expression data
+        transcript_ids = set()
+        genes = set()
+        if 'CSQ' not in entry.INFO:
+            logging.warning("Variant is missing VEP annotation. INFO column doesn't contain CSQ field for variant {}".format(entry))
+            vcf_writer.write_record(entry)
+            continue
+        for transcript in entry.INFO['CSQ']:
+            for key, value in zip(csq_format, transcript.split('|')):
+                if key == 'Feature' and value != '' and not value.startswith('ENSR'):
+                    transcript_ids.add(value)
+                if key == 'Gene' and value != '':
+                    genes.add(value)
+
+        if args.mode == 'gene':
+            genes = list(genes)
+            if len(genes) > 0:
+                (entry, missing_expressions_count, entry_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, genes, 'GX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count, entry_count)
+        elif args.mode == 'transcript':
+            transcript_ids = list(transcript_ids)
+            if len(transcript_ids) > 0:
+                (entry, missing_expressions_count, entry_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, transcript_ids, 'TX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count, entry_count)
+        #Add RAD and ROT
+        if  rna_vcf_reader is not None:
+            if (entry.CHROM in CONTIGS) and (entry.is_snv()):
+                entry = add_AD_rna_file(entry, is_multi_sample, args.sample_name,rna_vcf_reader)
+        vcf_writer.write_record(entry)
+
+    vcf_reader.close()
+    vcf_writer.close()
+
+    if missing_expressions_count > 0:
+        logging.warning("{} of {} {}s did not have an expression entry for their {} id.".format(missing_expressions_count, entry_count, args.mode, args.mode))
+
+if __name__ == '__main__':
+    main()
diff --git a/snappy_wrappers/wrappers/pvactools/combining/environment.yaml b/snappy_wrappers/wrappers/pvactools/combining/environment.yaml
new file mode 100644
index 000000000..2ea441ab8
--- /dev/null
+++ b/snappy_wrappers/wrappers/pvactools/combining/environment.yaml
@@ -0,0 +1,10 @@
+channels:
+    - conda-forge
+    - bioconda
+dependencies:
+    - vcfpy
+    - numpy
+    - pyranges
+    - pandas
+    - python
+    - bcftools
diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
new file mode 100644
index 000000000..1dd23a267
--- /dev/null
+++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+"""
+Wrapper for preparation the vcf file for somatic neoepitope prediction
+"""
+
+import os
+from snakemake.shell import shell
+
+__author__ = "Pham Gia Cuong"
+__email__ = "pham.gia-cuong@bih-charite.de"
+
+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]
+
+exp_format = config['preparation']['format']
+preparation_vcf = os.path.join(
+    os.path.dirname(__file__),
+    "comb_rna.py",
+)
+ensemble_id = (
+    "--ignore-ensembl-id-version"
+    if config['preparation']['ignore-ensembl-id-version']
+    else ""
+)
+
+shell(
+    r"""
+# -----------------------------------------------------------------------------
+# Redirect stderr to log file by default and enable printing executed commands
+# Also pipe stderr to log file
+if [[ -n "{snakemake.log.log}" ]]; then
+    if [[ "$(set +e; tty; set -e)" != "" ]]; then
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        exec 2> >(tee -a "{snakemake.log.log}" >&2)
+    else
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        echo "No tty, logging disabled" >"{snakemake.log.log}"
+    fi
+fi
+# -----------------------------------------------------------------------------
+# Create auto-cleaned temporary directory
+export TMPDIR=$(mktemp -d)
+trap "rm -rf $TMPDIR" EXIT
+
+set -x
+# -----------------------------------------------------------------------------
+
+# Write out information about conda installation
+conda list > {snakemake.log.conda_list}
+conda info > {snakemake.log.conda_info}
+
+# Getting region of SNVs only
+bcftools filter -i 'TYPE="snp"' {snakemake.input.vcf} | bcftools query -f '%CHROM\t%POS0\t%END\n' > $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed
+
+# Getting snvs from RNA sequencing data
+bcftools mpileup -Ov \
+    --annotate FORMAT/AD,FORMAT/DP \
+    -R $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed \
+    -f {snakemake.config[static_data_config][reference][path]} \
+    --per-sample-mF \
+    --max-depth 4000 \
+    --redo-BAQ -Oz \
+    -o $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \
+    {snakemake.input.bam}
+tabix -f $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz
+
+#Need to add option for RNA vcf as well
+python3 -W ignore {preparation_vcf} \
+    --genecode {snakemake.config[static_data_config][features][path]} \
+    --input_vcf {snakemake.input.vcf} --format {config[preparation][format]} \
+    --rna-vcf $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \
+    --expression_file {snakemake.input.expression} \
+    --mode {config[preparation][mode]} \
+    -e {config[preparation][expression-column]} \
+    -s {snakemake.wildcards.tumor_library} \
+    -o /dev/stdout \
+    "{ensemble_id}" \
+| bgzip -c > {snakemake.output.vcf}
+tabix -f {snakemake.output.vcf}
+
+pushd $(dirname {snakemake.output.vcf})
+md5sum $(basename {snakemake.output.vcf}) >$(basename {snakemake.output.vcf}).md5
+md5sum $(basename {snakemake.output.vcf_tbi}) >$(basename {snakemake.output.vcf_tbi}).md5
+"""
+)
diff --git a/snappy_wrappers/wrappers/pvactools/environment.yaml b/snappy_wrappers/wrappers/pvactools/environment.yaml
new file mode 100644
index 000000000..e69de29bb
diff --git a/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/snappy_wrappers/wrappers/stringtie/environment.yaml b/snappy_wrappers/wrappers/stringtie/environment.yaml
new file mode 100644
index 000000000..b79f83c81
--- /dev/null
+++ b/snappy_wrappers/wrappers/stringtie/environment.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - stringtie==2.2.1
\ No newline at end of file
diff --git a/snappy_wrappers/wrappers/stringtie/wrapper.py b/snappy_wrappers/wrappers/stringtie/wrapper.py
new file mode 100644
index 000000000..f71436713
--- /dev/null
+++ b/snappy_wrappers/wrappers/stringtie/wrapper.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+"""
+Wrapper for quantify gene and transcript expression
+"""
+
+from snakemake.shell import shell
+
+__author__ = "Pham Gia Cuong"
+__email__ = "pham.gia-cuong@bih-charite.de"
+
+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]
+
+shell(
+    r"""
+# -----------------------------------------------------------------------------
+# Redirect stderr to log file by default and enable printing executed commands
+exec 2> >(tee -a "{snakemake.log.log}")
+set -x
+# -----------------------------------------------------------------------------
+
+# Write out information about conda installation
+conda list > {snakemake.log.conda_list}
+conda info > {snakemake.log.conda_info}
+
+
+stringtie \
+    -G {snakemake.config[static_data_config][features][path]} \
+    -p 4 \
+    -v \
+    -o {snakemake.output.expression} \
+    {snakemake.input.rna_bam}
+
+pushd $(dirname {snakemake.output.expression})
+md5sum $(basename {snakemake.output.expression}) > $(basename {snakemake.output.expression_md5})
+"""
+)
+
+
+# Compute MD5 sums of logs
+shell(
+    r"""
+md5sum {snakemake.log.log} > {snakemake.log.log_md5}
+md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5}
+md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5}
+"""
+)
\ No newline at end of file
diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
new file mode 100644
index 000000000..eac6138d7
--- /dev/null
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
@@ -0,0 +1,218 @@
+# -*- coding: utf-8 -*-
+"""Tests for the somatic_variant_calling workflow module code"""
+
+import textwrap
+
+import pytest
+import ruamel.yaml as ruamel_yaml
+
+from snappy_pipeline.workflows.somatic_neoepitope_prediction import (
+    SomaticNeoepitopePredictionWorkflow,
+)
+
+from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict
+from .conftest import patch_module_fs
+
+
+# Test tumor mutational burden calculation with vcf file from somatic variant calling step
+@pytest.fixture(scope="module")  # otherwise: performance issues
+def minimal_config():
+    """Return YAML parsing result for configuration"""
+    yaml = ruamel_yaml.YAML()
+    return yaml.load(
+        textwrap.dedent(
+            r"""
+        static_data_config:
+          reference:
+            path: /path/to/ref.fa
+          cosmic:
+            path: /path/to/cosmic.vcf.gz
+          dbsnp:
+            path: /path/to/dbsnp.vcf.gz
+          features:
+            path: /path/to/genecode.gtf
+
+        step_config:
+          ngs_mapping:
+            tools:
+              dna: ['bwa']
+              rna: ['star']
+            star:
+              path_index: /path/to/index
+              path_features:
+            bwa:
+              path_index: /path/to/bwa/index.fa
+
+          somatic_variant_calling:
+            tools:
+            - mutect2
+            - scalpel
+            scalpel:
+              path_target_regions: /path/to/target/regions.bed
+
+          somatic_variant_annotation:
+            tools: ["vep"]
+            vep:
+              path_dir_cache: /path/to/dir/cache
+
+          somatic_neoepitope_prediction:
+            path_somatic_variant_annotation: ../somatic_variant_annotation
+            path_rna_ngs_mapping: ../ngs_mapping
+            tools_somatic_variant_annotation: []
+            tools_rna_mapping: []
+            tools_ngs_mapping: []
+            tools_somatic_variant_calling: []
+            max_depth: "4000"
+            preparation:
+                format: 'star'
+                path_features: ''
+                mode: 'gene'
+                id-column: ''
+                expression-column: 'fr'
+                ignore-ensembl-id-version: True
+
+        data_sets:
+          first_batch:
+            file: sheet.tsv
+            search_patterns:
+            - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'}
+            search_paths: ['/path']
+            type: matched_cancer
+            naming_scheme: only_secondary_id
+        """
+        ).lstrip()
+    )
+
+@pytest.fixture
+def somatic_neoepitope_prediction_workflow(
+    dummy_workflow,
+    minimal_config,
+    config_lookup_paths,
+    work_dir,
+    config_paths,
+    cancer_sheet_fake_fs,
+    mocker,
+):
+    # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker)
+    # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we
+    # can obtain paths from the function as if we really had a NGSMappingPipelineStep there
+    dummy_workflow.globals = {
+        "ngs_mapping": lambda x: "NGS_MAPPING/" + x,
+        "somatic_variant_annotation": lambda x: "SOMATIC_VARIANT_ANNOTATION/" + x,
+    }
+    # Construct the workflow object
+    return SomaticNeoepitopePredictionWorkflow(
+        dummy_workflow,
+        minimal_config,
+        config_lookup_paths,
+        config_paths,
+        work_dir,
+    )
+
+def test_neoepitope_preparation_step_part_get_input_files(somatic_neoepitope_prediction_workflow):
+    # Define expected
+    vcf_base_out = (
+        "SOMATIC_VARIANT_ANNOTATION/output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/"
+        "{mapper}.{var_caller}.{anno_caller}.{tumor_library}"
+    )
+    ngs_mapping_base_out = "NGS_MAPPING/output/star.P002-T2-RNA1-mRNA_seq1/out/"
+    expected = {
+        "vcf": vcf_base_out + ".full.vcf.gz",
+        "vcf_tbi": vcf_base_out + ".full.vcf.gz.tbi",
+        "expression": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.GeneCounts.tab",
+        "bam": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.bam",
+        "bai": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.bam.bai",
+    }
+
+    # Get actual
+    actual = somatic_neoepitope_prediction_workflow.get_input_files("neoepitope_preparation", "run")
+    assert actual == expected
+
+def test_neoepitope_preparation_step_part_get_output_files(somatic_neoepitope_prediction_workflow):
+    base_out = (
+                "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/"
+                "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
+            )
+    expected = get_expected_output_vcf_files_dict(base_out)
+    actual = somatic_neoepitope_prediction_workflow.get_output_files("neoepitope_preparation", "run")
+    assert actual == expected
+
+def test_neoepitope_preparation_step_part_get_log_files(somatic_neoepitope_prediction_workflow):
+    base_out = (
+        "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/"
+                "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
+    )
+    expected = get_expected_log_files_dict(base_out=base_out)
+    actual = somatic_neoepitope_prediction_workflow.get_log_file("neoepitope_preparation", "run")
+    assert actual == expected
+
+def test_neoepitope_preparation_step_part_get_resource_usage(somatic_neoepitope_prediction_workflow):
+    # Define expected
+    expected_dict = {"threads": 2, "time": "1:00:00", "memory": "6144M"}
+    # Evaluate
+    for resource, expected in expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}'."
+        actual = somatic_neoepitope_prediction_workflow.get_resource("neoepitope_preparation", "run", resource)
+        assert actual == expected, msg_error
+
+def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_workflow):
+    """Test simple functionality of the workflow"""
+    # Check created sub steps
+    expected = ["link_out", "neoepitope_preparation"]
+    actual = list(sorted(somatic_neoepitope_prediction_workflow.sub_steps.keys()))
+    assert actual == expected
+
+    # Check result file construction
+    tpl = (
+        "output/{mapper}.{var_caller}.{anno_caller}.GX.P00{i}-T{t}-DNA1-WGS1/{dir_}/"
+        "{mapper}.{var_caller}.{anno_caller}.GX.P00{i}-T{t}-DNA1-WGS1.{ext}"
+    )
+    expected = [
+        tpl.format(
+            mapper=mapper,
+            var_caller=var_caller,
+            anno_caller=anno_caller,
+            i=i,
+            t=t,
+            ext=ext,
+            dir_="out",
+        )
+        for i,t in ((1, 1), (2, 2))
+        for ext in (
+            "vcf.gz",
+            "vcf.gz.tbi",
+            "vcf.gz.md5",
+            "vcf.gz.tbi.md5"
+        )
+        for mapper in("bwa",)
+        for var_caller in ("mutect2", "scalpel")
+        for anno_caller in ("vep",)
+    ]
+
+    expected += [
+        tpl.format(
+            mapper=mapper,
+            var_caller=var_caller,
+            anno_caller=anno_caller,
+            i=i,
+            t=t,
+            ext=ext,
+            dir_="log",
+        )
+        for i,t in ((1, 1), (2, 2))
+        for ext in (
+            "conda_info.txt",
+            "conda_list.txt",
+            "log",
+            "conda_info.txt.md5",
+            "conda_list.txt.md5",
+            "log.md5",
+        )
+        for mapper in("bwa",)
+        for var_caller in ("mutect2", "scalpel")
+        for anno_caller in ("vep",)
+    ]
+    expected = list(sorted(expected))
+    actual = list(sorted(somatic_neoepitope_prediction_workflow.get_result_files()))
+    assert expected == actual

From 4eab17a80b7dfedf26ffe013d811a6dc25d60228 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Thu, 4 Apr 2024 13:20:58 +0200
Subject: [PATCH 02/23] Satisfying code format

---
 .../somatic_neoepitope_prediction/Snakefile   |  20 +-
 .../somatic_neoepitope_prediction/__init__.py | 135 +++--
 .../wrappers/pvactools/combining/comb_rna.py  | 521 ++++++++++++------
 .../wrappers/pvactools/combining/wrapper.py   |   4 +-
 4 files changed, 438 insertions(+), 242 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
index 0d1870dd9..0fcffcce3 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
@@ -38,6 +38,7 @@ rule all:
 
 # Generic linking out ---------------------------------------------------------
 
+
 rule neoepitope_preparation_link_out_run:
     input:
         wf.get_input_files("link_out", "run"),
@@ -46,23 +47,6 @@ rule neoepitope_preparation_link_out_run:
     run:
         shell(wf.get_shell_cmd("link_out", "run", wildcards))
 
-# rule rna_pileup:
-#     input:
-#         **wf.get_input_files("rna_pileup","run")
-#     output:
-#         **wf.get_output_files("rna_pileup","run")
-#     log:
-#         **wf.get_log_file("rna_pileup", "run"),
-#     threads:
-#         wf.get_resource("rna_pileup", "run", "threads"),
-#     resources:
-#         time=wf.get_resource("rna_pileup", "run", "time"),
-#         memory=wf.get_resource("rna_pileup", "run", "memory"),
-#         partition=wf.get_resource("rna_pileup", "run", "partition"),
-#         tmpdir=wf.get_resource("rna_pileup", "run", "tmpdir"),
-#     wrapper:
-#         wf.wrapper_path("pvactools/mpileup")
-
 
 rule neoepitope_preparation:
     input:
@@ -71,7 +55,7 @@ rule neoepitope_preparation:
         **wf.get_output_files("neoepitope_preparation", "run"),
     log:
         **wf.get_log_file("neoepitope_preparation", "run"),
-    threads:wf.get_resource("neoepitope_preparation", "run", "threads"),
+    threads: wf.get_resource("neoepitope_preparation", "run", "threads")
     resources:
         time=wf.get_resource("neoepitope_preparation", "run", "time"),
         memory=wf.get_resource("neoepitope_preparation", "run", "memory"),
diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
index 69afb23e9..6366c5577 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
@@ -35,23 +35,25 @@
 import os
 import sys
 
-from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
+from biomedsheets.shortcuts import (
+    CancerCaseSheet,
+    CancerCaseSheetOptions,
+    is_not_background,
+)
 from snakemake.io import expand
 
 from snappy_pipeline.utils import dictify, listify
 from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart
 from snappy_pipeline.workflows.ngs_mapping import (
-    READ_MAPPERS_RNA,
     NgsMappingWorkflow,
-    ResourceUsage
+    ResourceUsage,
 )
 from snappy_pipeline.workflows.somatic_variant_calling import (
     SOMATIC_VARIANT_CALLERS_MATCHED,
     SomaticVariantCallingWorkflow,
 )
 from snappy_pipeline.workflows.somatic_variant_annotation import (
-    ANNOTATION_TOOLS,
-    SomaticVariantAnnotationWorkflow
+    SomaticVariantAnnotationWorkflow,
 )
 
 
@@ -92,6 +94,7 @@ class NeoepitopePreparationStepPart(BaseStepPart):
     """
     Preparation VCF file for pvactool
     """
+
     #: Step name
     name = "neoepitope_preparation"
     actions = ("run",)
@@ -120,13 +123,13 @@ def get_input_files(self, action):
             "output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/"
             "{mapper}.{var_caller}.{anno_caller}.{tumor_library}"
         )
-        #Need to change for work on many different tools
+        # Need to change for work on many different tools
         key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"}
         variant_annotation = self.parent.sub_workflows["somatic_variant_annotation"]
-        ngs_mapping = self.parent.sub_workflows['ngs_mapping']
+        ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
         for key, ext in key_ext.items():
             yield key, variant_annotation(tpl + ext)
-        #Getting appropriate rna library
+        # Getting appropriate rna library
         for sheet in filter(is_not_background, self.parent.sheets):
             for donor in sheet.bio_entities.values():
                 for biosample in donor.bio_samples.values():
@@ -141,24 +144,35 @@ def get_input_files(self, action):
                                         mapper="star",
                                         library_name=rna_library,
                                     )
-                                    ext = {"expression","bam","bai"}
-                                    yield 'expression',ngs_mapping(rna_tpl + ".GeneCounts.tab")
-                                    yield 'bam',ngs_mapping(rna_tpl + ".bam")
-                                    yield 'bai',ngs_mapping(rna_tpl + ".bam.bai")
-
+                                    ext = {"expression", "bam", "bai"}
+                                    yield "expression", ngs_mapping(
+                                        rna_tpl + ".GeneCounts.tab"
+                                    )
+                                    yield "bam", ngs_mapping(rna_tpl + ".bam")
+                                    yield "bai", ngs_mapping(rna_tpl + ".bam.bai")
 
     @dictify
     def get_output_files(self, action):
-        """Return output files """
+        """Return output files"""
         # Validate action
         # Need to add step for adding RAD also
         self._validate_action(action)
-        if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene":
+        if (
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "preparation"
+            ]["mode"]
+            == "gene"
+        ):
             prefix = (
                 "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/"
                 "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
             )
-        elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript":
+        elif (
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "preparation"
+            ]["mode"]
+            == "transcript"
+        ):
             prefix = (
                 "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/out/"
                 "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}"
@@ -173,12 +187,22 @@ def _get_log_file(self, action):
         """Return mapping of log files."""
         # Validate action
         self._validate_action(action)
-        if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene":
+        if (
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "preparation"
+            ]["mode"]
+            == "gene"
+        ):
             prefix = (
                 "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/"
                 "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
             )
-        elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript":
+        elif (
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "preparation"
+            ]["mode"]
+            == "transcript"
+        ):
             prefix = (
                 "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/"
                 "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}"
@@ -203,28 +227,44 @@ def get_resource_usage(self, action):
     @listify
     def get_result_files(self):
         callers = set(
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_somatic_variant_calling"]
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "tools_somatic_variant_calling"
+            ]
         )
         anno_callers = set(
             self.w_config["step_config"]["somatic_neoepitope_prediction"][
                 "tools_somatic_variant_annotation"
             ]
         )
-        if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene":
+        if (
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "preparation"
+            ]["mode"]
+            == "gene"
+        ):
             name_pattern = "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library.name}"
-        elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript":
+        elif (
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "preparation"
+            ]["mode"]
+            == "transcript"
+        ):
             name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}"
 
         yield from self._yield_result_files_matched(
             os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
-            mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"],
+            mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "tools_ngs_mapping"
+            ],
             var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
             anno_caller=anno_callers,
-            ext=EXT_VALUES
+            ext=EXT_VALUES,
         )
         yield from self._yield_result_files_matched(
             os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
-            mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"],
+            mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "tools_ngs_mapping"
+            ],
             var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
             anno_caller=anno_callers,
             ext=(
@@ -263,14 +303,15 @@ def _yield_result_files_matched(self, tpl, **kwargs):
                 )
 
 
-
 class SomaticNeoepitopePredictionWorkflow(BaseStep):
     """Perform neoepitope prediction workflow"""
 
     name = "neoepitope_prediction"
     sheet_shortcut_class = CancerCaseSheet
     sheet_shortcut_kwargs = {
-        "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
+        "options": CancerCaseSheetOptions(
+            allow_missing_normal=True, allow_missing_tumor=True
+        )
     }
 
     @classmethod
@@ -285,20 +326,30 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
             config_lookup_paths,
             config_paths,
             workdir,
-            (SomaticVariantCallingWorkflow, SomaticVariantAnnotationWorkflow, NgsMappingWorkflow),
+            (
+                SomaticVariantCallingWorkflow,
+                SomaticVariantAnnotationWorkflow,
+                NgsMappingWorkflow,
+            ),
         )
         # Register sub step classes so the sub steps are available
         self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart))
         self.register_sub_workflow(
             "somatic_variant_annotation",
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_somatic_variant_annotation"],
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "path_somatic_variant_annotation"
+            ],
         )
         self.register_sub_workflow(
             "ngs_mapping",
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"]
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "path_rna_ngs_mapping"
+            ],
         )
         # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]:
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
+            "tools_ngs_mapping"
+        ]:
             self.w_config["step_config"]["somatic_neoepitope_prediction"][
                 "tools_ngs_mapping"
             ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]
@@ -318,14 +369,14 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
             "tools_rna_mapping"
         ]:
             self.w_config["step_config"]["somatic_neoepitope_prediction"][
-            "tools_rna_mapping"
+                "tools_rna_mapping"
             ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"]
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
-            "path_features"
-        ]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
-            "path_features"
-        ] = self.w_config["static_data_config"]["features"]["path"]
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
+            "preparation"
+        ]["path_features"]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "preparation"
+            ]["path_features"] = self.w_config["static_data_config"]["features"]["path"]
 
     def get_result_files(self):
         for sub_step in self.sub_steps.values():
@@ -335,16 +386,20 @@ def get_result_files(self):
     def check_config(self):
         """Check that the path to the NGS mapping is present"""
         self.ensure_w_config(
-            ("step_config", "somatic_neoepitope_prediction", "path_somatic_variant_annotation"),
+            (
+                "step_config",
+                "somatic_neoepitope_prediction",
+                "path_somatic_variant_annotation",
+            ),
             "Path to variant (directory of vcf files) not configured but required for somatic neoepitope prediction",
         )
 
         self.ensure_w_config(
-            ("step_config", "somatic_neoepitope_prediction", "preparation","mode"),
+            ("step_config", "somatic_neoepitope_prediction", "preparation", "mode"),
             "The mode is required for adding gene expression data to somatic variant annotated file",
         )
 
         self.ensure_w_config(
-            ("step_config", "somatic_neoepitope_prediction", "preparation","format"),
+            ("step_config", "somatic_neoepitope_prediction", "preparation", "format"),
             "Format is required for adding ",
         )
diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
index e941e12d2..ef40f0151 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
@@ -1,159 +1,232 @@
 import argparse
 import sys
 import vcfpy
-import pyranges as pr, pandas as pd
-import numpy as np
+import pyranges as pr
+import pandas as pd
 import re
-from collections import OrderedDict
 import logging
 
+
 ########################## Reading vcf files ##########################
 def create_vcf_reader(args):
     vcf_reader = vcfpy.Reader.from_path(args.input_vcf)
-    if args.rna_vcf :
+    if args.rna_vcf:
         rna_vcf_reader = vcfpy.Reader.from_path(args.rna_vcf)
-    else : rna_vcf_reader=None
-    #expression part
+    else:
+        rna_vcf_reader = None
+    # expression part
     is_multi_sample = len(vcf_reader.header.samples.names) > 1
     if is_multi_sample and args.sample_name is None:
         vcf_reader.close()
-        raise Exception("ERROR: VCF {} contains more than one sample. Please use the -s option to specify which sample to annotate.".format(args.input_vcf))
+        raise Exception(
+            "ERROR: VCF {} contains more than one sample. Please use the -s option to specify which sample to annotate.".format(
+                args.input_vcf
+            )
+        )
     elif is_multi_sample and args.sample_name not in vcf_reader.header.samples.names:
         vcf_reader.close()
-        raise Exception("ERROR: VCF {} does not contain a sample column for sample {}.".format(args.input_vcf, args.sample_name))
-    if 'CSQ' not in vcf_reader.header.info_ids():
+        raise Exception(
+            "ERROR: VCF {} does not contain a sample column for sample {}.".format(
+                args.input_vcf, args.sample_name
+            )
+        )
+    if "CSQ" not in vcf_reader.header.info_ids():
         vcf_reader.close()
-        raise Exception("ERROR: VCF {} is not VEP-annotated. Please annotate the VCF with VEP before running this tool.".format(args.input_vcf))
-    if args.mode == 'gene' and 'GX' in vcf_reader.header.format_ids():
+        raise Exception(
+            "ERROR: VCF {} is not VEP-annotated. Please annotate the VCF with VEP before running this tool.".format(
+                args.input_vcf
+            )
+        )
+    if args.mode == "gene" and "GX" in vcf_reader.header.format_ids():
         vcf_reader.close()
-        raise Exception("ERROR: VCF {} is already gene expression annotated. GX format header already exists.".format(args.input_vcf))
-    elif args.mode == 'transcript' and 'TX' in vcf_reader.header.format_ids():
+        raise Exception(
+            "ERROR: VCF {} is already gene expression annotated. GX format header already exists.".format(
+                args.input_vcf
+            )
+        )
+    elif args.mode == "transcript" and "TX" in vcf_reader.header.format_ids():
         vcf_reader.close()
-        raise Exception("ERROR: VCF {} is already transcript expression annotated. TX format header already exists.".format(args.input_vcf))
-    #snv part
+        raise Exception(
+            "ERROR: VCF {} is already transcript expression annotated. TX format header already exists.".format(
+                args.input_vcf
+            )
+        )
+    # snv part
 
     return vcf_reader, rna_vcf_reader, is_multi_sample
+
+
 ########################## Adding extra fields into Format ##########################
 def create_vcf_writer(args, vcf_reader):
-    (head, sep, tail) = args.input_vcf.rpartition('.vcf')
+    (head, sep, tail) = args.input_vcf.rpartition(".vcf")
     new_header = vcf_reader.header.copy()
-    if args.mode == 'gene':
-        new_header.add_format_line(vcfpy.OrderedDict([('ID', 'GX'), ('Number', '.'), ('Type', 'String'), ('Description', 'Gene Expressions')]))
-        output_file = ('').join([head, '.gx.vcf', tail])
-    elif args.mode == 'transcript':
-        new_header.add_format_line(vcfpy.OrderedDict([('ID', 'TX'), ('Number', '.'), ('Type', 'String'), ('Description', 'Transcript Expressions')]))
-        output_file = ('').join([head, '.tx.vcf', tail])
-    if (args.rna_vcf) :
-        new_header.add_format_line(vcfpy.OrderedDict([
-        ('ID','RAD'),('Number', '.'), ('Type', 'String'),('Description', 'Allelic depths from mRNA sequencing data of the same sample')
-        ]))
-        new_header.add_format_line(vcfpy.OrderedDict([
-        ('ID','ROT'),('Number', '.'), ('Type', 'String'),('Description', 'Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample')
-        ]))
-        (head,sep,tail) = output_file.rpartition('.vcf')
-        output_file = ('').join([head, '.pileup.vcf', tail])
+    if args.mode == "gene":
+        new_header.add_format_line(
+            vcfpy.OrderedDict(
+                [
+                    ("ID", "GX"),
+                    ("Number", "."),
+                    ("Type", "String"),
+                    ("Description", "Gene Expressions"),
+                ]
+            )
+        )
+        output_file = ("").join([head, ".gx.vcf", tail])
+    elif args.mode == "transcript":
+        new_header.add_format_line(
+            vcfpy.OrderedDict(
+                [
+                    ("ID", "TX"),
+                    ("Number", "."),
+                    ("Type", "String"),
+                    ("Description", "Transcript Expressions"),
+                ]
+            )
+        )
+        output_file = ("").join([head, ".tx.vcf", tail])
+    if args.rna_vcf:
+        new_header.add_format_line(
+            vcfpy.OrderedDict(
+                [
+                    ("ID", "RAD"),
+                    ("Number", "."),
+                    ("Type", "String"),
+                    (
+                        "Description",
+                        "Allelic depths from mRNA sequencing data of the same sample",
+                    ),
+                ]
+            )
+        )
+        new_header.add_format_line(
+            vcfpy.OrderedDict(
+                [
+                    ("ID", "ROT"),
+                    ("Number", "."),
+                    ("Type", "String"),
+                    (
+                        "Description",
+                        "Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample",
+                    ),
+                ]
+            )
+        )
+        (head, sep, tail) = output_file.rpartition(".vcf")
+        output_file = ("").join([head, ".pileup.vcf", tail])
     if args.output_vcf:
         output_file = args.output_vcf
     return vcfpy.Writer.from_path(output_file, new_header)
 
+
 ########################## Auto assign gene id column based on different tools ##########################
 def resolve_id_column(args):
-    if args.format == 'cufflinks':
-        return 'tracking_id'
-    elif args.format == 'kallisto':
-        if args.mode == 'gene':
-            return 'gene'
-        elif args.mode == 'transcript':
-            return 'target_id'
-    elif args.format == 'stringtie':
-        if args.mode == 'gene':
-            return 'Gene ID'
-    elif args.format == 'star':
-        return "gene_id"                  # NEED change
-    elif args.format == 'custom':
+    if args.format == "cufflinks":
+        return "tracking_id"
+    elif args.format == "kallisto":
+        if args.mode == "gene":
+            return "gene"
+        elif args.mode == "transcript":
+            return "target_id"
+    elif args.format == "stringtie":
+        if args.mode == "gene":
+            return "Gene ID"
+    elif args.format == "star":
+        return "gene_id"  # NEED change
+    elif args.format == "custom":
         if args.id_column is None:
-            raise Exception("ERROR: `--id-column` option is required when using the `custom` format")
+            raise Exception(
+                "ERROR: `--id-column` option is required when using the `custom` format"
+            )
         else:
             return args.id_column
+
+
 ########################## Auto assign expression column based on different tools ##########################
 def resolve_expression_column(args):
-    if args.format == 'cufflinks':
-        return 'FPKM'
-    elif args.format == 'kallisto':
-        if args.mode == 'gene':
-            return 'abundance'
-        elif args.mode == 'transcript':
-            return 'tpm'
-    elif args.format == 'stringtie':
-        return 'TPM'
-    elif args.format == 'star':
+    if args.format == "cufflinks":
+        return "FPKM"
+    elif args.format == "kallisto":
+        if args.mode == "gene":
+            return "abundance"
+        elif args.mode == "transcript":
+            return "tpm"
+    elif args.format == "stringtie":
+        return "TPM"
+    elif args.format == "star":
         if args.expression_column is None:
-            raise Exception("ERROR: `--expression-column` option is required when using the `star` format")
+            raise Exception(
+                "ERROR: `--expression-column` option is required when using the `star` format"
+            )
         else:
-            return 'TPM_' + args.expression_column
-    elif (args.format == 'custom'):
+            return "TPM_" + args.expression_column
+    elif args.format == "custom":
         if args.expression_column is None:
-            raise Exception("ERROR: `--expression-column` option is required when using the `custom` format")
+            raise Exception(
+                "ERROR: `--expression-column` option is required when using the `custom` format"
+            )
         else:
             return args.expression_column
 
+
 def to_array(dictionary):
     array = []
     for key, value in dictionary.items():
         array.append("{}|{}".format(key, value))
     return sorted(array)
 
+
 def resolve_stringtie_id_column(args, headers):
-    if 'gene_id' in headers:
-        return 'gene_id'
+    if "gene_id" in headers:
+        return "gene_id"
     else:
-        return 'transcript_id'
+        return "transcript_id"
+
 
-#Star TPM calculation
-#If there is genecount file, GENECODE file is required to calculate TPM
-##########################Star TPM Calculation##########################
 def genecount_reader(genecount_path):
-    col_name=["gene_id","unstranded","rf","fr"]
-    star_genecount = pd.read_csv(genecount_path,
-                            skiprows=4,
-                            sep="\t",
-                            names=col_name,
-                            header=None,
-                            ).sort_values(by=["gene_id"])
-    #star_genecount["Ensembl_gene_identifier"] = star_genecount.Ensembl_gene_identifier.map(lambda x: x.split(".")[0])
-    star_genecount.unstranded=star_genecount.unstranded.astype(int,copy=None)
-    star_genecount.rf=star_genecount.rf.astype(int,copy=None)
-    star_genecount.fr=star_genecount.fr.astype(int,copy=None)
-    #star_genecount.set_index("gene_id")
+    col_name = ["gene_id", "unstranded", "rf", "fr"]
+    star_genecount = pd.read_csv(
+        genecount_path,
+        skiprows=4,
+        sep="\t",
+        names=col_name,
+        header=None,
+    ).sort_values(by=["gene_id"])
+    # star_genecount["Ensembl_gene_identifier"] = star_genecount.Ensembl_gene_identifier.map(lambda x: x.split(".")[0])
+    star_genecount.unstranded = star_genecount.unstranded.astype(int, copy=None)
+    star_genecount.rf = star_genecount.rf.astype(int, copy=None)
+    star_genecount.fr = star_genecount.fr.astype(int, copy=None)
+    # star_genecount.set_index("gene_id")
     return star_genecount
 
-def genecode_reader(genecode_path,star_genecount,expression_column):
-    gc = pr.read_gtf(genecode_path,as_df=True)
+
+def genecode_reader(genecode_path, star_genecount, expression_column):
+    gc = pr.read_gtf(genecode_path, as_df=True)
     gc = gc[gc["gene_id"].isin(star_genecount["gene_id"])]
-    gc = gc[(gc.Feature=="gene")]
-    exon = gc[["Chromosome","Start","End","gene_id","gene_name"]]
+    gc = gc[(gc.Feature == "gene")]
+    exon = gc[["Chromosome", "Start", "End", "gene_id", "gene_name"]]
     # Convert columns to proper types.
     exon.Start = exon.Start.astype(int)
     exon.End = exon.End.astype(int)
     # Sort in place.
-    exon.sort_values(by=['Chromosome','Start','End'], inplace=True)
+    exon.sort_values(by=["Chromosome", "Start", "End"], inplace=True)
     # Group the rows by the Ensembl gene identifier (with version numbers.)
-    groups = exon.groupby('gene_id')
+    groups = exon.groupby("gene_id")
 
     lengths = groups.apply(count_bp)
     # Create a new DataFrame with gene lengths and EnsemblID.
     ensembl_no_version = lengths.index
-    ldf = pd.DataFrame({'length': lengths.values,
-                        'gene_id': ensembl_no_version})
-    star_tpm = pd.merge(ldf,star_genecount, how='inner',on='gene_id')
-    star_tpm["TPM_" + expression_column]= calculate_TPM(star_tpm,expression_column)
+    ldf = pd.DataFrame({"length": lengths.values, "gene_id": ensembl_no_version})
+    star_tpm = pd.merge(ldf, star_genecount, how="inner", on="gene_id")
+    star_tpm["TPM_" + expression_column] = calculate_TPM(star_tpm, expression_column)
     return star_tpm
 
-def calculate_TPM(df,strand):
-    effLen = (df["length"])/1000
-    rate = df[strand]/effLen
-    denom = sum(rate)/(10**6)
-    return rate/denom
+
+def calculate_TPM(df, strand):
+    effLen = (df["length"]) / 1000
+    rate = df[strand] / effLen
+    denom = sum(rate) / (10**6)
+    return rate / denom
+
 
 def count_bp(df):
     """Given a DataFrame with the exon coordinates from Gencode for a single
@@ -181,73 +254,99 @@ def count_bp(df):
     end = df.End.max()
     bp = [False] * (end - start + 1)
     for i in range(df.shape[0]):
-        s = df.iloc[i]['Start'] - start
-        e = df.iloc[i]['End'] - start + 1
+        s = df.iloc[i]["Start"] - start
+        e = df.iloc[i]["End"] - start + 1
         bp[s:e] = [True] * (e - s)
     return sum(bp)
 
-##########################Passing expression file##########################
+
 def parse_expression_file(args, vcf_reader, vcf_writer):
-    if args.format == 'stringtie' and args.mode == 'transcript':
+    if args.format == "stringtie" and args.mode == "transcript":
         df_all = pr.read_gtf(args.expression_file)
         df = df_all[df_all["feature"] == "transcript"]
         id_column = resolve_stringtie_id_column(args, df.columns.values)
-    elif args.format == "star" :
+    elif args.format == "star":
         id_column = resolve_id_column(args)
         star_genecount = genecount_reader(args.expression_file)
-        df = genecode_reader(args.genecode,star_genecount,args.expression_column)
+        df = genecode_reader(args.genecode, star_genecount, args.expression_column)
     else:
         id_column = resolve_id_column(args)
-        df = pd.read_csv(args.expression_file, sep='\t')
+        df = pd.read_csv(args.expression_file, sep="\t")
     if args.ignore_ensembl_id_version:
-        df['transcript_without_version'] = df[id_column].apply(lambda x: re.sub(r'\.[0-9]+$', '', x))
+        df["transcript_without_version"] = df[id_column].apply(
+            lambda x: re.sub(r"\.[0-9]+$", "", x)
+        )
     expression_column = resolve_expression_column(args)
     if expression_column not in df.columns.values:
         vcf_reader.close()
         vcf_writer.close()
-        raise Exception("ERROR: expression_column header {} does not exist in expression_file {}".format(expression_column, args.expression_file))
+        raise Exception(
+            "ERROR: expression_column header {} does not exist in expression_file {}".format(
+                expression_column, args.expression_file
+            )
+        )
     if id_column not in df.columns.values:
         vcf_reader.close()
         vcf_writer.close()
-        raise Exception("ERROR: id_column header {} does not exist in expression_file {}".format(id_column, args.expression_file))
+        raise Exception(
+            "ERROR: id_column header {} does not exist in expression_file {}".format(
+                id_column, args.expression_file
+            )
+        )
     return df, id_column, expression_column
 
-##########################Adding RNA AD ##########################
-def add_AD_rna_file(entry,is_multi_sample, sample_name,rna_vcf):
-    RAD_temp=[]
+
+def add_AD_rna_file(entry, is_multi_sample, sample_name, rna_vcf):
+    RAD_temp = []
     ROT = []
-    for rna_record in rna_vcf.fetch(entry.CHROM,entry.affected_start,entry.affected_end):
-        rna_alt= [alt.value for alt in rna_record.ALT]
-        rna_AD = [c.data.get('AD') for c in rna_record.calls][0]
-        ROT_temp=sum(rna_AD) - rna_AD[0]
+    for rna_record in rna_vcf.fetch(
+        entry.CHROM, entry.affected_start, entry.affected_end
+    ):
+        rna_alt = [alt.value for alt in rna_record.ALT]
+        rna_AD = [c.data.get("AD") for c in rna_record.calls][0]
+        ROT_temp = sum(rna_AD) - rna_AD[0]
         if not rna_alt:
             continue
         else:
-            rna_AD = [c.data.get('AD') for c in rna_record.calls][0]
+            rna_AD = [c.data.get("AD") for c in rna_record.calls][0]
             for dna_alt in entry.ALT:
                 if dna_alt.value in rna_alt:
-                    index=rna_record.ALT.index(dna_alt)
-                    RAD_temp.append(rna_AD[0])   #AD of REF
-                    RAD_temp.append(rna_AD[index+1]) #AD of ALT
-                    ROT_temp = ROT_temp - rna_AD[index+1]
+                    index = rna_record.ALT.index(dna_alt)
+                    RAD_temp.append(rna_AD[0])  # AD of REF
+                    RAD_temp.append(rna_AD[index + 1])  # AD of ALT
+                    ROT_temp = ROT_temp - rna_AD[index + 1]
         ROT.append(ROT_temp)
         if is_multi_sample:
-            entry.FORMAT += ['RAD']
-            entry.call_for_sample[sample_name].data['RAD'] = RAD_temp
-            entry.FORMAT += ['ROT']
-            entry.call_for_sample[sample_name].data['ROT'] = ROT
+            entry.FORMAT += ["RAD"]
+            entry.call_for_sample[sample_name].data["RAD"] = RAD_temp
+            entry.FORMAT += ["ROT"]
+            entry.call_for_sample[sample_name].data["ROT"] = ROT
         else:
-            entry.add_format('RAD', RAD_temp)
-            entry.add_format('ROT', ROT)
+            entry.add_format("RAD", RAD_temp)
+            entry.add_format("ROT", ROT)
     return entry
 
-##########################Adding Expression file ##########################
-def add_expressions(entry, is_multi_sample, sample_name, df, items, tag, id_column, expression_column, ignore_ensembl_id_version, missing_expressions_count, entry_count):
+
+def add_expressions(
+    entry,
+    is_multi_sample,
+    sample_name,
+    df,
+    items,
+    tag,
+    id_column,
+    expression_column,
+    ignore_ensembl_id_version,
+    missing_expressions_count,
+    entry_count,
+):
     expressions = {}
     for item in items:
         entry_count += 1
         if ignore_ensembl_id_version:
-            subset = df.loc[df['transcript_without_version'] == re.sub(r'\.[0-9]+$', '', item)]
+            subset = df.loc[
+                df["transcript_without_version"] == re.sub(r"\.[0-9]+$", "", item)
+            ]
         else:
             subset = df.loc[df[id_column] == item]
         if len(subset) > 0:
@@ -261,28 +360,24 @@ def add_expressions(entry, is_multi_sample, sample_name, df, items, tag, id_colu
         entry.add_format(tag, to_array(expressions))
     return (entry, missing_expressions_count, entry_count)
 
+
 ########################## Define tool parameters ##########################
 def define_parser():
     parser = argparse.ArgumentParser(
         "comb_rna.py",
-        description = "A tool that will add the data from several expression tools' output files" +
-                      "and allelic depths from mRNA sequencing data for snvs" +
-                      "to the VCF INFO column. Supported tools are StringTie, Kallisto, Star" +
-                      "and Cufflinks. There also is a ``custom`` option to annotate with data " +
-                      "from any tab-delimited file."
+        description="A tool that will add the data from several expression tools' output files"
+        + "and allelic depths from mRNA sequencing data for snvs"
+        + "to the VCF INFO column. Supported tools are StringTie, Kallisto, Star"
+        + "and Cufflinks. There also is a ``custom`` option to annotate with data "
+        + "from any tab-delimited file.",
     )
 
+    parser.add_argument("--input_vcf", help="A VEP-annotated VCF file")
     parser.add_argument(
-        "--input_vcf",
-        help="A VEP-annotated VCF file"
-    )
-    parser.add_argument(
-        "--expression_file",
-        help="A TSV file containing expression estimates"
+        "--expression_file", help="A TSV file containing expression estimates"
     )
     parser.add_argument(
-        "--genecode",
-        help="A genecode file for calculate TPM from star gene count"
+        "--genecode", help="A genecode file for calculate TPM from star gene count"
     )
     parser.add_argument(
         "--rna-vcf",
@@ -290,103 +385,165 @@ def define_parser():
     )
     parser.add_argument(
         "--format",
-        choices=['kallisto','stringtie','cufflinks','star','custom'],
+        choices=["kallisto", "stringtie", "cufflinks", "star", "custom"],
         help="The file format of the expression file to process. "
-            +"Use `custom` to process file formats not explicitly supported. "
-            +"The `custom` option requires the use of the --id-column and --expression-column arguments."
+        + "Use `custom` to process file formats not explicitly supported. "
+        + "The `custom` option requires the use of the --id-column and --expression-column arguments.",
     )
     parser.add_argument(
         "--mode",
-        choices=['gene', 'transcript'],
-        help="The type of expression data in the expression_file"
+        choices=["gene", "transcript"],
+        help="The type of expression data in the expression_file",
     )
     parser.add_argument(
-        "-i", "--id-column",
-        help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format."
+        "-i",
+        "--id-column",
+        help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format.",
     )
     parser.add_argument(
-        "-e", "--expression-column",
-        help="The column header in the expression_file for the column containing expression data. Required when using the `custom` and `star` format."
+        "-e",
+        "--expression-column",
+        help="The column header in the expression_file for the column containing expression data. Required when using the `custom` and `star` format.",
     )
     parser.add_argument(
-        "-s", "--sample-name",
-        help="If the input_vcf contains multiple samples, the name of the sample to annotate."
+        "-s",
+        "--sample-name",
+        help="If the input_vcf contains multiple samples, the name of the sample to annotate.",
     )
     parser.add_argument(
-        "-o", "--output-vcf",
+        "-o",
+        "--output-vcf",
         help="Path to write the output VCF file. If not provided, the output VCF file will be "
-            +"written next to the input VCF file with a .tx.vcf or .gx.vcf file ending."
+        + "written next to the input VCF file with a .tx.vcf or .gx.vcf file ending.",
     )
     parser.add_argument(
         "--ignore-ensembl-id-version",
         help='Assumes that the final period and number denotes the Ensembl ID version and ignores it (i.e. for "ENST00001234.3" - ignores the ".3").',
-        action="store_true"
+        action="store_true",
     )
 
     return parser
 
-def main(args_input = sys.argv[1:]):
-    parser = define_parser()
-    args = parser.parse_args(args_input)
-    CONTIGS = ['chr{}'.format(i) for i in range(1, 23)] + ['chrX', 'chrY']
-    if args.format == 'custom':
+
+def args_check(args):
+    if args.format == "custom":
         if args.id_column is None:
-            raise Exception("--id-column is not set. This is required when using the `custom` format.")
+            raise Exception(
+                "--id-column is not set. This is required when using the `custom` format."
+            )
         if args.expression_column is None:
-            raise Exception("--expression-column is not set. This is required when using the `custom` format.")
-    if args.format == 'star':
+            raise Exception(
+                "--expression-column is not set. This is required when using the `custom` format."
+            )
+    if args.format == "star":
         if args.genecode is None:
-            raise Exception("--genecode is not set. This is required when using the `star` format")
-        exp_col = ["unstranded","rf","fr"]
+            raise Exception(
+                "--genecode is not set. This is required when using the `star` format"
+            )
+        exp_col = ["unstranded", "rf", "fr"]
         if args.expression_column not in exp_col:
-            raise Exception("""
+            raise Exception(
+                """
                             --expression-column is not found. Please choose :
                             unstranded - Assumes a unstranded library.
                             rf - Assumes a stranded library fr-firststrand.
                             fr - Assumes a stranded library fr-secondstrand.
-                            """)
-    (vcf_reader,rna_vcf_reader, is_multi_sample) = create_vcf_reader(args)
-    format_pattern = re.compile('Format: (.*)')
-    csq_format = format_pattern.search(vcf_reader.header.get_info_field_info('CSQ').description).group(1).split('|')
+                            """
+            )
+
+
+def adding_extra_information(args):
+    CONTIGS = ["chr{}".format(i) for i in range(1, 23)] + ["chrX", "chrY"]
+    (vcf_reader, rna_vcf_reader, is_multi_sample) = create_vcf_reader(args)
+    format_pattern = re.compile("Format: (.*)")
+    csq_format = (
+        format_pattern.search(vcf_reader.header.get_info_field_info("CSQ").description)
+        .group(1)
+        .split("|")
+    )
 
     vcf_writer = create_vcf_writer(args, vcf_reader)
-    (df, id_column, expression_column) = parse_expression_file(args, vcf_reader, vcf_writer)
+    (df, id_column, expression_column) = parse_expression_file(
+        args, vcf_reader, vcf_writer
+    )
     missing_expressions_count = 0
     entry_count = 0
     for entry in vcf_reader:
-        #Add expression data
+        # Add expression data
         transcript_ids = set()
         genes = set()
-        if 'CSQ' not in entry.INFO:
-            logging.warning("Variant is missing VEP annotation. INFO column doesn't contain CSQ field for variant {}".format(entry))
+        if "CSQ" not in entry.INFO:
+            logging.warning(
+                "Variant is missing VEP annotation. INFO column doesn't contain CSQ field for variant {}".format(
+                    entry
+                )
+            )
             vcf_writer.write_record(entry)
             continue
-        for transcript in entry.INFO['CSQ']:
-            for key, value in zip(csq_format, transcript.split('|')):
-                if key == 'Feature' and value != '' and not value.startswith('ENSR'):
+        for transcript in entry.INFO["CSQ"]:
+            for key, value in zip(csq_format, transcript.split("|")):
+                if key == "Feature" and value != "" and not value.startswith("ENSR"):
                     transcript_ids.add(value)
-                if key == 'Gene' and value != '':
+                if key == "Gene" and value != "":
                     genes.add(value)
 
-        if args.mode == 'gene':
+        if args.mode == "gene":
             genes = list(genes)
             if len(genes) > 0:
-                (entry, missing_expressions_count, entry_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, genes, 'GX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count, entry_count)
-        elif args.mode == 'transcript':
+                (entry, missing_expressions_count, entry_count) = add_expressions(
+                    entry,
+                    is_multi_sample,
+                    args.sample_name,
+                    df,
+                    genes,
+                    "GX",
+                    id_column,
+                    expression_column,
+                    args.ignore_ensembl_id_version,
+                    missing_expressions_count,
+                    entry_count,
+                )
+        elif args.mode == "transcript":
             transcript_ids = list(transcript_ids)
             if len(transcript_ids) > 0:
-                (entry, missing_expressions_count, entry_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, transcript_ids, 'TX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count, entry_count)
-        #Add RAD and ROT
-        if  rna_vcf_reader is not None:
+                (entry, missing_expressions_count, entry_count) = add_expressions(
+                    entry,
+                    is_multi_sample,
+                    args.sample_name,
+                    df,
+                    transcript_ids,
+                    "TX",
+                    id_column,
+                    expression_column,
+                    args.ignore_ensembl_id_version,
+                    missing_expressions_count,
+                    entry_count,
+                )
+        # Add RAD and ROT
+        if rna_vcf_reader is not None:
             if (entry.CHROM in CONTIGS) and (entry.is_snv()):
-                entry = add_AD_rna_file(entry, is_multi_sample, args.sample_name,rna_vcf_reader)
+                entry = add_AD_rna_file(
+                    entry, is_multi_sample, args.sample_name, rna_vcf_reader
+                )
         vcf_writer.write_record(entry)
 
     vcf_reader.close()
     vcf_writer.close()
 
     if missing_expressions_count > 0:
-        logging.warning("{} of {} {}s did not have an expression entry for their {} id.".format(missing_expressions_count, entry_count, args.mode, args.mode))
+        logging.warning(
+            "{} of {} {}s did not have an expression entry for their {} id.".format(
+                missing_expressions_count, entry_count, args.mode, args.mode
+            )
+        )
+
+
+def main(args_input=sys.argv[1:]):
+    parser = define_parser()
+    args = parser.parse_args(args_input)
+    args_check(args)
+    adding_extra_information(args)
+
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
index 1dd23a267..760b63bd6 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
@@ -12,14 +12,14 @@
 step = snakemake.config["pipeline_step"]["name"]
 config = snakemake.config["step_config"][step]
 
-exp_format = config['preparation']['format']
+exp_format = config["preparation"]["format"]
 preparation_vcf = os.path.join(
     os.path.dirname(__file__),
     "comb_rna.py",
 )
 ensemble_id = (
     "--ignore-ensembl-id-version"
-    if config['preparation']['ignore-ensembl-id-version']
+    if config["preparation"]["ignore-ensembl-id-version"]
     else ""
 )
 

From 4e08a50648efc4c6b9dff949cd01ac79c4189f92 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Thu, 4 Apr 2024 13:25:02 +0200
Subject: [PATCH 03/23] Satisfying black

---
 .../somatic_neoepitope_prediction/__init__.py | 56 ++++++-------------
 .../wrappers/pvactools/combining/comb_rna.py  | 28 +++-------
 .../wrappers/pvactools/combining/wrapper.py   |  4 +-
 snappy_wrappers/wrappers/stringtie/wrapper.py |  2 +-
 ...workflows_somatic_neoepitope_prediction.py | 41 ++++++++------
 5 files changed, 50 insertions(+), 81 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
index 6366c5577..af5776df3 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
@@ -145,9 +145,7 @@ def get_input_files(self, action):
                                         library_name=rna_library,
                                     )
                                     ext = {"expression", "bam", "bai"}
-                                    yield "expression", ngs_mapping(
-                                        rna_tpl + ".GeneCounts.tab"
-                                    )
+                                    yield "expression", ngs_mapping(rna_tpl + ".GeneCounts.tab")
                                     yield "bam", ngs_mapping(rna_tpl + ".bam")
                                     yield "bai", ngs_mapping(rna_tpl + ".bam.bai")
 
@@ -158,9 +156,7 @@ def get_output_files(self, action):
         # Need to add step for adding RAD also
         self._validate_action(action)
         if (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "preparation"
-            ]["mode"]
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
             == "gene"
         ):
             prefix = (
@@ -168,9 +164,7 @@ def get_output_files(self, action):
                 "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
             )
         elif (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "preparation"
-            ]["mode"]
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
             == "transcript"
         ):
             prefix = (
@@ -188,9 +182,7 @@ def _get_log_file(self, action):
         # Validate action
         self._validate_action(action)
         if (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "preparation"
-            ]["mode"]
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
             == "gene"
         ):
             prefix = (
@@ -198,9 +190,7 @@ def _get_log_file(self, action):
                 "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
             )
         elif (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "preparation"
-            ]["mode"]
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
             == "transcript"
         ):
             prefix = (
@@ -237,16 +227,12 @@ def get_result_files(self):
             ]
         )
         if (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "preparation"
-            ]["mode"]
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
             == "gene"
         ):
             name_pattern = "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library.name}"
         elif (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "preparation"
-            ]["mode"]
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
             == "transcript"
         ):
             name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}"
@@ -309,9 +295,7 @@ class SomaticNeoepitopePredictionWorkflow(BaseStep):
     name = "neoepitope_prediction"
     sheet_shortcut_class = CancerCaseSheet
     sheet_shortcut_kwargs = {
-        "options": CancerCaseSheetOptions(
-            allow_missing_normal=True, allow_missing_tumor=True
-        )
+        "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
     }
 
     @classmethod
@@ -342,14 +326,10 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
         )
         self.register_sub_workflow(
             "ngs_mapping",
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "path_rna_ngs_mapping"
-            ],
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"],
         )
         # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
-            "tools_ngs_mapping"
-        ]:
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]:
             self.w_config["step_config"]["somatic_neoepitope_prediction"][
                 "tools_ngs_mapping"
             ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]
@@ -365,18 +345,16 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
             self.w_config["step_config"]["somatic_neoepitope_prediction"][
                 "tools_somatic_variant_annotation"
             ] = ["vep"]
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
-            "tools_rna_mapping"
-        ]:
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_rna_mapping"]:
             self.w_config["step_config"]["somatic_neoepitope_prediction"][
                 "tools_rna_mapping"
             ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"]
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
-            "preparation"
-        ]["path_features"]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "preparation"
-            ]["path_features"] = self.w_config["static_data_config"]["features"]["path"]
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
+            "path_features"
+        ]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
+                "path_features"
+            ] = self.w_config["static_data_config"]["features"]["path"]
 
     def get_result_files(self):
         for sub_step in self.sub_steps.values():
diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
index ef40f0151..9b3a6956e 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
@@ -299,9 +299,7 @@ def parse_expression_file(args, vcf_reader, vcf_writer):
 def add_AD_rna_file(entry, is_multi_sample, sample_name, rna_vcf):
     RAD_temp = []
     ROT = []
-    for rna_record in rna_vcf.fetch(
-        entry.CHROM, entry.affected_start, entry.affected_end
-    ):
+    for rna_record in rna_vcf.fetch(entry.CHROM, entry.affected_start, entry.affected_end):
         rna_alt = [alt.value for alt in rna_record.ALT]
         rna_AD = [c.data.get("AD") for c in rna_record.calls][0]
         ROT_temp = sum(rna_AD) - rna_AD[0]
@@ -344,9 +342,7 @@ def add_expressions(
     for item in items:
         entry_count += 1
         if ignore_ensembl_id_version:
-            subset = df.loc[
-                df["transcript_without_version"] == re.sub(r"\.[0-9]+$", "", item)
-            ]
+            subset = df.loc[df["transcript_without_version"] == re.sub(r"\.[0-9]+$", "", item)]
         else:
             subset = df.loc[df[id_column] == item]
         if len(subset) > 0:
@@ -373,12 +369,8 @@ def define_parser():
     )
 
     parser.add_argument("--input_vcf", help="A VEP-annotated VCF file")
-    parser.add_argument(
-        "--expression_file", help="A TSV file containing expression estimates"
-    )
-    parser.add_argument(
-        "--genecode", help="A genecode file for calculate TPM from star gene count"
-    )
+    parser.add_argument("--expression_file", help="A TSV file containing expression estimates")
+    parser.add_argument("--genecode", help="A genecode file for calculate TPM from star gene count")
     parser.add_argument(
         "--rna-vcf",
         help="A VCF file of RNA-seq data",
@@ -437,9 +429,7 @@ def args_check(args):
             )
     if args.format == "star":
         if args.genecode is None:
-            raise Exception(
-                "--genecode is not set. This is required when using the `star` format"
-            )
+            raise Exception("--genecode is not set. This is required when using the `star` format")
         exp_col = ["unstranded", "rf", "fr"]
         if args.expression_column not in exp_col:
             raise Exception(
@@ -463,9 +453,7 @@ def adding_extra_information(args):
     )
 
     vcf_writer = create_vcf_writer(args, vcf_reader)
-    (df, id_column, expression_column) = parse_expression_file(
-        args, vcf_reader, vcf_writer
-    )
+    (df, id_column, expression_column) = parse_expression_file(args, vcf_reader, vcf_writer)
     missing_expressions_count = 0
     entry_count = 0
     for entry in vcf_reader:
@@ -522,9 +510,7 @@ def adding_extra_information(args):
         # Add RAD and ROT
         if rna_vcf_reader is not None:
             if (entry.CHROM in CONTIGS) and (entry.is_snv()):
-                entry = add_AD_rna_file(
-                    entry, is_multi_sample, args.sample_name, rna_vcf_reader
-                )
+                entry = add_AD_rna_file(entry, is_multi_sample, args.sample_name, rna_vcf_reader)
         vcf_writer.write_record(entry)
 
     vcf_reader.close()
diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
index 760b63bd6..9fbd6be11 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
@@ -18,9 +18,7 @@
     "comb_rna.py",
 )
 ensemble_id = (
-    "--ignore-ensembl-id-version"
-    if config["preparation"]["ignore-ensembl-id-version"]
-    else ""
+    "--ignore-ensembl-id-version" if config["preparation"]["ignore-ensembl-id-version"] else ""
 )
 
 shell(
diff --git a/snappy_wrappers/wrappers/stringtie/wrapper.py b/snappy_wrappers/wrappers/stringtie/wrapper.py
index f71436713..0ded26e3d 100644
--- a/snappy_wrappers/wrappers/stringtie/wrapper.py
+++ b/snappy_wrappers/wrappers/stringtie/wrapper.py
@@ -44,4 +44,4 @@
 md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5}
 md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5}
 """
-)
\ No newline at end of file
+)
diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
index eac6138d7..960aff0e3 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
@@ -83,6 +83,7 @@ def minimal_config():
         ).lstrip()
     )
 
+
 @pytest.fixture
 def somatic_neoepitope_prediction_workflow(
     dummy_workflow,
@@ -110,6 +111,7 @@ def somatic_neoepitope_prediction_workflow(
         work_dir,
     )
 
+
 def test_neoepitope_preparation_step_part_get_input_files(somatic_neoepitope_prediction_workflow):
     # Define expected
     vcf_base_out = (
@@ -129,33 +131,43 @@ def test_neoepitope_preparation_step_part_get_input_files(somatic_neoepitope_pre
     actual = somatic_neoepitope_prediction_workflow.get_input_files("neoepitope_preparation", "run")
     assert actual == expected
 
+
 def test_neoepitope_preparation_step_part_get_output_files(somatic_neoepitope_prediction_workflow):
     base_out = (
-                "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/"
-                "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
-            )
+        "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/"
+        "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
+    )
     expected = get_expected_output_vcf_files_dict(base_out)
-    actual = somatic_neoepitope_prediction_workflow.get_output_files("neoepitope_preparation", "run")
+    actual = somatic_neoepitope_prediction_workflow.get_output_files(
+        "neoepitope_preparation", "run"
+    )
     assert actual == expected
 
+
 def test_neoepitope_preparation_step_part_get_log_files(somatic_neoepitope_prediction_workflow):
     base_out = (
         "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/"
-                "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
+        "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
     )
     expected = get_expected_log_files_dict(base_out=base_out)
     actual = somatic_neoepitope_prediction_workflow.get_log_file("neoepitope_preparation", "run")
     assert actual == expected
 
-def test_neoepitope_preparation_step_part_get_resource_usage(somatic_neoepitope_prediction_workflow):
+
+def test_neoepitope_preparation_step_part_get_resource_usage(
+    somatic_neoepitope_prediction_workflow,
+):
     # Define expected
     expected_dict = {"threads": 2, "time": "1:00:00", "memory": "6144M"}
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
-        actual = somatic_neoepitope_prediction_workflow.get_resource("neoepitope_preparation", "run", resource)
+        actual = somatic_neoepitope_prediction_workflow.get_resource(
+            "neoepitope_preparation", "run", resource
+        )
         assert actual == expected, msg_error
 
+
 def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_workflow):
     """Test simple functionality of the workflow"""
     # Check created sub steps
@@ -178,14 +190,9 @@ def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_wo
             ext=ext,
             dir_="out",
         )
-        for i,t in ((1, 1), (2, 2))
-        for ext in (
-            "vcf.gz",
-            "vcf.gz.tbi",
-            "vcf.gz.md5",
-            "vcf.gz.tbi.md5"
-        )
-        for mapper in("bwa",)
+        for i, t in ((1, 1), (2, 2))
+        for ext in ("vcf.gz", "vcf.gz.tbi", "vcf.gz.md5", "vcf.gz.tbi.md5")
+        for mapper in ("bwa",)
         for var_caller in ("mutect2", "scalpel")
         for anno_caller in ("vep",)
     ]
@@ -200,7 +207,7 @@ def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_wo
             ext=ext,
             dir_="log",
         )
-        for i,t in ((1, 1), (2, 2))
+        for i, t in ((1, 1), (2, 2))
         for ext in (
             "conda_info.txt",
             "conda_list.txt",
@@ -209,7 +216,7 @@ def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_wo
             "conda_list.txt.md5",
             "log.md5",
         )
-        for mapper in("bwa",)
+        for mapper in ("bwa",)
         for var_caller in ("mutect2", "scalpel")
         for anno_caller in ("vep",)
     ]

From 0c35c665bd2de1eea2b43da960e124dc18d5f168 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Thu, 4 Apr 2024 13:28:35 +0200
Subject: [PATCH 04/23] Satisfying snakefmt

---
 .../workflows/somatic_neoepitope_prediction/Snakefile         | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
index 0fcffcce3..7fbb4bf6d 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
@@ -21,9 +21,7 @@ configfile: "config.yaml"
 config, lookup_paths, config_paths = expand_ref("config.yaml", config)
 
 # WorkflowImpl Object Setup ===================================================
-wf = SomaticNeoepitopePredictionWorkflow(
-    workflow, config, lookup_paths, config_paths, os.getcwd()
-)
+wf = SomaticNeoepitopePredictionWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())
 
 
 localrules:

From b4b36ba5678bd019bbff4cf5895855fe9d909721 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Thu, 4 Apr 2024 13:33:09 +0200
Subject: [PATCH 05/23] make isort comfort

---
 .../somatic_neoepitope_prediction/__init__.py    | 16 +++-------------
 .../wrappers/pvactools/combining/comb_rna.py     |  9 +++++----
 .../wrappers/pvactools/combining/wrapper.py      |  1 +
 3 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
index af5776df3..898ce1c55 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
@@ -35,27 +35,17 @@
 import os
 import sys
 
-from biomedsheets.shortcuts import (
-    CancerCaseSheet,
-    CancerCaseSheetOptions,
-    is_not_background,
-)
+from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
 from snakemake.io import expand
 
 from snappy_pipeline.utils import dictify, listify
 from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart
-from snappy_pipeline.workflows.ngs_mapping import (
-    NgsMappingWorkflow,
-    ResourceUsage,
-)
+from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage
+from snappy_pipeline.workflows.somatic_variant_annotation import SomaticVariantAnnotationWorkflow
 from snappy_pipeline.workflows.somatic_variant_calling import (
     SOMATIC_VARIANT_CALLERS_MATCHED,
     SomaticVariantCallingWorkflow,
 )
-from snappy_pipeline.workflows.somatic_variant_annotation import (
-    SomaticVariantAnnotationWorkflow,
-)
-
 
 __author__ = "Pham Gia Cuong"
 __email__ = "pham.gia-cuong@bih-charite.de"
diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
index 9b3a6956e..8169f2366 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
@@ -1,10 +1,11 @@
 import argparse
+import logging
+import re
 import sys
-import vcfpy
-import pyranges as pr
+
 import pandas as pd
-import re
-import logging
+import pyranges as pr
+import vcfpy
 
 
 ########################## Reading vcf files ##########################
diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
index 9fbd6be11..b418ca3c8 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
@@ -4,6 +4,7 @@
 """
 
 import os
+
 from snakemake.shell import shell
 
 __author__ = "Pham Gia Cuong"

From f1af42ce56e4b2520b84fb9584d42181c16e788a Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Wed, 22 May 2024 13:08:28 +0200
Subject: [PATCH 06/23] update neoepitope prediction

---
 .../somatic_neoepitope_prediction/__init__.py | 143 +++++++++---------
 .../wrappers/pvactools/combining/comb_rna.py  |  92 ++++++-----
 .../wrappers/pvactools/combining/wrapper.py   |   4 +
 3 files changed, 117 insertions(+), 122 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
index 898ce1c55..a397b2401 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
@@ -204,6 +204,74 @@ def get_resource_usage(self, action):
             memory=f"{mem_mb}M",
         )
 
+
+class SomaticNeoepitopePredictionWorkflow(BaseStep):
+    """Perform neoepitope prediction workflow"""
+
+    name = "neoepitope_prediction"
+    sheet_shortcut_class = CancerCaseSheet
+    sheet_shortcut_kwargs = {
+        "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
+    }
+
+    @classmethod
+    def default_config_yaml(cls):
+        """Return default config YAML, to be overwritten by project-specific one."""
+        return DEFAULT_CONFIG
+
+    def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir):
+        super().__init__(
+            workflow,
+            config,
+            config_lookup_paths,
+            config_paths,
+            workdir,
+            (
+                SomaticVariantCallingWorkflow,
+                SomaticVariantAnnotationWorkflow,
+                NgsMappingWorkflow,
+            ),
+        )
+        # Register sub step classes so the sub steps are available
+        self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart))
+        self.register_sub_workflow(
+            "somatic_variant_annotation",
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "path_somatic_variant_annotation"
+            ],
+        )
+        self.register_sub_workflow(
+            "ngs_mapping",
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"],
+        )
+        # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "tools_ngs_mapping"
+            ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
+            "tools_somatic_variant_calling"
+        ]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "tools_somatic_variant_calling"
+            ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"]
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
+            "tools_somatic_variant_annotation"
+        ]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "tools_somatic_variant_annotation"
+            ] = ["vep"]
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_rna_mapping"]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"][
+                "tools_rna_mapping"
+            ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"]
+        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
+            "path_features"
+        ]:
+            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
+                "path_features"
+            ] = self.w_config["static_data_config"]["features"]["path"]
+
     @listify
     def get_result_files(self):
         callers = set(
@@ -259,7 +327,7 @@ def _yield_result_files_matched(self, tpl, **kwargs):
         This function returns the results from the matched somatic variant callers such as
         Mutect.
         """
-        for sheet in filter(is_not_background, self.parent.shortcut_sheets):
+        for sheet in filter(is_not_background, self.shortcut_sheets):
             for sample_pair in sheet.all_sample_pairs:
                 if (
                     not sample_pair.tumor_sample.dna_ngs_library
@@ -278,79 +346,6 @@ def _yield_result_files_matched(self, tpl, **kwargs):
                     **kwargs,
                 )
 
-
-class SomaticNeoepitopePredictionWorkflow(BaseStep):
-    """Perform neoepitope prediction workflow"""
-
-    name = "neoepitope_prediction"
-    sheet_shortcut_class = CancerCaseSheet
-    sheet_shortcut_kwargs = {
-        "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
-    }
-
-    @classmethod
-    def default_config_yaml(cls):
-        """Return default config YAML, to be overwritten by project-specific one."""
-        return DEFAULT_CONFIG
-
-    def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir):
-        super().__init__(
-            workflow,
-            config,
-            config_lookup_paths,
-            config_paths,
-            workdir,
-            (
-                SomaticVariantCallingWorkflow,
-                SomaticVariantAnnotationWorkflow,
-                NgsMappingWorkflow,
-            ),
-        )
-        # Register sub step classes so the sub steps are available
-        self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart))
-        self.register_sub_workflow(
-            "somatic_variant_annotation",
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "path_somatic_variant_annotation"
-            ],
-        )
-        self.register_sub_workflow(
-            "ngs_mapping",
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"],
-        )
-        # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_ngs_mapping"
-            ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
-            "tools_somatic_variant_calling"
-        ]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_somatic_variant_calling"
-            ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"]
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
-            "tools_somatic_variant_annotation"
-        ]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_somatic_variant_annotation"
-            ] = ["vep"]
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_rna_mapping"]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_rna_mapping"
-            ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"]
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
-            "path_features"
-        ]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
-                "path_features"
-            ] = self.w_config["static_data_config"]["features"]["path"]
-
-    def get_result_files(self):
-        for sub_step in self.sub_steps.values():
-            if sub_step.name not in (LinkOutStepPart.name,):
-                yield from sub_step.get_result_files()
-
     def check_config(self):
         """Check that the path to the NGS mapping is present"""
         self.ensure_w_config(
diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
index 8169f2366..8a2d596ff 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
@@ -67,7 +67,7 @@ def create_vcf_writer(args, vcf_reader):
                 [
                     ("ID", "GX"),
                     ("Number", "."),
-                    ("Type", "String"),
+                    ("Type", "Float"),
                     ("Description", "Gene Expressions"),
                 ]
             )
@@ -79,7 +79,7 @@ def create_vcf_writer(args, vcf_reader):
                 [
                     ("ID", "TX"),
                     ("Number", "."),
-                    ("Type", "String"),
+                    ("Type", "Float"),
                     ("Description", "Transcript Expressions"),
                 ]
             )
@@ -91,7 +91,7 @@ def create_vcf_writer(args, vcf_reader):
                 [
                     ("ID", "RAD"),
                     ("Number", "."),
-                    ("Type", "String"),
+                    ("Type", "Integer"),
                     (
                         "Description",
                         "Allelic depths from mRNA sequencing data of the same sample",
@@ -104,7 +104,7 @@ def create_vcf_writer(args, vcf_reader):
                 [
                     ("ID", "ROT"),
                     ("Number", "."),
-                    ("Type", "String"),
+                    ("Type", "Integer"),
                     (
                         "Description",
                         "Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample",
@@ -154,12 +154,8 @@ def resolve_expression_column(args):
     elif args.format == "stringtie":
         return "TPM"
     elif args.format == "star":
-        if args.expression_column is None:
-            raise Exception(
-                "ERROR: `--expression-column` option is required when using the `star` format"
-            )
-        else:
-            return "TPM_" + args.expression_column
+        args.expression_column = "read_count"
+        return "TPM_" + args.expression_column
     elif args.format == "custom":
         if args.expression_column is None:
             raise Exception(
@@ -184,7 +180,7 @@ def resolve_stringtie_id_column(args, headers):
 
 
 def genecount_reader(genecount_path):
-    col_name = ["gene_id", "unstranded", "rf", "fr"]
+    col_name = ["gene_id", "read_count"]
     star_genecount = pd.read_csv(
         genecount_path,
         skiprows=4,
@@ -193,14 +189,15 @@ def genecount_reader(genecount_path):
         header=None,
     ).sort_values(by=["gene_id"])
     # star_genecount["Ensembl_gene_identifier"] = star_genecount.Ensembl_gene_identifier.map(lambda x: x.split(".")[0])
-    star_genecount.unstranded = star_genecount.unstranded.astype(int, copy=None)
-    star_genecount.rf = star_genecount.rf.astype(int, copy=None)
-    star_genecount.fr = star_genecount.fr.astype(int, copy=None)
+    star_genecount.read_count = star_genecount.read_count.astype(int, copy=None)
+    # star_genecount.rf = star_genecount.rf.astype(int, copy=None)
+    # star_genecount.fr = star_genecount.fr.astype(int, copy=None)
     # star_genecount.set_index("gene_id")
     return star_genecount
 
 
-def genecode_reader(genecode_path, star_genecount, expression_column):
+def genecode_reader(genecode_path, star_genecount):
+    expression_column = "read_count"
     gc = pr.read_gtf(genecode_path, as_df=True)
     gc = gc[gc["gene_id"].isin(star_genecount["gene_id"])]
     gc = gc[(gc.Feature == "gene")]
@@ -269,7 +266,7 @@ def parse_expression_file(args, vcf_reader, vcf_writer):
     elif args.format == "star":
         id_column = resolve_id_column(args)
         star_genecount = genecount_reader(args.expression_file)
-        df = genecode_reader(args.genecode, star_genecount, args.expression_column)
+        df = genecode_reader(args.genecode, star_genecount)
     else:
         id_column = resolve_id_column(args)
         df = pd.read_csv(args.expression_file, sep="\t")
@@ -294,19 +291,36 @@ def parse_expression_file(args, vcf_reader, vcf_writer):
                 id_column, args.expression_file
             )
         )
-    return df, id_column, expression_column
+    if (args.ignore_ensembl_id_version):
+        df[id_column] = df[id_column].apply(
+            lambda x: re.sub(r"\.[0-9]+$", "", x)
+        )
+    exp_df = df[[id_column, expression_column]]
+    exp_dict = exp_df.set_index(id_column)[expression_column].to_dict()
+    return exp_dict
 
 
 def add_AD_rna_file(entry, is_multi_sample, sample_name, rna_vcf):
     RAD_temp = []
     ROT = []
-    for rna_record in rna_vcf.fetch(entry.CHROM, entry.affected_start, entry.affected_end):
+    try:
+        rna_records = rna_vcf.fetch(entry.CHROM, entry.affected_start, entry.affected_end)
+        rna_record = next(rna_records)
+    except (StopIteration, ValueError):
+        # if there is no snp. RAD and RoT will be set to 0
+        if is_multi_sample:
+            entry.FORMAT += ["RAD"]
+            entry.call_for_sample[sample_name].data["RAD"] = 0
+            entry.FORMAT += ["ROT"]
+            entry.call_for_sample[sample_name].data["ROT"] = 0
+        else:
+            entry.add_format("RAD", 0)
+            entry.add_format("ROT", 0)
+    else:
         rna_alt = [alt.value for alt in rna_record.ALT]
         rna_AD = [c.data.get("AD") for c in rna_record.calls][0]
         ROT_temp = sum(rna_AD) - rna_AD[0]
-        if not rna_alt:
-            continue
-        else:
+        if rna_alt:
             rna_AD = [c.data.get("AD") for c in rna_record.calls][0]
             for dna_alt in entry.ALT:
                 if dna_alt.value in rna_alt:
@@ -330,11 +344,9 @@ def add_expressions(
     entry,
     is_multi_sample,
     sample_name,
-    df,
+    exp_dict,
     items,
     tag,
-    id_column,
-    expression_column,
     ignore_ensembl_id_version,
     missing_expressions_count,
     entry_count,
@@ -343,11 +355,9 @@ def add_expressions(
     for item in items:
         entry_count += 1
         if ignore_ensembl_id_version:
-            subset = df.loc[df["transcript_without_version"] == re.sub(r"\.[0-9]+$", "", item)]
-        else:
-            subset = df.loc[df[id_column] == item]
-        if len(subset) > 0:
-            expressions[item] = subset[expression_column].sum()
+            item = re.sub(r"\.[0-9]+$", "", item)
+        if item in exp_dict:
+            expressions[item] = exp_dict[item]
         else:
             missing_expressions_count += 1
     if is_multi_sample:
@@ -431,21 +441,12 @@ def args_check(args):
     if args.format == "star":
         if args.genecode is None:
             raise Exception("--genecode is not set. This is required when using the `star` format")
-        exp_col = ["unstranded", "rf", "fr"]
-        if args.expression_column not in exp_col:
-            raise Exception(
-                """
-                            --expression-column is not found. Please choose :
-                            unstranded - Assumes a unstranded library.
-                            rf - Assumes a stranded library fr-firststrand.
-                            fr - Assumes a stranded library fr-secondstrand.
-                            """
-            )
 
 
 def adding_extra_information(args):
-    CONTIGS = ["chr{}".format(i) for i in range(1, 23)] + ["chrX", "chrY"]
     (vcf_reader, rna_vcf_reader, is_multi_sample) = create_vcf_reader(args)
+    # Only get contigs from vcf input file
+    CONTIGS = [x.id for x in vcf_reader.header.get_lines("contig")]
     format_pattern = re.compile("Format: (.*)")
     csq_format = (
         format_pattern.search(vcf_reader.header.get_info_field_info("CSQ").description)
@@ -454,7 +455,7 @@ def adding_extra_information(args):
     )
 
     vcf_writer = create_vcf_writer(args, vcf_reader)
-    (df, id_column, expression_column) = parse_expression_file(args, vcf_reader, vcf_writer)
+    exp_dict = parse_expression_file(args, vcf_reader, vcf_writer)
     missing_expressions_count = 0
     entry_count = 0
     for entry in vcf_reader:
@@ -483,11 +484,9 @@ def adding_extra_information(args):
                     entry,
                     is_multi_sample,
                     args.sample_name,
-                    df,
+                    exp_dict,
                     genes,
                     "GX",
-                    id_column,
-                    expression_column,
                     args.ignore_ensembl_id_version,
                     missing_expressions_count,
                     entry_count,
@@ -499,12 +498,9 @@ def adding_extra_information(args):
                     entry,
                     is_multi_sample,
                     args.sample_name,
-                    df,
+                    exp_dict,
                     transcript_ids,
                     "TX",
-                    id_column,
-                    expression_column,
-                    args.ignore_ensembl_id_version,
                     missing_expressions_count,
                     entry_count,
                 )
diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
index b418ca3c8..aaa917a33 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
@@ -21,6 +21,9 @@
 ensemble_id = (
     "--ignore-ensembl-id-version" if config["preparation"]["ignore-ensembl-id-version"] else ""
 )
+id_column = (
+    "--id-column " + str(config["preparation"]["id-column"]) if  config["preparation"]["format"] == "custom" else ""
+)
 
 shell(
     r"""
@@ -73,6 +76,7 @@
     -e {config[preparation][expression-column]} \
     -s {snakemake.wildcards.tumor_library} \
     -o /dev/stdout \
+    "{id_column}" \
     "{ensemble_id}" \
 | bgzip -c > {snakemake.output.vcf}
 tabix -f {snakemake.output.vcf}

From 2fcf1bfca70d28f5b98bdda0a7a156028f1c6f2d Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Wed, 22 May 2024 13:28:08 +0200
Subject: [PATCH 07/23] Make black satisfy

---
 snappy_wrappers/wrappers/pvactools/combining/comb_rna.py | 6 ++----
 snappy_wrappers/wrappers/pvactools/combining/wrapper.py  | 4 +++-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
index 8a2d596ff..b28d3eedc 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
@@ -291,10 +291,8 @@ def parse_expression_file(args, vcf_reader, vcf_writer):
                 id_column, args.expression_file
             )
         )
-    if (args.ignore_ensembl_id_version):
-        df[id_column] = df[id_column].apply(
-            lambda x: re.sub(r"\.[0-9]+$", "", x)
-        )
+    if args.ignore_ensembl_id_version:
+        df[id_column] = df[id_column].apply(lambda x: re.sub(r"\.[0-9]+$", "", x))
     exp_df = df[[id_column, expression_column]]
     exp_dict = exp_df.set_index(id_column)[expression_column].to_dict()
     return exp_dict
diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
index aaa917a33..3d698637f 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
@@ -22,7 +22,9 @@
     "--ignore-ensembl-id-version" if config["preparation"]["ignore-ensembl-id-version"] else ""
 )
 id_column = (
-    "--id-column " + str(config["preparation"]["id-column"]) if  config["preparation"]["format"] == "custom" else ""
+    "--id-column " + str(config["preparation"]["id-column"])
+    if config["preparation"]["format"] == "custom"
+    else ""
 )
 
 shell(

From cfa36acbfaa21a3080ddfd9bfbf9912ecd221353 Mon Sep 17 00:00:00 2001
From: CuongPham <50145360+giacuong171@users.noreply.github.com>
Date: Tue, 4 Jun 2024 15:18:59 +0200
Subject: [PATCH 08/23] Update
 snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile

Co-authored-by: Till Hartmann <till.hartmann@udo.edu>
---
 .../workflows/somatic_neoepitope_prediction/Snakefile           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
index 7fbb4bf6d..975acb83c 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""CUBI Pipeline tumor mutational burden step Snakefile"""
+"""CUBI Pipeline somatic neoepitope prediction step Snakefile"""
 
 import os
 

From 8d9155d4a66deee70a4b5815e0ccf62e7dbe3bf4 Mon Sep 17 00:00:00 2001
From: CuongPham <50145360+giacuong171@users.noreply.github.com>
Date: Tue, 4 Jun 2024 15:19:18 +0200
Subject: [PATCH 09/23] Update
 snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py

Co-authored-by: Till Hartmann <till.hartmann@udo.edu>
---
 .../workflows/somatic_neoepitope_prediction/__init__.py         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
index a397b2401..5048012a9 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
@@ -347,7 +347,7 @@ def _yield_result_files_matched(self, tpl, **kwargs):
                 )
 
     def check_config(self):
-        """Check that the path to the NGS mapping is present"""
+        """Check that path_somatic_variant_annotation, preparation/mode and preparation/format are present in the configuration"""
         self.ensure_w_config(
             (
                 "step_config",

From 40cb936e2fac9854cc2cd830fdc474307d228c9c Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Tue, 18 Jun 2024 22:04:20 +0200
Subject: [PATCH 10/23] Update preparation for somatic neoepitope prediction

---
 .../somatic_neoepitope_prediction/Snakefile   |   4 +-
 .../somatic_neoepitope_prediction/__init__.py | 184 +++--
 .../wrappers/pvactools/combining/comb_rna.py  | 645 +++++++++---------
 .../pvactools/combining/environment.yaml      |  12 +-
 .../wrappers/pvactools/combining/wrapper.py   |  61 +-
 5 files changed, 428 insertions(+), 478 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
index 7fbb4bf6d..326510b4e 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""CUBI Pipeline tumor mutational burden step Snakefile"""
+"""CUBI Pipeline somatic neoepitope prediction step Snakefile"""
 
 import os
 
@@ -59,5 +59,7 @@ rule neoepitope_preparation:
         memory=wf.get_resource("neoepitope_preparation", "run", "memory"),
         partition=wf.get_resource("neoepitope_preparation", "run", "partition"),
         tmpdir=wf.get_resource("neoepitope_preparation", "run", "tmpdir"),
+    params:
+        **{"args": wf.get_params("neoepitope_preparation", "run")},
     wrapper:
         wf.wrapper_path("pvactools/combining")
diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
index a397b2401..a252eb961 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
@@ -59,24 +59,22 @@
   somatic_neoepitope_prediction:
     path_somatic_variant_annotation: ../somatic_variant_annotation  # REQUIRED
     path_rna_ngs_mapping: ../ngs_mapping
-    tools_somatic_variant_annotation: []
+    tools_somatic_variant_annotation: ["vep"]
     tools_rna_mapping: [] # deafult to those configured for ngs_mapping
     tools_ngs_mapping: [] # deafult to those configured for ngs_mapping
     tools_somatic_variant_calling: [] # deafult to those configured for somatic_variant_calling
-    max_depth: "4000"
     preparation:
-      format: 'star' # REQUIRED - The file format of the expression file to process. (stringtie,kallisto,cufflinks,custom)
+      format: 'snappy_custom' # REQUIRED - The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom)
       # Use `custom` to process file formats not explicitly supported.
       # The `custom` option requires the use of the --id-column and --expression-column arguments.
-      path_features: ''
-      mode: 'gene'  # REQUIRED
-      id-column: '' # The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format.
-      expression-column: 'fr' # REQUIRED
-                              # The column header in the expression_file for
-                              # the column containing expression data.
-                              # Required when using the `custom` and `star` format. For `star`, there are
-                              # only 3 options [unstranded,rf,fr]
-      ignore-ensembl-id-version: True #Ignore the ensemble id version
+      expression_file: '' # expression file path. If the format is not snappy_custom
+      path_features: '' # Gencode file path, required for star and snappy format
+      mode: 'gene'  # REQUIRED - Determine whether the expression file contains gene or transcript TPM values.
+      full_vep_annotation: True # The somatic_variant_annotation has been done in fully annotation mode
+      id_column:  # Gene/Transcript id column name. Need when using the `custom` format.
+      expression_column:  # Expression column name. Need when using the `custom` format.
+      ignore_ensembl_id_version: True #Ignore the ensemble id version
+      max_depth: 4000
 """
 
 
@@ -113,50 +111,46 @@ def get_input_files(self, action):
             "output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/"
             "{mapper}.{var_caller}.{anno_caller}.{tumor_library}"
         )
-        # Need to change for work on many different tools
-        key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"}
+        if self.config["preparation"]["full_vep_annotation"]:
+            key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"}
+        else:
+            key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"}
         variant_annotation = self.parent.sub_workflows["somatic_variant_annotation"]
         ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
         for key, ext in key_ext.items():
             yield key, variant_annotation(tpl + ext)
         # Getting appropriate rna library
-        for sheet in filter(is_not_background, self.parent.sheets):
-            for donor in sheet.bio_entities.values():
-                for biosample in donor.bio_samples.values():
-                    if biosample.extra_infos["isTumor"]:
-                        for test_sample in biosample.test_samples.values():
-                            if test_sample.extra_infos["extractionType"] == "RNA":
-                                for lib in test_sample.ngs_libraries.values():
-                                    rna_library = lib.name
-                                    rna_tpl = (
-                                        "output/{mapper}.{library_name}/out/{mapper}.{library_name}"
-                                    ).format(
-                                        mapper="star",
-                                        library_name=rna_library,
-                                    )
-                                    ext = {"expression", "bam", "bai"}
-                                    yield "expression", ngs_mapping(rna_tpl + ".GeneCounts.tab")
-                                    yield "bam", ngs_mapping(rna_tpl + ".bam")
-                                    yield "bai", ngs_mapping(rna_tpl + ".bam.bai")
+        if self.config["preparation"]["format"] == "snappy_custom":
+            for sheet in filter(is_not_background, self.parent.sheets):
+                for donor in sheet.bio_entities.values():
+                    for biosample in donor.bio_samples.values():
+                        if biosample.extra_infos["isTumor"]:
+                            for test_sample in biosample.test_samples.values():
+                                if test_sample.extra_infos["extractionType"] == "RNA":
+                                    for lib in test_sample.ngs_libraries.values():
+                                        rna_library = lib.name
+                                        rna_tpl = (
+                                            "output/{mapper}.{library_name}/out/{mapper}.{library_name}"
+                                        ).format(
+                                            mapper="star",
+                                            library_name=rna_library,
+                                        )
+                                        ext = {"expression", "bam", "bai"}
+                                        yield "expression", ngs_mapping(rna_tpl + ".GeneCounts.tab")
+                                        yield "bam", ngs_mapping(rna_tpl + ".bam")
+                                        yield "bai", ngs_mapping(rna_tpl + ".bam.bai")
 
     @dictify
     def get_output_files(self, action):
         """Return output files"""
         # Validate action
-        # Need to add step for adding RAD also
         self._validate_action(action)
-        if (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
-            == "gene"
-        ):
+        if self.config["preparation"]["mode"] == "gene":
             prefix = (
                 "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/"
                 "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
             )
-        elif (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
-            == "transcript"
-        ):
+        elif self.config["preparation"]["mode"] == "transcript":
             prefix = (
                 "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/out/"
                 "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}"
@@ -171,18 +165,12 @@ def _get_log_file(self, action):
         """Return mapping of log files."""
         # Validate action
         self._validate_action(action)
-        if (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
-            == "gene"
-        ):
+        if self.config["preparation"]["mode"] == "gene":
             prefix = (
                 "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/"
                 "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
             )
-        elif (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
-            == "transcript"
-        ):
+        elif self.config["preparation"]["mode"] == "transcript":
             prefix = (
                 "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/"
                 "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}"
@@ -204,11 +192,22 @@ def get_resource_usage(self, action):
             memory=f"{mem_mb}M",
         )
 
+    def get_params(self, action):
+        self._validate_action(action)
+        return {
+            "max_depth": self.config["preparation"]["max_depth"],
+            "format": self.config["preparation"]["format"],
+            "expression_column": self.config["preparation"]["expression_column"],
+            "id_column": self.config["preparation"]["id_column"],
+            "ignore_ensembl_id_version": self.config["preparation"]["ignore_ensembl_id_version"],
+            "mode": self.config["preparation"]["mode"],
+        }
+
 
 class SomaticNeoepitopePredictionWorkflow(BaseStep):
     """Perform neoepitope prediction workflow"""
 
-    name = "neoepitope_prediction"
+    name = "somatic_neoepitope_prediction"
     sheet_shortcut_class = CancerCaseSheet
     sheet_shortcut_kwargs = {
         "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
@@ -236,79 +235,50 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
         self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart))
         self.register_sub_workflow(
             "somatic_variant_annotation",
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "path_somatic_variant_annotation"
-            ],
+            self.config["path_somatic_variant_annotation"],
         )
         self.register_sub_workflow(
             "ngs_mapping",
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"],
+            self.config["path_rna_ngs_mapping"],
         )
         # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_ngs_mapping"
-            ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
-            "tools_somatic_variant_calling"
-        ]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_somatic_variant_calling"
-            ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"]
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"][
-            "tools_somatic_variant_annotation"
-        ]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_somatic_variant_annotation"
-            ] = ["vep"]
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_rna_mapping"]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_rna_mapping"
-            ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"]
-        if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
-            "path_features"
-        ]:
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][
-                "path_features"
-            ] = self.w_config["static_data_config"]["features"]["path"]
+        if not self.config["tools_ngs_mapping"]:
+            self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][
+                "dna"
+            ]
+        if not self.config["tools_somatic_variant_calling"]:
+            self.config["tools_somatic_variant_calling"] = self.w_config["step_config"][
+                "somatic_variant_calling"
+            ]["tools"]
+        if not self.config["tools_somatic_variant_annotation"]:
+            self.config["tools_somatic_variant_annotation"] = ["vep"]
+        if not self.config["tools_rna_mapping"]:
+            self.config["tools_rna_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][
+                "rna"
+            ]
+        if not self.config["preparation"]["path_features"]:
+            self.config["preparation"]["path_features"] = self.w_config["static_data_config"][
+                "features"
+            ]["path"]
 
     @listify
     def get_result_files(self):
-        callers = set(
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_somatic_variant_calling"
-            ]
-        )
-        anno_callers = set(
-            self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_somatic_variant_annotation"
-            ]
-        )
-        if (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
-            == "gene"
-        ):
+        callers = set(self.config["tools_somatic_variant_calling"])
+        anno_callers = set(self.config["tools_somatic_variant_annotation"])
+        if self.config["preparation"]["mode"] == "gene":
             name_pattern = "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library.name}"
-        elif (
-            self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"]
-            == "transcript"
-        ):
+        elif self.config["preparation"]["mode"] == "transcript":
             name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}"
-
         yield from self._yield_result_files_matched(
             os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
-            mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_ngs_mapping"
-            ],
+            mapper=self.config["tools_ngs_mapping"],
             var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
             anno_caller=anno_callers,
             ext=EXT_VALUES,
         )
         yield from self._yield_result_files_matched(
             os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
-            mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"][
-                "tools_ngs_mapping"
-            ],
+            mapper=self.config["tools_ngs_mapping"],
             var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
             anno_caller=anno_callers,
             ext=(
@@ -347,7 +317,7 @@ def _yield_result_files_matched(self, tpl, **kwargs):
                 )
 
     def check_config(self):
-        """Check that the path to the NGS mapping is present"""
+        """Check that path_somatic_variant_annotation, preparation/mode and preparation/format are present in the configuration"""
         self.ensure_w_config(
             (
                 "step_config",
@@ -364,5 +334,5 @@ def check_config(self):
 
         self.ensure_w_config(
             ("step_config", "somatic_neoepitope_prediction", "preparation", "format"),
-            "Format is required for adding ",
+            "Format is required to determine which tool generated the TPM values. ",
         )
diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
index b28d3eedc..b8017b5d3 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
@@ -5,225 +5,148 @@
 
 import pandas as pd
 import pyranges as pr
-import vcfpy
+from vcfpy import OrderedDict, Reader, Writer
 
 
-########################## Reading vcf files ##########################
-def create_vcf_reader(args):
-    vcf_reader = vcfpy.Reader.from_path(args.input_vcf)
-    if args.rna_vcf:
-        rna_vcf_reader = vcfpy.Reader.from_path(args.rna_vcf)
-    else:
-        rna_vcf_reader = None
-    # expression part
-    is_multi_sample = len(vcf_reader.header.samples.names) > 1
-    if is_multi_sample and args.sample_name is None:
-        vcf_reader.close()
-        raise Exception(
-            "ERROR: VCF {} contains more than one sample. Please use the -s option to specify which sample to annotate.".format(
-                args.input_vcf
-            )
-        )
-    elif is_multi_sample and args.sample_name not in vcf_reader.header.samples.names:
-        vcf_reader.close()
-        raise Exception(
-            "ERROR: VCF {} does not contain a sample column for sample {}.".format(
-                args.input_vcf, args.sample_name
-            )
-        )
-    if "CSQ" not in vcf_reader.header.info_ids():
-        vcf_reader.close()
-        raise Exception(
-            "ERROR: VCF {} is not VEP-annotated. Please annotate the VCF with VEP before running this tool.".format(
-                args.input_vcf
-            )
-        )
-    if args.mode == "gene" and "GX" in vcf_reader.header.format_ids():
-        vcf_reader.close()
-        raise Exception(
-            "ERROR: VCF {} is already gene expression annotated. GX format header already exists.".format(
-                args.input_vcf
-            )
-        )
-    elif args.mode == "transcript" and "TX" in vcf_reader.header.format_ids():
-        vcf_reader.close()
-        raise Exception(
-            "ERROR: VCF {} is already transcript expression annotated. TX format header already exists.".format(
-                args.input_vcf
-            )
-        )
-    # snv part
-
-    return vcf_reader, rna_vcf_reader, is_multi_sample
+########################## Define tool parameters ##########################
+def define_parser():
+    parser = argparse.ArgumentParser(
+        "comb_rna.py",
+        description="A tool that will add the data from several expression tools' output files"
+        + "and allelic depths from mRNA sequencing data for snvs"
+        + "to the VCF INFO column. Supported tools are StringTie, Kallisto, Star, snappy_custom"
+        + "and Cufflinks. There also is a ``custom`` option to annotate with data "
+        + "from any tab-delimited file.",
+    )
 
+    parser.add_argument("--input-vcf", help="A VEP-annotated VCF file")
+    parser.add_argument("--expression-file", help="A expression file")
+    parser.add_argument("--genecode", help="A genecode file for calculate TPM from star gene count")
+    parser.add_argument(
+        "--rna-vcf",
+        help="A VCF file of RNA-seq data",
+    )
+    parser.add_argument(
+        "--format",
+        choices=["kallisto", "stringtie", "cufflinks", "star", "snappy_custom", "custom"],
+        help="The file format of the expression file to process. "
+        + "Use `custom` to process file formats not explicitly supported. "
+        + "The `custom` option requires the use of the --id-column and --expression-column arguments.",
+    )
+    parser.add_argument(
+        "--mode",
+        choices=["gene", "transcript"],
+        help="The type of expression data in the expression_file",
+    )
+    parser.add_argument(
+        "-i",
+        "--id-column",
+        help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format.",
+    )
+    parser.add_argument(
+        "-e",
+        "--expression-column",
+        help="The column header in the expression_file for the column containing expression data. Required when using the `custom` and `star` format.",
+    )
+    parser.add_argument(
+        "-s",
+        "--sample-name",
+        help="If the input_vcf contains multiple samples, the name of the sample to annotate.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output-vcf",
+        help="Path to write the output VCF file. If not provided, the output VCF file will be "
+        + "written next to the input VCF file with a .tx.vcf or .gx.vcf file ending.",
+    )
+    parser.add_argument(
+        "--ignore-ensembl-id-version",
+        help='Assumes that the final period and number denotes the Ensembl ID version and ignores it (i.e. for "ENST00001234.3" - ignores the ".3").',
+        action="store_true",
+    )
 
-########################## Adding extra fields into Format ##########################
-def create_vcf_writer(args, vcf_reader):
-    (head, sep, tail) = args.input_vcf.rpartition(".vcf")
-    new_header = vcf_reader.header.copy()
-    if args.mode == "gene":
-        new_header.add_format_line(
-            vcfpy.OrderedDict(
-                [
-                    ("ID", "GX"),
-                    ("Number", "."),
-                    ("Type", "Float"),
-                    ("Description", "Gene Expressions"),
-                ]
-            )
-        )
-        output_file = ("").join([head, ".gx.vcf", tail])
-    elif args.mode == "transcript":
-        new_header.add_format_line(
-            vcfpy.OrderedDict(
-                [
-                    ("ID", "TX"),
-                    ("Number", "."),
-                    ("Type", "Float"),
-                    ("Description", "Transcript Expressions"),
-                ]
-            )
-        )
-        output_file = ("").join([head, ".tx.vcf", tail])
-    if args.rna_vcf:
-        new_header.add_format_line(
-            vcfpy.OrderedDict(
-                [
-                    ("ID", "RAD"),
-                    ("Number", "."),
-                    ("Type", "Integer"),
-                    (
-                        "Description",
-                        "Allelic depths from mRNA sequencing data of the same sample",
-                    ),
-                ]
-            )
-        )
-        new_header.add_format_line(
-            vcfpy.OrderedDict(
-                [
-                    ("ID", "ROT"),
-                    ("Number", "."),
-                    ("Type", "Integer"),
-                    (
-                        "Description",
-                        "Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample",
-                    ),
-                ]
-            )
-        )
-        (head, sep, tail) = output_file.rpartition(".vcf")
-        output_file = ("").join([head, ".pileup.vcf", tail])
-    if args.output_vcf:
-        output_file = args.output_vcf
-    return vcfpy.Writer.from_path(output_file, new_header)
+    return parser
 
 
-########################## Auto assign gene id column based on different tools ##########################
-def resolve_id_column(args):
-    if args.format == "cufflinks":
-        return "tracking_id"
-    elif args.format == "kallisto":
-        if args.mode == "gene":
-            return "gene"
-        elif args.mode == "transcript":
-            return "target_id"
-    elif args.format == "stringtie":
-        if args.mode == "gene":
-            return "Gene ID"
-    elif args.format == "star":
-        return "gene_id"  # NEED change
-    elif args.format == "custom":
+########################## Arguments check ##########################
+def args_check(args):
+    if args.format == "custom":
         if args.id_column is None:
             raise Exception(
-                "ERROR: `--id-column` option is required when using the `custom` format"
+                "--id-column is not set. This is required when using the `custom` format."
             )
-        else:
-            return args.id_column
-
-
-########################## Auto assign expression column based on different tools ##########################
-def resolve_expression_column(args):
-    if args.format == "cufflinks":
-        return "FPKM"
-    elif args.format == "kallisto":
-        if args.mode == "gene":
-            return "abundance"
-        elif args.mode == "transcript":
-            return "tpm"
-    elif args.format == "stringtie":
-        return "TPM"
-    elif args.format == "star":
-        args.expression_column = "read_count"
-        return "TPM_" + args.expression_column
-    elif args.format == "custom":
         if args.expression_column is None:
             raise Exception(
-                "ERROR: `--expression-column` option is required when using the `custom` format"
+                "--expression-column is not set. This is required when using the `custom` format."
             )
-        else:
-            return args.expression_column
-
-
-def to_array(dictionary):
-    array = []
-    for key, value in dictionary.items():
-        array.append("{}|{}".format(key, value))
-    return sorted(array)
+    if args.format == "star":
+        if args.genecode is None:
+            raise Exception("--genecode is not set. This is required when using the `star` format")
 
 
-def resolve_stringtie_id_column(args, headers):
-    if "gene_id" in headers:
-        return "gene_id"
-    else:
-        return "transcript_id"
-
-
-def genecount_reader(genecount_path):
-    col_name = ["gene_id", "read_count"]
-    star_genecount = pd.read_csv(
-        genecount_path,
-        skiprows=4,
-        sep="\t",
-        names=col_name,
-        header=None,
-    ).sort_values(by=["gene_id"])
-    # star_genecount["Ensembl_gene_identifier"] = star_genecount.Ensembl_gene_identifier.map(lambda x: x.split(".")[0])
-    star_genecount.read_count = star_genecount.read_count.astype(int, copy=None)
-    # star_genecount.rf = star_genecount.rf.astype(int, copy=None)
-    # star_genecount.fr = star_genecount.fr.astype(int, copy=None)
-    # star_genecount.set_index("gene_id")
-    return star_genecount
-
-
-def genecode_reader(genecode_path, star_genecount):
-    expression_column = "read_count"
+########################## Get expected column names ##########################
+def get_expected_column_names(format, mode, id_column, exp_column):
+    match (format, mode):
+        case ("star", _):
+            try:
+                exp = int(exp_column)
+                if 1 < exp and exp <= 4:
+                    return [0, exp - 1]
+                else:
+                    raise Exception("Expression column is invalid")
+            except ValueError:
+                raise ValueError("Expression column in star should be a number")
+        case ("snappy_custom", _):
+            return [0, 1]
+        case ("kallisto", "gene"):
+            return ["gene", "abundance"]
+        case ("kallisto", "transcript"):
+            return ["target_id", "tpm"]
+        case ("cufflinks", _):
+            return ["tracking_id", "FPKM"]
+        case ("stringtie", "gene"):
+            return ["Gene ID", "TPM"]
+        case ("stringtie", "transcript"):
+            return ["transcript_id", "TPM"]
+        case ("custom", _):
+            return [id_column, exp_column]
+        case _:
+            raise Exception("We don't support {format} with mode as {mode} yet.")
+
+
+########################## Genecode for star parser ##########################
+def count_to_TPM(genecode_path, exp_df, no_version=False):
     gc = pr.read_gtf(genecode_path, as_df=True)
-    gc = gc[gc["gene_id"].isin(star_genecount["gene_id"])]
+    if no_version:
+        gc.loc[:, "gene_id"] = gc.loc[:, "gene_id"].apply(lambda x: re.sub(r"\.[0-9]+$", "", x))
+    gc = gc[gc["gene_id"].isin(exp_df.iloc[:, 0])]
     gc = gc[(gc.Feature == "gene")]
     exon = gc[["Chromosome", "Start", "End", "gene_id", "gene_name"]]
     # Convert columns to proper types.
-    exon.Start = exon.Start.astype(int)
-    exon.End = exon.End.astype(int)
+    exon.loc[:, "Start"] = exon.loc[:, "Start"].astype(int)
+    exon.loc[:, "End"] = exon.loc[:, "End"].astype(int)
     # Sort in place.
-    exon.sort_values(by=["Chromosome", "Start", "End"], inplace=True)
-    # Group the rows by the Ensembl gene identifier (with version numbers.)
+    exon = exon.sort_values(by=["Chromosome", "Start", "End"])
+    # Group the rows by the Ensembl gene identifier.
     groups = exon.groupby("gene_id")
-
     lengths = groups.apply(count_bp)
     # Create a new DataFrame with gene lengths and EnsemblID.
-    ensembl_no_version = lengths.index
-    ldf = pd.DataFrame({"length": lengths.values, "gene_id": ensembl_no_version})
-    star_tpm = pd.merge(ldf, star_genecount, how="inner", on="gene_id")
-    star_tpm["TPM_" + expression_column] = calculate_TPM(star_tpm, expression_column)
-    return star_tpm
+    gene_id = lengths.index
+    if len(gene_id) != len(exp_df):
+        raise Exception("Duplicated gene id in expression file")
+    ldf = pd.DataFrame({"length": lengths.values, "gene_id": gene_id}).sort_values(by="gene_id")
+    # Calculate TPM
+    tpm = calculate_TPM(exp_df, ldf)
+    tpm_dict = {k: v for k, v in zip(exp_df.iloc[:, 0], tpm)}
+    return tpm_dict
 
 
-def calculate_TPM(df, strand):
-    effLen = (df["length"]) / 1000
-    rate = df[strand] / effLen
-    denom = sum(rate) / (10**6)
-    return rate / denom
+def calculate_TPM(exp_df, ldf):
+    "TPM = (reads_transcript / transcript_length_kb) / scaled_total_mapped_reads"
+    transcript_length_kb = ldf["length"] / 1000
+    RPK = exp_df.iloc[:, 1] / transcript_length_kb
+    scale_factor_permil = sum(RPK) / (10**6)
+    return RPK / scale_factor_permil
 
 
 def count_bp(df):
@@ -258,47 +181,113 @@ def count_bp(df):
     return sum(bp)
 
 
-def parse_expression_file(args, vcf_reader, vcf_writer):
-    if args.format == "stringtie" and args.mode == "transcript":
-        df_all = pr.read_gtf(args.expression_file)
-        df = df_all[df_all["feature"] == "transcript"]
-        id_column = resolve_stringtie_id_column(args, df.columns.values)
-    elif args.format == "star":
-        id_column = resolve_id_column(args)
-        star_genecount = genecount_reader(args.expression_file)
-        df = genecode_reader(args.genecode, star_genecount)
+########################## Expression file parser ##########################
+def colnames_check(names, exp_names):
+    return set(names).issubset(set(exp_names))
+
+
+def expression_file_parser(
+    path, format, mode, id_column, exp_column, no_version=False, genecode=""
+):
+    col_names = get_expected_column_names(format, mode, id_column, exp_column)
+    if isinstance(col_names[0], int):
+        exp_df = pd.read_csv(path, skiprows=4, sep="\t", header=None)
+        exp_df = exp_df.iloc[:, col_names]
+        exp_df = exp_df.sort_values(by=0)
+        if no_version:
+            exp_df.iloc[:, 0] = exp_df.iloc[:, 0].apply(lambda x: re.sub(r"\.[0-9]+$", "", x))
+        tpm_dict = count_to_TPM(genecode, exp_df, no_version)
     else:
-        id_column = resolve_id_column(args)
-        df = pd.read_csv(args.expression_file, sep="\t")
-    if args.ignore_ensembl_id_version:
-        df["transcript_without_version"] = df[id_column].apply(
-            lambda x: re.sub(r"\.[0-9]+$", "", x)
+        if format == "stringtie" and mode == "transcript":
+            exp_df = pr.read_gtf(path, as_df=True)
+            exp_df = exp_df[exp_df["feature"] == "transcript"]
+        else:
+            exp_df = pd.read_csv(path, sep="\t", header=0)
+
+        if colnames_check(col_names, exp_df.columns):
+            exp_df = exp_df.loc[:, col_names]
+        else:
+            raise Exception(
+                "Gene id column or expression column is not presense in expression file"
+            )
+
+        if no_version:
+            exp_df.iloc[:, 0] = exp_df.iloc[:, 0].apply(lambda x: re.sub(r"\.[0-9]+$", "", x))
+        tpm_dict = {k: v for k, v in zip(exp_df.iloc[:, 0], exp_df.iloc[:, 1])}
+    return tpm_dict
+
+
+########################## Dna Vcf parser ##########################
+def dna_vcf_parser(path, sample_name):
+    vcf_reader = Reader.from_path(path)
+    is_multi_sample = len(vcf_reader.header.samples.names) > 1
+    if is_multi_sample and sample_name is None:
+        vcf_reader.close()
+        raise Exception(
+            "ERROR: VCF {} contains more than one sample. Please use the -s option to specify which sample to annotate.".format(
+                path
+            )
         )
-    expression_column = resolve_expression_column(args)
-    if expression_column not in df.columns.values:
+    elif is_multi_sample and sample_name not in vcf_reader.header.samples.names:
         vcf_reader.close()
-        vcf_writer.close()
         raise Exception(
-            "ERROR: expression_column header {} does not exist in expression_file {}".format(
-                expression_column, args.expression_file
+            "ERROR: VCF {} does not contain a sample column for sample {}.".format(
+                path, sample_name
             )
         )
-    if id_column not in df.columns.values:
+    if "CSQ" not in vcf_reader.header.info_ids():
         vcf_reader.close()
-        vcf_writer.close()
         raise Exception(
-            "ERROR: id_column header {} does not exist in expression_file {}".format(
-                id_column, args.expression_file
+            "ERROR: VCF {} is not VEP-annotated. Please annotate the VCF with VEP before running this tool.".format(
+                path
             )
         )
-    if args.ignore_ensembl_id_version:
-        df[id_column] = df[id_column].apply(lambda x: re.sub(r"\.[0-9]+$", "", x))
-    exp_df = df[[id_column, expression_column]]
-    exp_dict = exp_df.set_index(id_column)[expression_column].to_dict()
-    return exp_dict
+    return vcf_reader, is_multi_sample
+
+
+########################## Rna Vcf parser ##########################
+def rna_vcf_parser(path):
+    rna_vcf_reader = Reader.from_path(path)
+    return rna_vcf_reader
+
+
+########################## Add expression values functions ##########################
+def to_array(dictionary):
+    array = []
+    for key, value in dictionary.items():
+        array.append("{}|{}".format(key, value))
+    return sorted(array)
+
+
+def add_Expression_to_vcf(
+    entry,
+    is_multi_sample,
+    sample_name,
+    exp_dict,
+    genes,
+    tag,
+    no_version,
+    missing_expressions_count,
+    entry_count,
+):
+    expressions = {}
+    for gene in genes:
+        entry_count += 1
+        if no_version:
+            gene = re.sub(r"\.[0-9]+$", "", gene)
+        if gene in exp_dict:
+            expressions[gene] = exp_dict[gene]
+        else:
+            missing_expressions_count += 1
+    if is_multi_sample:
+        entry.FORMAT += [tag]
+        entry.call_for_sample[sample_name].data[tag] = to_array(expressions)
+    else:
+        entry.add_format(tag, to_array(expressions))
+    return (entry, missing_expressions_count, entry_count)
 
 
-def add_AD_rna_file(entry, is_multi_sample, sample_name, rna_vcf):
+def add_AD_to_vcf(entry, is_multi_sample, sample_name, rna_vcf):
     RAD_temp = []
     ROT = []
     try:
@@ -331,132 +320,107 @@ def add_AD_rna_file(entry, is_multi_sample, sample_name, rna_vcf):
             entry.FORMAT += ["RAD"]
             entry.call_for_sample[sample_name].data["RAD"] = RAD_temp
             entry.FORMAT += ["ROT"]
-            entry.call_for_sample[sample_name].data["ROT"] = ROT
+            entry.call_for_sample[sample_name].data["ROT"] = ROT_temp
         else:
             entry.add_format("RAD", RAD_temp)
-            entry.add_format("ROT", ROT)
+            entry.add_format("ROT", ROT_temp)
     return entry
 
 
-def add_expressions(
-    entry,
-    is_multi_sample,
-    sample_name,
-    exp_dict,
-    items,
-    tag,
-    ignore_ensembl_id_version,
-    missing_expressions_count,
-    entry_count,
-):
-    expressions = {}
-    for item in items:
-        entry_count += 1
-        if ignore_ensembl_id_version:
-            item = re.sub(r"\.[0-9]+$", "", item)
-        if item in exp_dict:
-            expressions[item] = exp_dict[item]
-        else:
-            missing_expressions_count += 1
-    if is_multi_sample:
-        entry.FORMAT += [tag]
-        entry.call_for_sample[sample_name].data[tag] = to_array(expressions)
-    else:
-        entry.add_format(tag, to_array(expressions))
-    return (entry, missing_expressions_count, entry_count)
-
-
-########################## Define tool parameters ##########################
-def define_parser():
-    parser = argparse.ArgumentParser(
-        "comb_rna.py",
-        description="A tool that will add the data from several expression tools' output files"
-        + "and allelic depths from mRNA sequencing data for snvs"
-        + "to the VCF INFO column. Supported tools are StringTie, Kallisto, Star"
-        + "and Cufflinks. There also is a ``custom`` option to annotate with data "
-        + "from any tab-delimited file.",
-    )
-
-    parser.add_argument("--input_vcf", help="A VEP-annotated VCF file")
-    parser.add_argument("--expression_file", help="A TSV file containing expression estimates")
-    parser.add_argument("--genecode", help="A genecode file for calculate TPM from star gene count")
-    parser.add_argument(
-        "--rna-vcf",
-        help="A VCF file of RNA-seq data",
-    )
-    parser.add_argument(
-        "--format",
-        choices=["kallisto", "stringtie", "cufflinks", "star", "custom"],
-        help="The file format of the expression file to process. "
-        + "Use `custom` to process file formats not explicitly supported. "
-        + "The `custom` option requires the use of the --id-column and --expression-column arguments.",
-    )
-    parser.add_argument(
-        "--mode",
-        choices=["gene", "transcript"],
-        help="The type of expression data in the expression_file",
-    )
-    parser.add_argument(
-        "-i",
-        "--id-column",
-        help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format.",
-    )
-    parser.add_argument(
-        "-e",
-        "--expression-column",
-        help="The column header in the expression_file for the column containing expression data. Required when using the `custom` and `star` format.",
-    )
-    parser.add_argument(
-        "-s",
-        "--sample-name",
-        help="If the input_vcf contains multiple samples, the name of the sample to annotate.",
-    )
-    parser.add_argument(
-        "-o",
-        "--output-vcf",
-        help="Path to write the output VCF file. If not provided, the output VCF file will be "
-        + "written next to the input VCF file with a .tx.vcf or .gx.vcf file ending.",
-    )
-    parser.add_argument(
-        "--ignore-ensembl-id-version",
-        help='Assumes that the final period and number denotes the Ensembl ID version and ignores it (i.e. for "ENST00001234.3" - ignores the ".3").',
-        action="store_true",
-    )
-
-    return parser
-
-
-def args_check(args):
-    if args.format == "custom":
-        if args.id_column is None:
-            raise Exception(
-                "--id-column is not set. This is required when using the `custom` format."
+########################## Add fields to header ##########################
+def create_vcf_writer(path, vcf_reader, mode, rna_vcf, output_vcf):
+    (head, _, tail) = path.rpartition(".vcf")
+    new_header = vcf_reader.header.copy()
+    if mode == "gene":
+        new_header.add_format_line(
+            OrderedDict(
+                [
+                    ("ID", "GX"),
+                    ("Number", "."),
+                    ("Type", "Float"),
+                    ("Description", "Gene Expressions"),
+                ]
             )
-        if args.expression_column is None:
-            raise Exception(
-                "--expression-column is not set. This is required when using the `custom` format."
+        )
+        output_file = ("").join([head, ".gx.vcf", tail])
+    elif mode == "transcript":
+        new_header.add_format_line(
+            OrderedDict(
+                [
+                    ("ID", "TX"),
+                    ("Number", "."),
+                    ("Type", "Float"),
+                    ("Description", "Transcript Expressions"),
+                ]
             )
-    if args.format == "star":
-        if args.genecode is None:
-            raise Exception("--genecode is not set. This is required when using the `star` format")
+        )
+        output_file = ("").join([head, ".tx.vcf", tail])
+    if rna_vcf:
+        new_header.add_format_line(
+            OrderedDict(
+                [
+                    ("ID", "RAD"),
+                    ("Number", "A"),
+                    ("Type", "Integer"),
+                    (
+                        "Description",
+                        "Allelic depths from mRNA sequencing data of the same sample",
+                    ),
+                ]
+            )
+        )
+        new_header.add_format_line(
+            OrderedDict(
+                [
+                    ("ID", "ROT"),
+                    ("Number", "1"),
+                    ("Type", "Integer"),
+                    (
+                        "Description",
+                        "Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample",
+                    ),
+                ]
+            )
+        )
+        (head, _, tail) = output_file.rpartition(".vcf")
+        output_file = ("").join([head, ".pileup.vcf", tail])
+    if output_vcf:
+        output_file = output_vcf
+    return Writer.from_path(output_file, new_header)
 
 
+########################## Generate new vcf file ##########################
 def adding_extra_information(args):
-    (vcf_reader, rna_vcf_reader, is_multi_sample) = create_vcf_reader(args)
-    # Only get contigs from vcf input file
-    CONTIGS = [x.id for x in vcf_reader.header.get_lines("contig")]
+    dna_vcf, is_multi_sample = dna_vcf_parser(args.input_vcf, args.sample_name)
+    if args.rna_vcf:
+        rna_vcf = rna_vcf_parser(args.rna_vcf)
+    # CONTIGS = [x.id for x in dna_vcf.header.get_lines("contig")]
     format_pattern = re.compile("Format: (.*)")
     csq_format = (
-        format_pattern.search(vcf_reader.header.get_info_field_info("CSQ").description)
+        format_pattern.search(dna_vcf.header.get_info_field_info("CSQ").description)
         .group(1)
         .split("|")
     )
+    if args.ignore_ensembl_id_version:
+        no_version = True
+
+    vcf_writer = create_vcf_writer(
+        args.input_vcf, dna_vcf, args.mode, args.rna_vcf, args.output_vcf
+    )
+    exp_dict = expression_file_parser(
+        args.expression_file,
+        args.format,
+        args.mode,
+        args.id_column,
+        args.expression_column,
+        no_version,
+        args.genecode,
+    )
 
-    vcf_writer = create_vcf_writer(args, vcf_reader)
-    exp_dict = parse_expression_file(args, vcf_reader, vcf_writer)
     missing_expressions_count = 0
     entry_count = 0
-    for entry in vcf_reader:
+
+    for entry in dna_vcf:
         # Add expression data
         transcript_ids = set()
         genes = set()
@@ -478,37 +442,38 @@ def adding_extra_information(args):
         if args.mode == "gene":
             genes = list(genes)
             if len(genes) > 0:
-                (entry, missing_expressions_count, entry_count) = add_expressions(
+                (entry, missing_expressions_count, entry_count) = add_Expression_to_vcf(
                     entry,
                     is_multi_sample,
                     args.sample_name,
                     exp_dict,
                     genes,
                     "GX",
-                    args.ignore_ensembl_id_version,
+                    no_version,
                     missing_expressions_count,
                     entry_count,
                 )
         elif args.mode == "transcript":
             transcript_ids = list(transcript_ids)
             if len(transcript_ids) > 0:
-                (entry, missing_expressions_count, entry_count) = add_expressions(
+                (entry, missing_expressions_count, entry_count) = add_Expression_to_vcf(
                     entry,
                     is_multi_sample,
                     args.sample_name,
                     exp_dict,
                     transcript_ids,
                     "TX",
+                    no_version,
                     missing_expressions_count,
                     entry_count,
                 )
         # Add RAD and ROT
-        if rna_vcf_reader is not None:
-            if (entry.CHROM in CONTIGS) and (entry.is_snv()):
-                entry = add_AD_rna_file(entry, is_multi_sample, args.sample_name, rna_vcf_reader)
+        if rna_vcf is not None:
+            if entry.is_snv():
+                entry = add_AD_to_vcf(entry, is_multi_sample, args.sample_name, rna_vcf)
         vcf_writer.write_record(entry)
 
-    vcf_reader.close()
+    dna_vcf.close()
     vcf_writer.close()
 
     if missing_expressions_count > 0:
@@ -519,10 +484,10 @@ def adding_extra_information(args):
         )
 
 
+########################## MAIN ##########################
 def main(args_input=sys.argv[1:]):
     parser = define_parser()
     args = parser.parse_args(args_input)
-    args_check(args)
     adding_extra_information(args)
 
 
diff --git a/snappy_wrappers/wrappers/pvactools/combining/environment.yaml b/snappy_wrappers/wrappers/pvactools/combining/environment.yaml
index 2ea441ab8..47a97a8e3 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/environment.yaml
+++ b/snappy_wrappers/wrappers/pvactools/combining/environment.yaml
@@ -2,9 +2,9 @@ channels:
     - conda-forge
     - bioconda
 dependencies:
-    - vcfpy
-    - numpy
-    - pyranges
-    - pandas
-    - python
-    - bcftools
+    - vcfpy>=0.12.3
+    - numpy>=1.25.2, <1.26
+    - pyranges>=0.0.129, <0.1
+    - pandas>=2.1.1, <2.2
+    - python>=3.10.8, <3.11
+    - bcftools>=1.17, <1.18
diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
index 3d698637f..86b15643f 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
@@ -19,13 +19,24 @@
     "comb_rna.py",
 )
 ensemble_id = (
-    "--ignore-ensembl-id-version" if config["preparation"]["ignore-ensembl-id-version"] else ""
+    "--ignore-ensembl-id-version" if snakemake.params.args["ignore_ensembl_id_version"] else ""
 )
-id_column = (
-    "--id-column " + str(config["preparation"]["id-column"])
-    if config["preparation"]["format"] == "custom"
+max_depth = snakemake.params.args["max_depth"]
+format = snakemake.params.args["format"]
+mode = snakemake.params.args["mode"]
+expression_column = snakemake.params.args["expression_column"]
+id_column = snakemake.params.args["id_column"]
+
+rna_vcf = (
+    f"--rna-vcf $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz "
+    if format == "snappy_custom"
     else ""
 )
+expression_file = (
+    f"--expression-file {snakemake.input.expression} "
+    if format == "snappy_custom"
+    else config["preparation"]["expression_file"]
+)
 
 shell(
     r"""
@@ -53,32 +64,34 @@
 conda list > {snakemake.log.conda_list}
 conda info > {snakemake.log.conda_info}
 
-# Getting region of SNVs only
-bcftools filter -i 'TYPE="snp"' {snakemake.input.vcf} | bcftools query -f '%CHROM\t%POS0\t%END\n' > $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed
+if [[ {format}=="snappy_custom" ]]; then
+    # Getting region of SNVs only
+    bcftools filter -i 'TYPE="snp"' {snakemake.input.vcf} | bcftools query -f '%CHROM\t%POS0\t%END\n' > $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed
 
-# Getting snvs from RNA sequencing data
-bcftools mpileup -Ov \
-    --annotate FORMAT/AD,FORMAT/DP \
-    -R $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed \
-    -f {snakemake.config[static_data_config][reference][path]} \
-    --per-sample-mF \
-    --max-depth 4000 \
-    --redo-BAQ -Oz \
-    -o $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \
-    {snakemake.input.bam}
-tabix -f $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz
+    # Getting snvs from RNA sequencing data
+    bcftools mpileup -Ov \
+        --annotate FORMAT/AD,FORMAT/DP \
+        -R $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed \
+        -f {snakemake.config[static_data_config][reference][path]} \
+        --per-sample-mF \
+        --max-depth {max_depth} \
+        --redo-BAQ -Oz \
+        -o $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \
+        {snakemake.input.bam}
+    tabix -f $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz
+fi
 
 #Need to add option for RNA vcf as well
-python3 -W ignore {preparation_vcf} \
+python3 {preparation_vcf} \
     --genecode {snakemake.config[static_data_config][features][path]} \
-    --input_vcf {snakemake.input.vcf} --format {config[preparation][format]} \
-    --rna-vcf $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \
-    --expression_file {snakemake.input.expression} \
-    --mode {config[preparation][mode]} \
-    -e {config[preparation][expression-column]} \
+    --input-vcf {snakemake.input.vcf} --format {format} \
+    --mode {mode} \
+    -e {expression_column} \
+    -i {id_column} \
     -s {snakemake.wildcards.tumor_library} \
     -o /dev/stdout \
-    "{id_column}" \
+    {expression_file}\
+    {rna_vcf}\
     "{ensemble_id}" \
 | bgzip -c > {snakemake.output.vcf}
 tabix -f {snakemake.output.vcf}

From 23101e82ea3a462b560135eb73626da756c02c53 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Tue, 18 Jun 2024 22:12:48 +0200
Subject: [PATCH 11/23] Update test

---
 .../test_workflows_somatic_neoepitope_prediction.py        | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
index 960aff0e3..eebb03de0 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
@@ -64,13 +64,8 @@ def minimal_config():
             tools_somatic_variant_calling: []
             max_depth: "4000"
             preparation:
-                format: 'star'
-                path_features: ''
+                format: 'snappy_custom'
                 mode: 'gene'
-                id-column: ''
-                expression-column: 'fr'
-                ignore-ensembl-id-version: True
-
         data_sets:
           first_batch:
             file: sheet.tsv

From 0920803c1aa1b3414bb45d8359366dfc83f57374 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Mon, 1 Jul 2024 14:39:39 +0200
Subject: [PATCH 12/23] Adding plugins option to VEP

---
 .../somatic_variant_annotation/__init__.py          |  2 ++
 .../wrappers/pvactools/combining/comb_rna.py        |  4 ++--
 snappy_wrappers/wrappers/vep/run/wrapper.py         | 13 +++++++++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py
index 9b599887d..f37050337 100644
--- a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py
+++ b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py
@@ -144,6 +144,8 @@
       buffer_size: 1000
       output_options:
       - everything
+      plugins: []   #Two required plugins for running pvactool are Frameshift and Wildtype
+      plugins_dir: ""
 """
 
 
diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
index b8017b5d3..8b3658c14 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py
@@ -337,7 +337,7 @@ def create_vcf_writer(path, vcf_reader, mode, rna_vcf, output_vcf):
                 [
                     ("ID", "GX"),
                     ("Number", "."),
-                    ("Type", "Float"),
+                    ("Type", "String"),
                     ("Description", "Gene Expressions"),
                 ]
             )
@@ -349,7 +349,7 @@ def create_vcf_writer(path, vcf_reader, mode, rna_vcf, output_vcf):
                 [
                     ("ID", "TX"),
                     ("Number", "."),
-                    ("Type", "Float"),
+                    ("Type", "String"),
                     ("Description", "Transcript Expressions"),
                 ]
             )
diff --git a/snappy_wrappers/wrappers/vep/run/wrapper.py b/snappy_wrappers/wrappers/vep/run/wrapper.py
index 82c5f1534..eb7f4bd44 100644
--- a/snappy_wrappers/wrappers/vep/run/wrapper.py
+++ b/snappy_wrappers/wrappers/vep/run/wrapper.py
@@ -12,6 +12,15 @@
 vep_config = snakemake.config["step_config"][current_step]["vep"]
 pick_order = ",".join(vep_config["pick_order"])
 script_output_options = " ".join(["--" + x for x in vep_config["output_options"]])
+if vep_config["plugins"]:
+    plugins = " ".join(["--plugin " + x for x in vep_config["plugins"]])
+    if not vep_config["plugins_dir"]:
+        raise Exception("Please provide plugins directory if you want to use plugins")
+    else:
+        plugins_dir = "--dir_plugins " + vep_config["plugins_dir"]
+else:
+    plugins = ""
+    plugins_dir = ""
 
 full = snakemake.output.full if "full" in snakemake.output.keys() else ""
 
@@ -44,6 +53,8 @@
             echo --dir_cache {vep_config[cache_dir]}
         fi) \
         {script_output_options} \
+        {plugins} \
+        {plugins_dir} \
         --{vep_config[tx_flag]} \
         --fasta {snakemake.config[static_data_config][reference][path]} \
         --input_file {snakemake.input.vcf} --format vcf \
@@ -64,6 +75,8 @@
         echo --dir_cache {vep_config[cache_dir]}
     fi) \
     {script_output_options} \
+    {plugins} \
+    {plugins_dir} \
     --pick --pick_order {pick_order} \
     --{vep_config[tx_flag]} \
     --fasta {snakemake.config[static_data_config][reference][path]} \

From 0d95ea3652df8515894bab8bdf5b5090fb32b650 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Wed, 3 Jul 2024 16:02:29 +0200
Subject: [PATCH 13/23] Adding plugin options for vep

---
 snappy_pipeline/models/annotation.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/snappy_pipeline/models/annotation.py b/snappy_pipeline/models/annotation.py
index 468f0bf96..22d7d1a9c 100644
--- a/snappy_pipeline/models/annotation.py
+++ b/snappy_pipeline/models/annotation.py
@@ -35,4 +35,7 @@ class Vep(SnappyModel):
     ]
     num_threads: int = 8
     buffer_size: int = 1000
+    plugins: list[str] = []
+    """To use this option in VEP, you should download the plugin repository from the link https://github.com/Ensembl/VEP_plugins"""
+    plugins_dir: str = ""
     output_options: list[str] = ["everything"]

From 1d725626987a118b47922d9784ed2576dc1bd55c Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Wed, 3 Jul 2024 16:03:05 +0200
Subject: [PATCH 14/23] Reformat neoepitope prediction for new snappy version

---
 snappy_pipeline/workflow_model.py             |  2 +
 .../somatic_neoepitope_prediction/__init__.py | 63 ++++++-------------
 .../somatic_neoepitope_prediction/model.py    | 53 ++++++++++++++++
 3 files changed, 75 insertions(+), 43 deletions(-)
 create mode 100644 snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py

diff --git a/snappy_pipeline/workflow_model.py b/snappy_pipeline/workflow_model.py
index aeda7682e..24c04210e 100644
--- a/snappy_pipeline/workflow_model.py
+++ b/snappy_pipeline/workflow_model.py
@@ -36,6 +36,7 @@
 from snappy_pipeline.workflows.somatic_variant_filtration.model import SomaticVariantFiltration
 from snappy_pipeline.workflows.somatic_variant_signatures.model import SomaticVariantSignatures
 from snappy_pipeline.workflows.somatic_wgs_cnv_calling.model import SomaticWgsCnvCalling
+from snappy_pipeline.workflows.somatic_neoepitope_prediction.model import SomaticNeoepitopePrediction
 from snappy_pipeline.workflows.somatic_wgs_sv_calling.model import SomaticWgsSvCalling
 from snappy_pipeline.workflows.sv_calling_targeted.model import SvCallingTargeted
 from snappy_pipeline.workflows.sv_calling_wgs.model import SvCallingWgs
@@ -113,6 +114,7 @@ class StepConfig(TypedDict, total=False):
     somatic_gene_fusion_calling: SomaticGeneFusionCalling
     somatic_hla_loh_calling: SomaticHlaLohCalling
     somatic_msi_calling: SomaticMsiCalling
+    somatic_neoepitope_prediction: SomaticNeoepitopePrediction
     somatic_purity_ploidy_estimate: SomaticPurityPloidyEstimate
     somatic_targeted_seq_cnv_calling: SomaticTargetedSeqCnvCalling
     somatic_variant_annotation: SomaticVariantAnnotation
diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
index a252eb961..8bb322f27 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
@@ -46,6 +46,8 @@
     SOMATIC_VARIANT_CALLERS_MATCHED,
     SomaticVariantCallingWorkflow,
 )
+from .model import SomaticNeoepitopePrediction as SomaticNeoepitopePredictionConfigModel
+
 
 __author__ = "Pham Gia Cuong"
 __email__ = "pham.gia-cuong@bih-charite.de"
@@ -54,28 +56,7 @@
 EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5")
 
 #: Default configuration for the somatic_gene_fusion_calling step
-DEFAULT_CONFIG = r"""
-step_config:
-  somatic_neoepitope_prediction:
-    path_somatic_variant_annotation: ../somatic_variant_annotation  # REQUIRED
-    path_rna_ngs_mapping: ../ngs_mapping
-    tools_somatic_variant_annotation: ["vep"]
-    tools_rna_mapping: [] # deafult to those configured for ngs_mapping
-    tools_ngs_mapping: [] # deafult to those configured for ngs_mapping
-    tools_somatic_variant_calling: [] # deafult to those configured for somatic_variant_calling
-    preparation:
-      format: 'snappy_custom' # REQUIRED - The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom)
-      # Use `custom` to process file formats not explicitly supported.
-      # The `custom` option requires the use of the --id-column and --expression-column arguments.
-      expression_file: '' # expression file path. If the format is not snappy_custom
-      path_features: '' # Gencode file path, required for star and snappy format
-      mode: 'gene'  # REQUIRED - Determine whether the expression file contains gene or transcript TPM values.
-      full_vep_annotation: True # The somatic_variant_annotation has been done in fully annotation mode
-      id_column:  # Gene/Transcript id column name. Need when using the `custom` format.
-      expression_column:  # Expression column name. Need when using the `custom` format.
-      ignore_ensembl_id_version: True #Ignore the ensemble id version
-      max_depth: 4000
-"""
+DEFAULT_CONFIG = SomaticNeoepitopePredictionConfigModel.default_config_yaml_string()
 
 
 class NeoepitopePreparationStepPart(BaseStepPart):
@@ -183,7 +164,7 @@ def _get_log_file(self, action):
         for key, ext in key_ext:
             yield key, prefix + ext
 
-    def get_resource_usage(self, action):
+    def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage:
         self._validate_action(action)
         mem_mb = 6 * 1024  # 6GB
         return ResourceUsage(
@@ -225,41 +206,37 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
             config_lookup_paths,
             config_paths,
             workdir,
-            (
+            config_model_class=SomaticNeoepitopePredictionConfigModel,
+            previous_steps=(
                 SomaticVariantCallingWorkflow,
                 SomaticVariantAnnotationWorkflow,
                 NgsMappingWorkflow,
             ),
         )
         # Register sub step classes so the sub steps are available
+        config = self.config
         self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart))
         self.register_sub_workflow(
             "somatic_variant_annotation",
-            self.config["path_somatic_variant_annotation"],
+            config["path_somatic_variant_annotation"],
         )
         self.register_sub_workflow(
             "ngs_mapping",
-            self.config["path_rna_ngs_mapping"],
+            config["path_rna_ngs_mapping"],
         )
         # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
-        if not self.config["tools_ngs_mapping"]:
-            self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][
-                "dna"
-            ]
-        if not self.config["tools_somatic_variant_calling"]:
-            self.config["tools_somatic_variant_calling"] = self.w_config["step_config"][
+        if not config.tools_ngs_mapping:
+            config.tools_ngs_mapping = self.w_config["step_config"]["ngs_mapping"].tools.dna
+        if not config.tools_somatic_variant_calling:
+            config.tools_somatic_variant_calling = self.w_config["step_config"][
                 "somatic_variant_calling"
-            ]["tools"]
-        if not self.config["tools_somatic_variant_annotation"]:
-            self.config["tools_somatic_variant_annotation"] = ["vep"]
-        if not self.config["tools_rna_mapping"]:
-            self.config["tools_rna_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][
-                "rna"
-            ]
-        if not self.config["preparation"]["path_features"]:
-            self.config["preparation"]["path_features"] = self.w_config["static_data_config"][
-                "features"
-            ]["path"]
+            ].tools
+        if not config.tools_somatic_variant_annotation:
+            config.tools_somatic_variant_annotation = ["vep"]
+        if not config.tools_rna_mapping:
+            config.tools_rna_mapping = self.w_config["step_config"]["ngs_mapping"].tools.rna
+        if not config.preparation.path_features:
+            config.preparation.path_features = self.w_config["static_data_config"].features.path
 
     @listify
     def get_result_files(self):
diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
new file mode 100644
index 000000000..61b2ce8f1
--- /dev/null
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
@@ -0,0 +1,53 @@
+from typing import Annotated
+
+from pydantic import Field
+
+from snappy_pipeline.models import SnappyModel, SnappyStepModel
+
+# class Format:
+#     stringtie = "stringtie"
+#     snappy_custom = "snappy_custom"
+#     cufflinks = "cufflinks"
+#     kallisto = "kallisto"
+#     custom = "custom"
+
+
+class Preparation(SnappyModel):
+    format: Annotated[
+        str, Field(examples=["stringtie", "snappy_custom", "cufflinks", "kallisto", "custom"])
+    ] = "snappy_custom"
+    # Literal["stringtie","snappy_custom","cufflinks","kallisto","custom"]
+    """The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom)
+    Use `custom` to process file formats not explicitly supported.
+    The `custom` option requires the use of the --id-column and --expression-column arguments."""
+    path_features: str = ""
+    """Gencode file path, required for star and snappy format"""
+    mode: Annotated[str, Field(examples=["gene", "transcript"])] = "gene"
+    """Determine whether the expression file contains gene or transcript TPM values."""
+    full_vep_annotation: bool = True
+    """The somatic_variant_annotation has been done in fully annotation mode"""
+    id_column: str = ""
+    """Gene/Transcript id column name. Need when using the `custom` format."""
+    expression_column: str = ""
+    """Expression column name. Need when using the `custom` format."""
+    ignore_ensembl_id_version: bool = True
+    """Ignore the ensemble id version"""
+    max_depth: int = 4000
+
+
+class SomaticNeoepitopePrediction(SnappyStepModel):
+    path_somatic_variant_annotation: Annotated[
+        str, Field(examples=["../somatic_variant_annotation"])
+    ]
+    path_rna_ngs_mapping: Annotated[str, Field(examples=["../ngs_mapping"])]
+    tools_somatic_variant_annotation: list[str] = ["vep"]
+    tools_ngs_mapping: list[str] = []
+    tools_somatic_variant_calling: list[str] = []
+    tools_rna_mapping: list[str] = []
+    tools_rna_mapping: list[str] = []
+    """Deafult to those configured for ngs_mapping"""
+    tools_ngs_mapping: list[str] = []
+    """Deafult to those configured for ngs_mapping"""
+    tools_somatic_variant_calling: list[str] = []
+    """Deafult to those configured for somatic_variant_calling"""
+    preparation: Preparation | None = None

From 6d9257c3b18fce6376fa8a9e6b03d426e64cf63e Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Wed, 3 Jul 2024 16:07:40 +0200
Subject: [PATCH 15/23] Satisfies lint

---
 snappy_pipeline/workflow_model.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/snappy_pipeline/workflow_model.py b/snappy_pipeline/workflow_model.py
index 24c04210e..1b1a7fefd 100644
--- a/snappy_pipeline/workflow_model.py
+++ b/snappy_pipeline/workflow_model.py
@@ -36,7 +36,9 @@
 from snappy_pipeline.workflows.somatic_variant_filtration.model import SomaticVariantFiltration
 from snappy_pipeline.workflows.somatic_variant_signatures.model import SomaticVariantSignatures
 from snappy_pipeline.workflows.somatic_wgs_cnv_calling.model import SomaticWgsCnvCalling
-from snappy_pipeline.workflows.somatic_neoepitope_prediction.model import SomaticNeoepitopePrediction
+from snappy_pipeline.workflows.somatic_neoepitope_prediction.model import (
+    SomaticNeoepitopePrediction,
+)
 from snappy_pipeline.workflows.somatic_wgs_sv_calling.model import SomaticWgsSvCalling
 from snappy_pipeline.workflows.sv_calling_targeted.model import SvCallingTargeted
 from snappy_pipeline.workflows.sv_calling_wgs.model import SvCallingWgs

From 999c6ea591a7402ec875f88c91bd23de9b93f486 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Fri, 5 Jul 2024 10:51:42 +0200
Subject: [PATCH 16/23] Adding test for somatic neoepitope prediction
 preparation substep

---
 .../somatic_neoepitope_prediction/model.py    | 27 +++++++++----------
 .../wrappers/pvactools/combining/wrapper.py   | 11 +++++---
 ...workflows_somatic_neoepitope_prediction.py | 20 +++++++-------
 3 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
index 61b2ce8f1..3ccf5080e 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
@@ -1,25 +1,24 @@
+import enum
 from typing import Annotated
 
 from pydantic import Field
 
-from snappy_pipeline.models import SnappyModel, SnappyStepModel
+from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel
 
-# class Format:
-#     stringtie = "stringtie"
-#     snappy_custom = "snappy_custom"
-#     cufflinks = "cufflinks"
-#     kallisto = "kallisto"
-#     custom = "custom"
+
+class Format(enum.StrEnum):
+    stringtie = "stringtie"
+    snappy_custom = "snappy_custom"
+    cufflinks = "cufflinks"
+    kallisto = "kallisto"
+    custom = "custom"
 
 
 class Preparation(SnappyModel):
-    format: Annotated[
-        str, Field(examples=["stringtie", "snappy_custom", "cufflinks", "kallisto", "custom"])
-    ] = "snappy_custom"
-    # Literal["stringtie","snappy_custom","cufflinks","kallisto","custom"]
-    """The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom)
-    Use `custom` to process file formats not explicitly supported.
-    The `custom` option requires the use of the --id-column and --expression-column arguments."""
+    format: Annotated[Format, EnumField(Format)] = Format.snappy_custom
+    "The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom)"
+    "Use `custom` to process file formats not explicitly supported."
+    "The `custom` option requires the use of the --id-column and --expression-column arguments."
     path_features: str = ""
     """Gencode file path, required for star and snappy format"""
     mode: Annotated[str, Field(examples=["gene", "transcript"])] = "gene"
diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
index 86b15643f..6e178ee51 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
@@ -24,8 +24,11 @@
 max_depth = snakemake.params.args["max_depth"]
 format = snakemake.params.args["format"]
 mode = snakemake.params.args["mode"]
-expression_column = snakemake.params.args["expression_column"]
-id_column = snakemake.params.args["id_column"]
+expression_column = (
+    f"--expression-file {snakemake.params.args['expression_column']}" if format == "custom" else ""
+)
+
+id_column = f"--expression-file {snakemake.params.args['id_column']}" if format == "custom" else ""
 
 rna_vcf = (
     f"--rna-vcf $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz "
@@ -86,8 +89,8 @@
     --genecode {snakemake.config[static_data_config][features][path]} \
     --input-vcf {snakemake.input.vcf} --format {format} \
     --mode {mode} \
-    -e {expression_column} \
-    -i {id_column} \
+    {expression_column} \
+    {id_column} \
     -s {snakemake.wildcards.tumor_library} \
     -o /dev/stdout \
     {expression_file}\
diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
index eebb03de0..057308663 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
@@ -38,8 +38,7 @@ def minimal_config():
               dna: ['bwa']
               rna: ['star']
             star:
-              path_index: /path/to/index
-              path_features:
+              path_index: /path/to/star/index
             bwa:
               path_index: /path/to/bwa/index.fa
 
@@ -47,22 +46,20 @@ def minimal_config():
             tools:
             - mutect2
             - scalpel
+            mutect2: {}
             scalpel:
               path_target_regions: /path/to/target/regions.bed
 
           somatic_variant_annotation:
+            path_somatic_variant_calling: ../somatic_variant_calling
             tools: ["vep"]
             vep:
-              path_dir_cache: /path/to/dir/cache
+              cache_dir: /path/to/dir/cache
 
           somatic_neoepitope_prediction:
             path_somatic_variant_annotation: ../somatic_variant_annotation
             path_rna_ngs_mapping: ../ngs_mapping
-            tools_somatic_variant_annotation: []
-            tools_rna_mapping: []
-            tools_ngs_mapping: []
-            tools_somatic_variant_calling: []
-            max_depth: "4000"
+            tools_somatic_variant_annotation: ["vep"]
             preparation:
                 format: 'snappy_custom'
                 mode: 'gene'
@@ -87,12 +84,15 @@ def somatic_neoepitope_prediction_workflow(
     work_dir,
     config_paths,
     cancer_sheet_fake_fs,
+    aligner_indices_fake_fs,
     mocker,
 ):
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
     patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker)
+    patch_module_fs("snappy_pipeline.workflows.ngs_mapping.model", aligner_indices_fake_fs, mocker)
     # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we
     # can obtain paths from the function as if we really had a NGSMappingPipelineStep there
+
     dummy_workflow.globals = {
         "ngs_mapping": lambda x: "NGS_MAPPING/" + x,
         "somatic_variant_annotation": lambda x: "SOMATIC_VARIANT_ANNOTATION/" + x,
@@ -149,7 +149,7 @@ def test_neoepitope_preparation_step_part_get_log_files(somatic_neoepitope_predi
     assert actual == expected
 
 
-def test_neoepitope_preparation_step_part_get_resource_usage(
+def test_neoepitope_preparation_step_part_get_resource(
     somatic_neoepitope_prediction_workflow,
 ):
     # Define expected
@@ -159,7 +159,7 @@ def test_neoepitope_preparation_step_part_get_resource_usage(
         msg_error = f"Assertion error for resource '{resource}'."
         actual = somatic_neoepitope_prediction_workflow.get_resource(
             "neoepitope_preparation", "run", resource
-        )
+        )()
         assert actual == expected, msg_error
 
 

From a140e28ea4cb5b581f5fb109e4378f268c50177a Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Thu, 11 Jul 2024 10:29:47 +0200
Subject: [PATCH 17/23] Preparation for pvactool

---
 .../somatic_neoepitope_prediction/Snakefile   |  18 +--
 .../somatic_neoepitope_prediction/__init__.py | 106 ++++++++++++------
 .../somatic_neoepitope_prediction/model.py    |   2 +-
 .../wrappers/pvactools/combining/wrapper.py   |   7 +-
 4 files changed, 85 insertions(+), 48 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
index 326510b4e..de398ddc7 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
@@ -48,18 +48,18 @@ rule neoepitope_preparation_link_out_run:
 
 rule neoepitope_preparation:
     input:
-        **wf.get_input_files("neoepitope_preparation", "run"),
+        unpack(wf.get_input_files("pvacseq","prepare")),
     output:
-        **wf.get_output_files("neoepitope_preparation", "run"),
+        **wf.get_output_files("pvacseq","prepare"),
     log:
-        **wf.get_log_file("neoepitope_preparation", "run"),
-    threads: wf.get_resource("neoepitope_preparation", "run", "threads")
+        **wf.get_log_file("pvacseq","prepare"),
+    threads: wf.get_resource("pvacseq","prepare", "threads")
     resources:
-        time=wf.get_resource("neoepitope_preparation", "run", "time"),
-        memory=wf.get_resource("neoepitope_preparation", "run", "memory"),
-        partition=wf.get_resource("neoepitope_preparation", "run", "partition"),
-        tmpdir=wf.get_resource("neoepitope_preparation", "run", "tmpdir"),
+        time=wf.get_resource("pvacseq","prepare", "time"),
+        memory=wf.get_resource("pvacseq","prepare", "memory"),
+        partition=wf.get_resource("pvacseq","prepare", "partition"),
+        tmpdir=wf.get_resource("pvacseq","prepare", "tmpdir"),
     params:
-        **{"args": wf.get_params("neoepitope_preparation", "run")},
+        **{"args": wf.get_params("pvacseq","prepare")},
     wrapper:
         wf.wrapper_path("pvactools/combining")
diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
index 8bb322f27..cfb3bb82c 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
@@ -59,14 +59,14 @@
 DEFAULT_CONFIG = SomaticNeoepitopePredictionConfigModel.default_config_yaml_string()
 
 
-class NeoepitopePreparationStepPart(BaseStepPart):
+class PvacSeqStepPart(BaseStepPart):
     """
     Preparation VCF file for pvactool
     """
 
     #: Step name
-    name = "neoepitope_preparation"
-    actions = ("run",)
+    name = "pvacseq"
+    actions = ("prepare","predict")
 
     def __init__(self, parent):
         super().__init__(parent)
@@ -82,44 +82,74 @@ def __init__(self, parent):
             for donor in sheet.donors:
                 self.donors[donor.name] = donor
 
-    @dictify
     def get_input_files(self, action):
+        self._validate_action(action)
+        return self._get_input_files_prepare
+
+    # @dictify
+    # def _get_input_files_predict(self,wildcards):
+    #     prepare_tpl = (
+    #         "work/prepare/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}/out/"
+    #         "{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.vcf.gz"
+    #     )
+    #     yield "combine_vcf",prepare_tpl
+    #     hla_typing = self.parent.sub_workflows["hla_typing"]
+    #     hla_tpl = {
+    #         "output/{hla_tool}.{ngs_library}/out/{hla_tool}.{ngs_library}.txt"
+    #     }
+    #     for sheet in filter(is_not_background, self.parent.shortcut_sheets):
+    #         for sample_pair in sheet.all_sample_pairs:
+    #             if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library:
+    #                 yield "hla_normal_dna", hla_typing((hla_tpl).format(
+    #                                     hla_tool=self.w_config.step_config["hla_typing"].tools[0],
+    #                                     ngs_library=sample_pair.normal_sample.dna_ngs_library.name,)
+    #                                     )
+    #                 yield "hla_tumor_dna", hla_typing((hla_tpl).format(
+    #                                     hla_tool=self.w_config.step_config["hla_typing"].tools[0],
+    #                                     ngs_library=sample_pair.tumor_sample.dna_ngs_library.name,)
+    #                                     )
+    #                 yield "hla_tumor_rna", hla_typing((hla_tpl).format(
+    #                                     hla_tool=self.w_config.step_config["hla_typing"].tools[0],
+    #                                     ngs_library=sample_pair.tumor_sample.rna_ngs_library.name,)
+    #                                     )
+    #     pass
+
+    @dictify
+    def _get_input_files_prepare(self, wildcards):
         """Return path to somatic variant annotated file"""
         # It only works with vep now.
         # Validate action
-        self._validate_action(action)
+        # self._validate_action(action)
         tpl = (
             "output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/"
             "{mapper}.{var_caller}.{anno_caller}.{tumor_library}"
         )
         if self.config["preparation"]["full_vep_annotation"]:
-            key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"}
+            vep_key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"}
         else:
-            key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"}
+            vep_key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"}
         variant_annotation = self.parent.sub_workflows["somatic_variant_annotation"]
-        ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
-        for key, ext in key_ext.items():
-            yield key, variant_annotation(tpl + ext)
-        # Getting appropriate rna library
         if self.config["preparation"]["format"] == "snappy_custom":
-            for sheet in filter(is_not_background, self.parent.sheets):
-                for donor in sheet.bio_entities.values():
-                    for biosample in donor.bio_samples.values():
-                        if biosample.extra_infos["isTumor"]:
-                            for test_sample in biosample.test_samples.values():
-                                if test_sample.extra_infos["extractionType"] == "RNA":
-                                    for lib in test_sample.ngs_libraries.values():
-                                        rna_library = lib.name
-                                        rna_tpl = (
-                                            "output/{mapper}.{library_name}/out/{mapper}.{library_name}"
-                                        ).format(
-                                            mapper="star",
-                                            library_name=rna_library,
+            for key,ext in vep_key_ext.items():
+                yield key,variant_annotation(tpl+ext)
+
+        ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
+
+        star_key_ext = {"expression": ".GeneCounts.tab",
+                        "bam": ".bam"}
+        for sheet in filter(is_not_background, self.parent.shortcut_sheets):
+            for sample_pair in sheet.all_sample_pairs:
+                if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library:
+                    rna_tpl = (
+                        "output/{mapper}.{rna_library}/out/{mapper}.{rna_library}"
+                    )
+                    for key,ext in star_key_ext.items():
+                        yield key, ngs_mapping((rna_tpl + ext).format(
+                                        mapper=self.w_config.step_config["ngs_mapping"].tools.rna[0],
+                                        rna_library=sample_pair.tumor_sample.rna_ngs_library.name,)
                                         )
-                                        ext = {"expression", "bam", "bai"}
-                                        yield "expression", ngs_mapping(rna_tpl + ".GeneCounts.tab")
-                                        yield "bam", ngs_mapping(rna_tpl + ".bam")
-                                        yield "bai", ngs_mapping(rna_tpl + ".bam.bai")
+
+                    
 
     @dictify
     def get_output_files(self, action):
@@ -128,12 +158,12 @@ def get_output_files(self, action):
         self._validate_action(action)
         if self.config["preparation"]["mode"] == "gene":
             prefix = (
-                "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/"
+                "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/"
                 "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
             )
         elif self.config["preparation"]["mode"] == "transcript":
             prefix = (
-                "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/out/"
+                "work/prepare/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/out/"
                 "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}"
             )
         key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"}
@@ -148,12 +178,12 @@ def _get_log_file(self, action):
         self._validate_action(action)
         if self.config["preparation"]["mode"] == "gene":
             prefix = (
-                "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/"
+                "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/"
                 "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
             )
         elif self.config["preparation"]["mode"] == "transcript":
             prefix = (
-                "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/"
+                "work/preprare/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/"
                 "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}"
             )
         key_ext = (
@@ -175,7 +205,10 @@ def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage:
 
     def get_params(self, action):
         self._validate_action(action)
-        return {
+        return getattr(self,"_get_params_run")
+    
+    def _get_params_run(self,wildcards):
+        d = {
             "max_depth": self.config["preparation"]["max_depth"],
             "format": self.config["preparation"]["format"],
             "expression_column": self.config["preparation"]["expression_column"],
@@ -183,6 +216,7 @@ def get_params(self, action):
             "ignore_ensembl_id_version": self.config["preparation"]["ignore_ensembl_id_version"],
             "mode": self.config["preparation"]["mode"],
         }
+        return d
 
 
 class SomaticNeoepitopePredictionWorkflow(BaseStep):
@@ -215,7 +249,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
         )
         # Register sub step classes so the sub steps are available
         config = self.config
-        self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart))
+        self.register_sub_step_classes((PvacSeqStepPart, LinkOutStepPart))
         self.register_sub_workflow(
             "somatic_variant_annotation",
             config["path_somatic_variant_annotation"],
@@ -247,14 +281,14 @@ def get_result_files(self):
         elif self.config["preparation"]["mode"] == "transcript":
             name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}"
         yield from self._yield_result_files_matched(
-            os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
+            os.path.join("output/prepare", name_pattern, "out", name_pattern + "{ext}"),
             mapper=self.config["tools_ngs_mapping"],
             var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
             anno_caller=anno_callers,
             ext=EXT_VALUES,
         )
         yield from self._yield_result_files_matched(
-            os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
+            os.path.join("output/prepare", name_pattern, "log", name_pattern + "{ext}"),
             mapper=self.config["tools_ngs_mapping"],
             var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
             anno_caller=anno_callers,
diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
index 3ccf5080e..7abe6815f 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
@@ -38,12 +38,12 @@ class SomaticNeoepitopePrediction(SnappyStepModel):
     path_somatic_variant_annotation: Annotated[
         str, Field(examples=["../somatic_variant_annotation"])
     ]
+    path_hla_typing : Annotated[str, Field(examples=["../hla_typing"])]
     path_rna_ngs_mapping: Annotated[str, Field(examples=["../ngs_mapping"])]
     tools_somatic_variant_annotation: list[str] = ["vep"]
     tools_ngs_mapping: list[str] = []
     tools_somatic_variant_calling: list[str] = []
     tools_rna_mapping: list[str] = []
-    tools_rna_mapping: list[str] = []
     """Deafult to those configured for ngs_mapping"""
     tools_ngs_mapping: list[str] = []
     """Deafult to those configured for ngs_mapping"""
diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
index 6e178ee51..f5a48fdea 100644
--- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
+++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py
@@ -30,13 +30,16 @@
 
 id_column = f"--expression-file {snakemake.params.args['id_column']}" if format == "custom" else ""
 
+bam_file = snakemake.input.bam
+gencount_file = snakemake.input.expression
+
 rna_vcf = (
     f"--rna-vcf $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz "
     if format == "snappy_custom"
     else ""
 )
 expression_file = (
-    f"--expression-file {snakemake.input.expression} "
+    f"--expression-file {gencount_file} "
     if format == "snappy_custom"
     else config["preparation"]["expression_file"]
 )
@@ -80,7 +83,7 @@
         --max-depth {max_depth} \
         --redo-BAQ -Oz \
         -o $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \
-        {snakemake.input.bam}
+        {bam_file}
     tabix -f $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz
 fi
 

From 03bd9e429272ad258a2aa88f1edd7886baabd1a9 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Thu, 11 Jul 2024 15:52:07 +0200
Subject: [PATCH 18/23] Fix HLA_typing pipeline

---
 .../workflows/hla_typing/Snakefile            | 86 +++++++++----------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/snappy_pipeline/workflows/hla_typing/Snakefile b/snappy_pipeline/workflows/hla_typing/Snakefile
index 0f627c46d..5b07da66d 100644
--- a/snappy_pipeline/workflows/hla_typing/Snakefile
+++ b/snappy_pipeline/workflows/hla_typing/Snakefile
@@ -53,7 +53,6 @@ rule hla_typing_link_in_run:
 
 # Generic linking out ---------------------------------------------------------
 
-
 rule hla_typing_link_out_run:
     input:
         wf.get_input_files("link_out", "run"),
@@ -67,51 +66,52 @@ rule hla_typing_link_out_run:
 
 # OptiType --------------------------------------------------------------------
 
-
-rule hla_typing_optitype_run:
-    input:
-        **wf.get_input_files("optitype", "run"),
-    output:
-        **wf.get_output_files("optitype", "run"),
-    threads: wf.get_resource("optitype", "run", "threads")
-    resources:
-        time=wf.get_resource("optitype", "run", "time"),
-        memory=wf.get_resource("optitype", "run", "memory"),
-        partition=wf.get_resource("optitype", "run", "partition"),
-        tmpdir=wf.get_resource("optitype", "run", "tmpdir"),
-    log:
-        **wf.get_log_file("optitype", "run"),
-    params:
-        args=wf.substep_dispatch("optitype", "get_args", "run"),
-    wrapper:
-        wf.wrapper_path("optitype")
+if "optitype" in wf.w_config.step_config["hla_typing"].tools:
+    rule hla_typing_optitype_run:
+        input:
+            **wf.get_input_files("optitype", "run"),
+        output:
+            **wf.get_output_files("optitype", "run"),
+        threads: wf.get_resource("optitype", "run", "threads")
+        resources:
+            time=wf.get_resource("optitype", "run", "time"),
+            memory=wf.get_resource("optitype", "run", "memory"),
+            partition=wf.get_resource("optitype", "run", "partition"),
+            tmpdir=wf.get_resource("optitype", "run", "tmpdir"),
+        log:
+            **wf.get_log_file("optitype", "run"),
+        params:
+            args=wf.substep_dispatch("optitype", "get_args", "run"),
+        wrapper:
+            wf.wrapper_path("optitype")
 
 
 # arcasHLA --------------------------------------------------------------------
 
 
 # NB: reference is updated in the installed package
-rule hla_typing_arcashla_prepare_reference:
-    output:
-        touch("work/arcashla.prepare_reference/out/.done"),
-    log:
-        "work/arcashla.prepare_reference/log/arcashla.prepare_reference.log",
-    wrapper:
-        wf.wrapper_path("arcashla/prepare_reference")
-
-
-rule hla_typing_arcashla_run:
-    input:
-        unpack(wf.get_input_files("arcashla", "run")),
-    output:
-        **wf.get_output_files("arcashla", "run"),
-    threads: wf.get_resource("arcashla", "run", "threads")
-    resources:
-        time=wf.get_resource("arcashla", "run", "time"),
-        memory=wf.get_resource("arcashla", "run", "memory"),
-        partition=wf.get_resource("arcashla", "run", "partition"),
-        tmpdir=wf.get_resource("arcashla", "run", "tmpdir"),
-    log:
-        wf.get_log_file("arcashla", "run"),
-    wrapper:
-        wf.wrapper_path("arcashla/run")
+if "arcashla" in wf.w_config.step_config["hla_typing"].tools:
+    rule hla_typing_arcashla_prepare_reference:
+        output:
+            touch("work/arcashla.prepare_reference/out/.done"),
+        log:
+            "work/arcashla.prepare_reference/log/arcashla.prepare_reference.log",
+        wrapper:
+            wf.wrapper_path("arcashla/prepare_reference")
+
+
+    rule hla_typing_arcashla_run:
+        input:
+            unpack(wf.get_input_files("arcashla", "run")),
+        output:
+            **wf.get_output_files("arcashla", "run"),
+        threads: wf.get_resource("arcashla", "run", "threads")
+        resources:
+            time=wf.get_resource("arcashla", "run", "time"),
+            memory=wf.get_resource("arcashla", "run", "memory"),
+            partition=wf.get_resource("arcashla", "run", "partition"),
+            tmpdir=wf.get_resource("arcashla", "run", "tmpdir"),
+        log:
+            wf.get_log_file("arcashla", "run"),
+        wrapper:
+            wf.wrapper_path("arcashla/run")

From d43a11b69f3af7378401a28bb390014cb6f6969a Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Mon, 15 Jul 2024 14:02:15 +0200
Subject: [PATCH 19/23] Adding test for neoepitope prediction

---
 ...workflows_somatic_neoepitope_prediction.py | 212 +++++++++++++-----
 1 file changed, 157 insertions(+), 55 deletions(-)

diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
index 057308663..9eb3cc435 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py
@@ -5,6 +5,7 @@
 
 import pytest
 import ruamel.yaml as ruamel_yaml
+from snakemake.io import Wildcards
 
 from snappy_pipeline.workflows.somatic_neoepitope_prediction import (
     SomaticNeoepitopePredictionWorkflow,
@@ -50,19 +51,38 @@ def minimal_config():
             scalpel:
               path_target_regions: /path/to/target/regions.bed
 
+          hla_typing:
+            path_link_in: ''  # OPTIONAL Override data set configuration search paths for FASTQ files
+            tools: [optitype]   # REQUIRED - available: 'optitype' and 'arcashla'
+            optitype:
+                max_reads: 5000
+                num_mapping_threads: 4
+
           somatic_variant_annotation:
             path_somatic_variant_calling: ../somatic_variant_calling
             tools: ["vep"]
             vep:
-              cache_dir: /path/to/dir/cache
-
+                cache_dir: /path/to/dir/cache
+                species: homo_sapiens_merged
+                assembly: GRCh38
+                cache_version: '102'
+                tx_flag: gencode_basic  
+                num_threads: 8
+                buffer_size: 500
+                output_options:
+                - everything
           somatic_neoepitope_prediction:
-            path_somatic_variant_annotation: ../somatic_variant_annotation
+            path_somatic_variant_annotation: ../somatic_variant_annotation  # REQUIRED
+            path_container: ../somatic_neoepitope_prediction/work/containers/out/pvactools.simg
             path_rna_ngs_mapping: ../ngs_mapping
-            tools_somatic_variant_annotation: ["vep"]
+            path_hla_typing: ../hla_typing
+            tools_somatic_variant_annotation: [vep]
             preparation:
-                format: 'snappy_custom'
-                mode: 'gene'
+                format: snappy_custom    # REQUIRED - The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom)
+                mode: gene   # REQUIRED - Determine whether the expression file contains gene or transcript TPM values.
+            prediction:
+                algorithms: ['MHCflurry','MHCnuggetsI']
+                
         data_sets:
           first_batch:
             file: sheet.tsv
@@ -96,6 +116,7 @@ def somatic_neoepitope_prediction_workflow(
     dummy_workflow.globals = {
         "ngs_mapping": lambda x: "NGS_MAPPING/" + x,
         "somatic_variant_annotation": lambda x: "SOMATIC_VARIANT_ANNOTATION/" + x,
+        "hla_typing": lambda x: "HLA_TYPING/" + x,
     }
     # Construct the workflow object
     return SomaticNeoepitopePredictionWorkflow(
@@ -107,58 +128,146 @@ def somatic_neoepitope_prediction_workflow(
     )
 
 
-def test_neoepitope_preparation_step_part_get_input_files(somatic_neoepitope_prediction_workflow):
+def test_somatic_neoepitope_preparation_step_part_get_input_files(
+    somatic_neoepitope_prediction_workflow,
+):
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
+            "var_caller": "mutect2",
+            "anno_caller": "vep",
+            "tumor_library": "P001-T1-DNA1-WGS1",
+        }
+    )
     # Define expected
     vcf_base_out = (
         "SOMATIC_VARIANT_ANNOTATION/output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/"
         "{mapper}.{var_caller}.{anno_caller}.{tumor_library}"
     )
-    ngs_mapping_base_out = "NGS_MAPPING/output/star.P002-T2-RNA1-mRNA_seq1/out/"
+    # vcf_base_out = (
+    #     "SOMATIC_VARIANT_ANNOTATION/output/bwa.mutect2.vep.P001-T1-DNA1-WES1/out/"
+    #     "{mapper}.{var_caller}.{anno_caller}.{tumor_library}"
+    # )
+    ngs_mapping_base_out = "NGS_MAPPING/output/star.P001-T1-RNA1-mRNA_seq1/out/"
     expected = {
         "vcf": vcf_base_out + ".full.vcf.gz",
         "vcf_tbi": vcf_base_out + ".full.vcf.gz.tbi",
-        "expression": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.GeneCounts.tab",
-        "bam": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.bam",
-        "bai": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.bam.bai",
+        "expression": ngs_mapping_base_out + "star.P001-T1-RNA1-mRNA_seq1.GeneCounts.tab",
+        "bam": ngs_mapping_base_out + "star.P001-T1-RNA1-mRNA_seq1.bam",
+    }
+
+    # Get actual
+    actual = somatic_neoepitope_prediction_workflow.get_input_files("pvacseq", "prepare")(wildcards)
+    assert actual == expected
+
+
+def test_somatic_neoepitope_prediction_step_part_get_input_files(
+    somatic_neoepitope_prediction_workflow,
+):
+    # Define expected
+    wildcards = Wildcards(
+        fromdict={
+            "mapper": "bwa",
+            "var_caller": "mutect2",
+            "anno_caller": "vep",
+            "tumor_library": "P001-T1-DNA1-WGS1",
+        }
+    )
+    vcf_base_out = (
+        "work/prepare/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}/out/"
+        "{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}"
+    )
+    hla_typing_base_out = "HLA_TYPING/output/"
+    expected = {
+        "container": "work/containers/out/pvactools.simg",
+        "combine_vcf": vcf_base_out + ".vcf.gz",
+        "hla_normal_dna": hla_typing_base_out
+        + "optitype.P001-N1-DNA1-WGS1/out/optitype.P001-N1-DNA1-WGS1.txt",
+        "hla_tumor_dna": hla_typing_base_out
+        + "optitype.P001-T1-DNA1-WGS1/out/optitype.P001-T1-DNA1-WGS1.txt",
+        "hla_tumor_rna": hla_typing_base_out
+        + "optitype.P001-T1-RNA1-mRNA_seq1/out/optitype.P001-T1-RNA1-mRNA_seq1.txt",
     }
 
     # Get actual
-    actual = somatic_neoepitope_prediction_workflow.get_input_files("neoepitope_preparation", "run")
+    actual = somatic_neoepitope_prediction_workflow.get_input_files("pvacseq", "predict")(wildcards)
     assert actual == expected
 
 
-def test_neoepitope_preparation_step_part_get_output_files(somatic_neoepitope_prediction_workflow):
+def test_somatic_neoepitope_preparation_step_part_get_output_files(
+    somatic_neoepitope_prediction_workflow,
+):
     base_out = (
-        "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/"
+        "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/"
         "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
     )
     expected = get_expected_output_vcf_files_dict(base_out)
-    actual = somatic_neoepitope_prediction_workflow.get_output_files(
-        "neoepitope_preparation", "run"
-    )
+    actual = somatic_neoepitope_prediction_workflow.get_output_files("pvacseq", "prepare")
     assert actual == expected
 
 
-def test_neoepitope_preparation_step_part_get_log_files(somatic_neoepitope_prediction_workflow):
+def test_somatic_neoepitope_preparation_step_part_get_log_files(
+    somatic_neoepitope_prediction_workflow,
+):
     base_out = (
-        "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/"
+        "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/"
         "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
     )
     expected = get_expected_log_files_dict(base_out=base_out)
-    actual = somatic_neoepitope_prediction_workflow.get_log_file("neoepitope_preparation", "run")
+    actual = somatic_neoepitope_prediction_workflow.get_log_file("pvacseq", "prepare")
     assert actual == expected
 
 
-def test_neoepitope_preparation_step_part_get_resource(
+def test_somatic_neoepitope_preparation_step_part_get_resource(
     somatic_neoepitope_prediction_workflow,
 ):
     # Define expected
-    expected_dict = {"threads": 2, "time": "1:00:00", "memory": "6144M"}
+    expected_dict = {"threads": 1, "time": "01:00:00", "memory": "6G"}
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
         actual = somatic_neoepitope_prediction_workflow.get_resource(
-            "neoepitope_preparation", "run", resource
+            "pvacseq", "prepare", resource
+        )()
+        assert actual == expected, msg_error
+
+
+def test_somatic_neoepitope_prediction_step_part_get_output_files(
+    somatic_neoepitope_prediction_workflow,
+):
+    base_out = "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/out/MHC_Class_I/{tumor_library}"
+    expected = {
+        "all_epitopes": base_out + ".all_epitopes.tsv",
+        "all_epitopes_md5": base_out + ".all_epitopes.tsv.md5",
+        "filtered_epitopes": base_out + ".filtered.tsv",
+        "filtered_epitopes_md5": base_out + ".filtered.tsv.md5",
+    }
+    actual = somatic_neoepitope_prediction_workflow.get_output_files("pvacseq", "predict")
+    assert actual == expected
+
+
+def test_somatic_neoepitope_prediction_step_part_get_log_files(
+    somatic_neoepitope_prediction_workflow,
+):
+    base_out = (
+        "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/"
+        "log/{tumor_library}"
+    )
+    expected = get_expected_log_files_dict(base_out=base_out)
+    actual = somatic_neoepitope_prediction_workflow.get_log_file("pvacseq", "predict")
+    assert actual == expected
+
+
+def test_somatic_neoepitope_prediction_step_part_get_resource(
+    somatic_neoepitope_prediction_workflow,
+):
+    # Define expected
+    expected_dict = {"threads": 4, "time": "4:00:00", "memory": "30G"}
+    # Evaluate
+    for resource, expected in expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}'."
+        actual = somatic_neoepitope_prediction_workflow.get_resource(
+            "pvacseq", "predict", resource
         )()
         assert actual == expected, msg_error
 
@@ -166,54 +275,47 @@ def test_neoepitope_preparation_step_part_get_resource(
 def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_workflow):
     """Test simple functionality of the workflow"""
     # Check created sub steps
-    expected = ["link_out", "neoepitope_preparation"]
+    expected = ["link_out", "pvacseq"]
     actual = list(sorted(somatic_neoepitope_prediction_workflow.sub_steps.keys()))
     assert actual == expected
 
     # Check result file construction
     tpl = (
-        "output/{mapper}.{var_caller}.{anno_caller}.GX.P00{i}-T{t}-DNA1-WGS1/{dir_}/"
-        "{mapper}.{var_caller}.{anno_caller}.GX.P00{i}-T{t}-DNA1-WGS1.{ext}"
+        "output/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.P00{i}-T{t}-DNA1-WGS1.epitopes/"
+        "out/MHC_Class_I/P00{i}-T{t}-DNA1-WGS1{ext}"
+    )
+    log_tpl = (
+        "output/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.P00{i}-T{t}-DNA1-WGS1.epitopes/"
+        "log/P00{i}-T{t}-DNA1-WGS1{ext}"
     )
     expected = [
-        tpl.format(
-            mapper=mapper,
-            var_caller=var_caller,
-            anno_caller=anno_caller,
-            i=i,
-            t=t,
-            ext=ext,
-            dir_="out",
+        log_tpl.format(
+            mapper="bwa", var_caller=var_caller, anno_caller="vep", i=i, t=t, ext=ext, mode="GX"
         )
-        for i, t in ((1, 1), (2, 2))
-        for ext in ("vcf.gz", "vcf.gz.tbi", "vcf.gz.md5", "vcf.gz.tbi.md5")
-        for mapper in ("bwa",)
+        for (i, t) in ((1, 1), (2, 2))
         for var_caller in ("mutect2", "scalpel")
-        for anno_caller in ("vep",)
+        for ext in (
+            ".log",
+            ".log.md5",
+            ".conda_info.txt",
+            ".conda_info.txt.md5",
+            ".conda_list.txt",
+            ".conda_list.txt.md5",
+        )
     ]
 
     expected += [
         tpl.format(
-            mapper=mapper,
-            var_caller=var_caller,
-            anno_caller=anno_caller,
-            i=i,
-            t=t,
-            ext=ext,
-            dir_="log",
+            mapper="bwa", var_caller=var_caller, anno_caller="vep", i=i, t=t, ext=ext, mode="GX"
         )
-        for i, t in ((1, 1), (2, 2))
+        for (i, t) in ((1, 1), (2, 2))
+        for var_caller in ("mutect2", "scalpel")
         for ext in (
-            "conda_info.txt",
-            "conda_list.txt",
-            "log",
-            "conda_info.txt.md5",
-            "conda_list.txt.md5",
-            "log.md5",
+            ".all_epitopes.tsv",
+            ".filtered.tsv",
+            ".all_epitopes.tsv.md5",
+            ".filtered.tsv.md5",
         )
-        for mapper in ("bwa",)
-        for var_caller in ("mutect2", "scalpel")
-        for anno_caller in ("vep",)
     ]
     expected = list(sorted(expected))
     actual = list(sorted(somatic_neoepitope_prediction_workflow.get_result_files()))

From 73e08c6dce538bb03277c2e7a58f03ef5f61a2df Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Wed, 17 Jul 2024 11:48:51 +0200
Subject: [PATCH 20/23] Adding pvacseq pipeline

---
 .../somatic_neoepitope_prediction/Snakefile   |  58 +++-
 .../somatic_neoepitope_prediction/__init__.py | 280 +++++++++++++-----
 .../somatic_neoepitope_prediction/model.py    |  72 ++++-
 .../pvactools/pvacseq/environment.yaml        |  15 +
 .../wrappers/pvactools/pvacseq/wrapper.py     | 159 ++++++++++
 5 files changed, 491 insertions(+), 93 deletions(-)
 create mode 100644 snappy_wrappers/wrappers/pvactools/pvacseq/environment.yaml

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
index de398ddc7..7b18f7238 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile
@@ -26,7 +26,7 @@ wf = SomaticNeoepitopePredictionWorkflow(workflow, config, lookup_paths, config_
 
 localrules:
     # Linking files from work/ to output/ should be done locally
-    neoepitope_preparation_link_out_run,
+    somatic_neoepitope_preparation_link_out_run,
 
 
 rule all:
@@ -37,7 +37,7 @@ rule all:
 # Generic linking out ---------------------------------------------------------
 
 
-rule neoepitope_preparation_link_out_run:
+rule somatic_neoepitope_preparation_link_out_run:
     input:
         wf.get_input_files("link_out", "run"),
     output:
@@ -46,20 +46,54 @@ rule neoepitope_preparation_link_out_run:
         shell(wf.get_shell_cmd("link_out", "run", wildcards))
 
 
-rule neoepitope_preparation:
+rule somatic_neoepitope_prediction_pvactools_install:
+    output:
+        **wf.get_output_files("pvacseq", "install"),
+    params:
+        container="docker://griffithlab/pvactools",
+    resources:
+        time=wf.get_resource("pvacseq", "install", "time"),
+        memory=wf.get_resource("pvacseq", "install", "memory"),
+        partition=wf.get_resource("pvacseq", "install", "partition"),
+    log:
+        **wf.get_log_file("pvacseq", "install"),
+    wrapper:
+        wf.wrapper_path("singularity")
+
+
+rule somatic_neoepitope_preparation:
     input:
-        unpack(wf.get_input_files("pvacseq","prepare")),
+        unpack(wf.get_input_files("pvacseq", "prepare")),
     output:
-        **wf.get_output_files("pvacseq","prepare"),
+        **wf.get_output_files("pvacseq", "prepare"),
     log:
-        **wf.get_log_file("pvacseq","prepare"),
-    threads: wf.get_resource("pvacseq","prepare", "threads")
+        **wf.get_log_file("pvacseq", "prepare"),
+    threads: wf.get_resource("pvacseq", "prepare", "threads")
     resources:
-        time=wf.get_resource("pvacseq","prepare", "time"),
-        memory=wf.get_resource("pvacseq","prepare", "memory"),
-        partition=wf.get_resource("pvacseq","prepare", "partition"),
-        tmpdir=wf.get_resource("pvacseq","prepare", "tmpdir"),
+        time=wf.get_resource("pvacseq", "prepare", "time"),
+        memory=wf.get_resource("pvacseq", "prepare", "memory"),
+        partition=wf.get_resource("pvacseq", "prepare", "partition"),
+        tmpdir=wf.get_resource("pvacseq", "prepare", "tmpdir"),
     params:
-        **{"args": wf.get_params("pvacseq","prepare")},
+        **{"args": wf.get_params("pvacseq", "prepare")},
     wrapper:
         wf.wrapper_path("pvactools/combining")
+
+
+rule somatic_neoepitope_prediction:
+    input:
+        unpack(wf.get_input_files("pvacseq", "predict")),
+    output:
+        **wf.get_output_files("pvacseq", "predict"),
+    log:
+        **wf.get_log_file("pvacseq", "predict"),
+    threads: wf.get_resource("pvacseq", "predict", "threads")
+    resources:
+        time=wf.get_resource("pvacseq", "predict", "time"),
+        memory=wf.get_resource("pvacseq", "predict", "memory"),
+        partition=wf.get_resource("pvacseq", "predict", "partition"),
+        tmpdir=wf.get_resource("pvacseq", "predict", "tmpdir"),
+    params:
+        **{"args": wf.get_params("pvacseq", "predict")},
+    wrapper:
+        wf.wrapper_path("pvactools/pvacseq")
diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
index cfb3bb82c..7e485d0fe 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
@@ -46,6 +46,7 @@
     SOMATIC_VARIANT_CALLERS_MATCHED,
     SomaticVariantCallingWorkflow,
 )
+from snappy_pipeline.workflows.hla_typing import HlaTypingWorkflow
 from .model import SomaticNeoepitopePrediction as SomaticNeoepitopePredictionConfigModel
 
 
@@ -53,8 +54,13 @@
 __email__ = "pham.gia-cuong@bih-charite.de"
 
 #: Extensions of files to create as main payload
-EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5")
-
+PREPARE_EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5")
+PREDICT_EXT_VALUES = (
+    ".all_epitopes.tsv",
+    ".filtered.tsv",
+    ".all_epitopes.tsv.md5",
+    ".filtered.tsv.md5",
+)
 #: Default configuration for the somatic_gene_fusion_calling step
 DEFAULT_CONFIG = SomaticNeoepitopePredictionConfigModel.default_config_yaml_string()
 
@@ -66,7 +72,28 @@ class PvacSeqStepPart(BaseStepPart):
 
     #: Step name
     name = "pvacseq"
-    actions = ("prepare","predict")
+
+    #: Actions
+    actions = ("install", "prepare", "predict")
+
+    #: Resources
+    resource_usage = {
+        "install": ResourceUsage(
+            threads=2,
+            time="03:00:00",
+            memory="6G",
+        ),
+        "prepare": ResourceUsage(
+            threads=1,
+            time="01:00:00",
+            memory="6G",
+        ),
+        "predict": ResourceUsage(
+            threads=4,
+            time="4:00:00",
+            memory="30G",
+        ),
+    }
 
     def __init__(self, parent):
         super().__init__(parent)
@@ -84,35 +111,50 @@ def __init__(self, parent):
 
     def get_input_files(self, action):
         self._validate_action(action)
-        return self._get_input_files_prepare
-
-    # @dictify
-    # def _get_input_files_predict(self,wildcards):
-    #     prepare_tpl = (
-    #         "work/prepare/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}/out/"
-    #         "{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.vcf.gz"
-    #     )
-    #     yield "combine_vcf",prepare_tpl
-    #     hla_typing = self.parent.sub_workflows["hla_typing"]
-    #     hla_tpl = {
-    #         "output/{hla_tool}.{ngs_library}/out/{hla_tool}.{ngs_library}.txt"
-    #     }
-    #     for sheet in filter(is_not_background, self.parent.shortcut_sheets):
-    #         for sample_pair in sheet.all_sample_pairs:
-    #             if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library:
-    #                 yield "hla_normal_dna", hla_typing((hla_tpl).format(
-    #                                     hla_tool=self.w_config.step_config["hla_typing"].tools[0],
-    #                                     ngs_library=sample_pair.normal_sample.dna_ngs_library.name,)
-    #                                     )
-    #                 yield "hla_tumor_dna", hla_typing((hla_tpl).format(
-    #                                     hla_tool=self.w_config.step_config["hla_typing"].tools[0],
-    #                                     ngs_library=sample_pair.tumor_sample.dna_ngs_library.name,)
-    #                                     )
-    #                 yield "hla_tumor_rna", hla_typing((hla_tpl).format(
-    #                                     hla_tool=self.w_config.step_config["hla_typing"].tools[0],
-    #                                     ngs_library=sample_pair.tumor_sample.rna_ngs_library.name,)
-    #                                     )
-    #     pass
+        # if action == "install":
+        #     return {"container": "work/containers/out/pvactools.simg"}
+        if action == "prepare":
+            return self._get_input_files_prepare
+        if action == "predict":
+            return self._get_input_files_predict
+
+    @dictify
+    def _get_input_files_predict(self, wildcards):
+        yield "container", "work/containers/out/pvactools.simg"
+        prepare_tpl = (
+            "work/prepare/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}/out/"
+            "{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.vcf.gz"
+        )
+        yield "combine_vcf", prepare_tpl
+        hla_typing = self.parent.sub_workflows["hla_typing"]
+        hla_tpl = "output/optitype.{ngs_library}/out/optitype.{ngs_library}.txt"
+        for sheet in filter(is_not_background, self.parent.shortcut_sheets):
+            for sample_pair in sheet.all_sample_pairs:
+                if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library:
+                    yield (
+                        "hla_normal_dna",
+                        hla_typing(
+                            (hla_tpl).format(
+                                ngs_library=sample_pair.normal_sample.dna_ngs_library.name,
+                            )
+                        ),
+                    )
+                    yield (
+                        "hla_tumor_dna",
+                        hla_typing(
+                            (hla_tpl).format(
+                                ngs_library=sample_pair.tumor_sample.dna_ngs_library.name,
+                            )
+                        ),
+                    )
+                    yield (
+                        "hla_tumor_rna",
+                        hla_typing(
+                            (hla_tpl).format(
+                                ngs_library=sample_pair.tumor_sample.rna_ngs_library.name,
+                            )
+                        ),
+                    )
 
     @dictify
     def _get_input_files_prepare(self, wildcards):
@@ -120,6 +162,7 @@ def _get_input_files_prepare(self, wildcards):
         # It only works with vep now.
         # Validate action
         # self._validate_action(action)
+        # yield "container","work/containers/out/pvactools.simg"
         tpl = (
             "output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/"
             "{mapper}.{var_caller}.{anno_caller}.{tumor_library}"
@@ -130,32 +173,42 @@ def _get_input_files_prepare(self, wildcards):
             vep_key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"}
         variant_annotation = self.parent.sub_workflows["somatic_variant_annotation"]
         if self.config["preparation"]["format"] == "snappy_custom":
-            for key,ext in vep_key_ext.items():
-                yield key,variant_annotation(tpl+ext)
+            for key, ext in vep_key_ext.items():
+                yield key, variant_annotation(tpl + ext)
 
         ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
-
-        star_key_ext = {"expression": ".GeneCounts.tab",
-                        "bam": ".bam"}
+        star_key_ext = {"expression": ".GeneCounts.tab", "bam": ".bam"}
         for sheet in filter(is_not_background, self.parent.shortcut_sheets):
             for sample_pair in sheet.all_sample_pairs:
                 if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library:
-                    rna_tpl = (
-                        "output/{mapper}.{rna_library}/out/{mapper}.{rna_library}"
-                    )
-                    for key,ext in star_key_ext.items():
-                        yield key, ngs_mapping((rna_tpl + ext).format(
-                                        mapper=self.w_config.step_config["ngs_mapping"].tools.rna[0],
-                                        rna_library=sample_pair.tumor_sample.rna_ngs_library.name,)
-                                        )
-
-                    
+                    rna_tpl = "output/{mapper}.{rna_library}/out/{mapper}.{rna_library}"
+                    for key, ext in star_key_ext.items():
+                        yield (
+                            key,
+                            ngs_mapping(
+                                (rna_tpl + ext).format(
+                                    mapper=self.w_config.step_config["ngs_mapping"].tools.rna[0],
+                                    rna_library=sample_pair.tumor_sample.rna_ngs_library.name,
+                                )
+                            ),
+                        )
 
     @dictify
     def get_output_files(self, action):
         """Return output files"""
         # Validate action
         self._validate_action(action)
+        if action == "install":
+            yield "container", "work/containers/out/pvactools.simg"
+        if action == "prepare":
+            path_prepare = self._get_output_files_prepare()
+            yield from path_prepare.items()
+        if action == "predict":
+            path_predict = self._get_output_files_predict()
+            yield from path_predict.items()
+
+    @dictify
+    def _get_output_files_prepare(self):
         if self.config["preparation"]["mode"] == "gene":
             prefix = (
                 "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/"
@@ -171,43 +224,68 @@ def get_output_files(self, action):
             yield key, prefix + ext
             yield key + "_md5", prefix + ext + ".md5"
 
+    @dictify
+    def _get_output_files_predict(self):
+        key_ext = {"all_epitopes": ".all_epitopes.tsv", "filtered_epitopes": ".filtered.tsv"}
+        prefix = "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/out/MHC_Class_I/{tumor_library}"
+        for key, ext in key_ext.items():
+            yield key, prefix + ext
+            yield key + "_md5", prefix + ext + ".md5"
+
     @dictify
     def _get_log_file(self, action):
         """Return mapping of log files."""
         # Validate action
         self._validate_action(action)
-        if self.config["preparation"]["mode"] == "gene":
-            prefix = (
-                "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/"
-                "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
+        if action == "install":
+            key_ext = (
+                ("log", ".log"),
+                ("conda_info", ".conda_info.txt"),
+                ("conda_list", ".conda_list.txt"),
             )
-        elif self.config["preparation"]["mode"] == "transcript":
-            prefix = (
-                "work/preprare/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/"
-                "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}"
+            for key, ext in key_ext:
+                yield key, "work/containers/log/pvactool_install" + ext
+
+        if action == "prepare":
+            if self.config["preparation"]["mode"] == "gene":
+                prefix = (
+                    "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/"
+                    "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}"
+                )
+            elif self.config["preparation"]["mode"] == "transcript":
+                prefix = (
+                    "work/preprare/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/"
+                    "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}"
+                )
+            key_ext = (
+                ("log", ".log"),
+                ("conda_info", ".conda_info.txt"),
+                ("conda_list", ".conda_list.txt"),
             )
-        key_ext = (
-            ("log", ".log"),
-            ("conda_info", ".conda_info.txt"),
-            ("conda_list", ".conda_list.txt"),
-        )
-        for key, ext in key_ext:
-            yield key, prefix + ext
+            for key, ext in key_ext:
+                yield key, prefix + ext
 
-    def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage:
-        self._validate_action(action)
-        mem_mb = 6 * 1024  # 6GB
-        return ResourceUsage(
-            threads=2,
-            time="1:00:00",  # 1 hour
-            memory=f"{mem_mb}M",
-        )
+        if action == "predict":
+            prefix = (
+                "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/"
+                "log/{tumor_library}"
+            )
+            key_ext = (
+                ("log", ".log"),
+                ("conda_info", ".conda_info.txt"),
+                ("conda_list", ".conda_list.txt"),
+            )
+            for key, ext in key_ext:
+                yield key, prefix + ext
 
     def get_params(self, action):
         self._validate_action(action)
-        return getattr(self,"_get_params_run")
-    
-    def _get_params_run(self,wildcards):
+        if action == "prepare":
+            return getattr(self, "_get_params_run_prepare")
+        if action == "predict":
+            return getattr(self, "_get_params_run_predict")
+
+    def _get_params_run_prepare(self, wildcards):
         d = {
             "max_depth": self.config["preparation"]["max_depth"],
             "format": self.config["preparation"]["format"],
@@ -218,6 +296,42 @@ def _get_params_run(self,wildcards):
         }
         return d
 
+    def _get_params_run_predict(self, wildcards):
+        d = {
+            "epitope_length": ",".join(
+                map(str, self.config["prediction"]["CLASS_I_EPITOPE_LENGTH"])
+            ),
+            "algorithms": " ".join(self.config["prediction"]["algorithms"]),
+            "BINDING_THRESHOLD": self.config["prediction"]["BINDING_THRESHOLD"],
+            "percentile_threshold": self.config["prediction"]["percentile_threshold"],
+            "allele_specific_binding_thresholds": self.config["prediction"][
+                "allele_specific_binding_thresholds"
+            ],
+            "aggregate_inclusion_binding_threshold": self.config["prediction"][
+                "aggregate_inclusion_binding_threshold"
+            ],
+            "netmhc_stab": self.config["prediction"]["netmhc_stab"],
+            "NET_CHOP_THRESHOLD": self.config["prediction"]["NET_CHOP_THRESHOLD"],
+            "PROBLEMATIC_AMINO_ACIDS": self.config["prediction"]["PROBLEMATIC_AMINO_ACIDS"],
+            # 'run_reference_proteome_similarity': self.config["prediction"]["run_reference_proteome_similarity"],
+            "FAST_SIZE": self.config["prediction"]["FAST_SIZE"],
+            "exclude_NAs": self.config["prediction"]["exclude_NAs"],
+            "NORMAL_COV": self.config["prediction"]["NORMAL_COV"],
+            "TDNA_COV": self.config["prediction"]["TDNA_COV"],
+            "TRNA_COV": self.config["prediction"]["TRNA_COV"],
+            "NORMAL_VAF": self.config["prediction"]["NORMAL_VAF"],
+            "maximum_transcript_support_level": self.config["prediction"][
+                "maximum_transcript_support_level"
+            ],
+            "pass_only": self.config["prediction"]["pass_only"],
+            "tumor_purity": self.config["prediction"]["tumor_purity"],
+        }
+        for sheet in filter(is_not_background, self.parent.shortcut_sheets):
+            for sample_pair in sheet.all_sample_pairs:
+                if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library:
+                    d["normal_sample"] = sample_pair.normal_sample.dna_ngs_library.name
+        return d
+
 
 class SomaticNeoepitopePredictionWorkflow(BaseStep):
     """Perform neoepitope prediction workflow"""
@@ -245,6 +359,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
                 SomaticVariantCallingWorkflow,
                 SomaticVariantAnnotationWorkflow,
                 NgsMappingWorkflow,
+                HlaTypingWorkflow,
             ),
         )
         # Register sub step classes so the sub steps are available
@@ -258,6 +373,10 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
             "ngs_mapping",
             config["path_rna_ngs_mapping"],
         )
+        self.register_sub_workflow(
+            "hla_typing",
+            config["path_hla_typing"],
+        )
         # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here
         if not config.tools_ngs_mapping:
             config.tools_ngs_mapping = self.w_config["step_config"]["ngs_mapping"].tools.dna
@@ -274,21 +393,22 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
 
     @listify
     def get_result_files(self):
+        cfg_mode = self.config["preparation"]["mode"]
         callers = set(self.config["tools_somatic_variant_calling"])
         anno_callers = set(self.config["tools_somatic_variant_annotation"])
-        if self.config["preparation"]["mode"] == "gene":
-            name_pattern = "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library.name}"
-        elif self.config["preparation"]["mode"] == "transcript":
-            name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}"
+        predict_tpl = "output/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library.name}.epitopes/"
         yield from self._yield_result_files_matched(
-            os.path.join("output/prepare", name_pattern, "out", name_pattern + "{ext}"),
+            predict_tpl + "out/MHC_Class_I/{tumor_library.name}" + "{ext}",
             mapper=self.config["tools_ngs_mapping"],
             var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
             anno_caller=anno_callers,
-            ext=EXT_VALUES,
+            mode="GX" if cfg_mode == "gene" else "TX",
+            ext=PREDICT_EXT_VALUES,
         )
+
         yield from self._yield_result_files_matched(
-            os.path.join("output/prepare", name_pattern, "log", name_pattern + "{ext}"),
+            predict_tpl + "log/{tumor_library.name}" + "{ext}",
+            mode="GX" if cfg_mode == "gene" else "TX",
             mapper=self.config["tools_ngs_mapping"],
             var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED),
             anno_caller=anno_callers,
diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
index 7abe6815f..6788a2bb4 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
@@ -32,13 +32,82 @@ class Preparation(SnappyModel):
     ignore_ensembl_id_version: bool = True
     """Ignore the ensemble id version"""
     max_depth: int = 4000
+    "max_depth option of bcftools mpileup"
+
+
+class Algorithms(enum.StrEnum):
+    BigMHC_EL = "BigMHC_EL"
+    BigMHC_IM = "BigMHC_IM"
+    DeepImmuno = "DeepImmuno"
+    MHCflurry = "MHCflurry"
+    MHCflurryEL = "MHCflurryEL"
+    MHCnuggetsI = "MHCnuggetsI"
+    MHCnuggetsII = "MHCnuggetsII"
+    NNalign = "NNalign"
+    NetMHC = "NetMHC"
+    NetMHCIIpan = "NetMHCIIpan"
+    NetMHCIIpanEL = "NetMHCIIpanEL"
+    NetMHCcons = "NetMHCcons"
+    NetMHCpan = "NetMHCpan"
+    NetMHCpanEL = "NetMHCpanEL"
+    PickPocket = "PickPocket"
+    SMM = "SMM"
+    SMMPMBEC = "SMMPMBEC"
+    SMMalign = "SMMalign"
+    all_class_i = "all_class_i"
+
+
+class Prediction(SnappyModel):
+    CLASS_I_EPITOPE_LENGTH: list[int] = [8, 9, 10, 11]
+    "Length of MHC Class I subpeptides (neoepitopes) to predict."
+    "Multiple epitope lengths can be specified using a comma-separated list."
+    "Typical epitope lengths vary between 8-15."
+    algorithms: Annotated[list[Algorithms], EnumField(Algorithms, [])]
+    "The epitope prediction algorithms to use."
+    "For running with all prediction assign to ['all_class_i']"
+    # Following parameter please follow the documentation of pvactools to know its function
+    BINDING_THRESHOLD: int = 500
+    percentile_threshold: float | None = Field(default=None)
+    allele_specific_binding_thresholds: bool = False
+    aggregate_inclusion_binding_threshold: int = 5000
+    netmhc_stab: bool = False
+    NET_CHOP_THRESHOLD: float = 0.5
+    PROBLEMATIC_AMINO_ACIDS: list[str] | None = Field(default=None)
+    # top-score-metric
+    # net-chop-method
+    # net-chop-threshold
+
+    # E.g amino_acid:position - G:2 would check for a Glycine at
+    # the second position of the epitope
+    # run_reference_proteome_similarity: bool = False
+    # blastp_path
+    # blastp-db
+    # peptide-fasta
+    FAST_SIZE: int = 200
+    exclude_NAs: bool = False
+    NORMAL_COV: int = 5
+    TDNA_COV: int = 10
+    TRNA_COV: int = 10
+    NORMAL_VAF: float = 0.02
+    maximum_transcript_support_level: int = Field(gt=0, lt=6, default=1)
+    # allele-specific-anchors
+    # anchor_contribution_threshold: float=0.8
+    pass_only: bool = False
+    tumor_purity: float | None = Field(default=None)
 
 
 class SomaticNeoepitopePrediction(SnappyStepModel):
     path_somatic_variant_annotation: Annotated[
         str, Field(examples=["../somatic_variant_annotation"])
     ]
-    path_hla_typing : Annotated[str, Field(examples=["../hla_typing"])]
+    path_container: Annotated[
+        str, Field(examples=["../somatic_neoepitope_prediction/work/containers/out/pvactools.simg"])
+    ]
+    """
+    Running somatic neoepitope prediction with pvactools 
+    is required,with the container
+    """
+    path_hla_typing: Annotated[str, Field(examples=["../hla_typing"])]
     path_rna_ngs_mapping: Annotated[str, Field(examples=["../ngs_mapping"])]
     tools_somatic_variant_annotation: list[str] = ["vep"]
     tools_ngs_mapping: list[str] = []
@@ -50,3 +119,4 @@ class SomaticNeoepitopePrediction(SnappyStepModel):
     tools_somatic_variant_calling: list[str] = []
     """Deafult to those configured for somatic_variant_calling"""
     preparation: Preparation | None = None
+    prediction: Prediction | None = None
diff --git a/snappy_wrappers/wrappers/pvactools/pvacseq/environment.yaml b/snappy_wrappers/wrappers/pvactools/pvacseq/environment.yaml
new file mode 100644
index 000000000..7ef7fd65f
--- /dev/null
+++ b/snappy_wrappers/wrappers/pvactools/pvacseq/environment.yaml
@@ -0,0 +1,15 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+  - default
+dependencies:
+  - python==3.6.15
+  # - pip==21.3.1
+  # - pip:
+  #     - pvactools==3.1.3
+  #     - mhcflurry==2.0.1
+  #     - mhcnuggets==2.3.3
+  #     - git+https://github.com/griffithlab/bigmhc.git#egg=bigmhc
+  #     - git+https://github.com/griffithlab/deepimmuno.git#egg=deepimmuno
+  #     - tensorflow==1.5.0
\ No newline at end of file
diff --git a/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py
index e69de29bb..1b9c3cb9f 100644
--- a/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py
+++ b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py
@@ -0,0 +1,159 @@
+import os
+
+from snakemake.shell import shell
+
+__author__ = "Pham Gia Cuong"
+__email__ = "pham.gia-cuong@bih-charite.de"
+
+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]
+
+algorithms = snakemake.params.args["algorithms"]
+epitope_length = snakemake.params.args["epitope_length"]
+normal_smaple = snakemake.params.args["normal_sample"]
+BINDING_THRESHOLD = f"-b {snakemake.params.args["BINDING_THRESHOLD"]} "
+percentile_threshold = (
+    f"--percentile-threshold {snakemake.params.args["percentile_threshold"]} "
+    if snakemake.params.args["percentile_threshold"] != None
+    else ""
+)
+
+
+allele_specific_binding_thresholds = (
+    f"--allele-specific-binding-thresholds "
+    if snakemake.params.args["allele_specific_binding_thresholds"]
+    else ""
+)
+
+aggregate_inclusion_binding_threshold = (
+    f"--aggregate-inclusion-binding-threshold {snakemake.params.args["aggregate_inclusion_binding_threshold"]} ",
+)
+
+netmhc_stab = f"--netmhc-stab " if snakemake.params.args["netmhc_stab"] else ""
+
+NET_CHOP_THRESHOLD = (f"--net-chop-threshold {snakemake.params.args["NET_CHOP_THRESHOLD"]} ",)
+
+PROBLEMATIC_AMINO_ACIDS = (
+    f"--problematic-amino-acids {snakemake.params.args["PROBLEMATIC_AMINO_ACIDS"]} ",
+)
+
+# if {snakemake.params.args["run_reference_proteome_similarity"]}:
+#     run_reference_proteome_similarity= f"--run-reference-proteome-similarity",
+
+FAST_SIZE = (f"-s {snakemake.params.args["FAST_SIZE"]} ",)
+
+exclude_NAs = "--exclude-NAs " if snakemake.params.args["exclude_NAs"] else ""
+
+NORMAL_COV = (f"--normal-cov {snakemake.params.args["NORMAL_COV"]} ",)
+TDNA_COV = (f"--tdna-cov {snakemake.params.args["TDNA_COV"] }",)
+TRNA_COV = (f"--trna-cov {snakemake.params.args["TRNA_COV"]} ",)
+NORMAL_VAF = (f"--normal-vaf {snakemake.params.args["NORMAL_VAF"]} ",)
+
+maximum_transcript_support_level = (
+    f"--maximum-transcript-support-level {snakemake.params.args["maximum_transcript_support_level"]} ",
+)
+
+pass_only = "--pass-only " if snakemake.params.args["pass_only"] else ""
+
+tumor_purity = (
+    f"--tumor-purity {snakemake.params.args["tumor_purity"]} "
+    if snakemake.params.args["tumor_purity"] != None
+    else ""
+)
+
+op_dir = "/".join(snakemake.output.all_epitopes.split("/")[:-2])
+
+files_to_bind = {
+    "combine_vcf": snakemake.input.combine_vcf,
+    "op_dir": op_dir,
+}
+
+# TODO: Put the following in a function (decide where...)
+# Replace with full absolute paths
+files_to_bind = {k: os.path.realpath(v) for k, v in files_to_bind.items()}
+# Directories that mut be bound
+dirs_to_bind = {k: os.path.dirname(v) for k, v in files_to_bind.items()}
+# List of unique directories to bind: on cluster: <directory> -> from container: /bindings/d<i>)
+bound_dirs = {e[1]: e[0] for e in enumerate(list(set(dirs_to_bind.values())))}
+# Binding command
+bindings = " ".join(["-B {}:/bindings/d{}".format(k, v) for k, v in bound_dirs.items()])
+bound_files = {
+    k: "/bindings/d{}/{}".format(bound_dirs[dirs_to_bind[k]], os.path.basename(v))
+    for k, v in files_to_bind.items()
+}
+
+shell.executable("/bin/bash")
+
+shell(
+    r"""
+# -----------------------------------------------------------------------------
+# Redirect stderr to log file by default and enable printing executed commands
+# Also pipe stderr to log file
+if [[ -n "{snakemake.log.log}" ]]; then
+    if [[ "$(set +e; tty; set -e)" != "" ]]; then
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        exec 2> >(tee -a "{snakemake.log.log}" >&2)
+    else
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        echo "No tty, logging disabled" >"{snakemake.log.log}"
+    fi
+fi
+# Write out information about conda installation.
+conda list >{snakemake.log.conda_list}
+conda info >{snakemake.log.conda_info}
+md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
+md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
+
+# Setup auto-cleaned tmpdir
+export TMPDIR=$(mktemp -d)
+trap "rm -rf $TMPDIR" EXIT
+
+# Compute md5 checksum
+md5() {{
+    fn=$1
+    d=$(dirname $fn)
+    f=$(basename $fn)
+    pushd $d 1> /dev/null 2>&1
+    checksum=$(md5sum $f)
+    popd 1> /dev/null 2>&1
+    echo "$checksum"
+}}
+
+set -x
+# -----------------------------------------------------------------------------
+
+# Write out information about conda installation
+conda list > {snakemake.log.conda_list}
+conda info > {snakemake.log.conda_info}
+
+hla_types=$(cat {snakemake.input.hla_normal_dna} {snakemake.input.hla_tumor_dna} {snakemake.input.hla_tumor_rna} | sed 's/^/HLA-/' | sort | uniq | tr '\n' ',' | sed 's/,$//')
+cmd="pvacseq run --normal-sample-name {normal_smaple} \
+    -e1 {epitope_length} \
+    --iedb-install-directory /opt/iedb \
+    -t {snakemake.threads} \
+    {bound_files[combine_vcf]} \
+    {snakemake.wildcards.tumor_library} \
+    $hla_types \
+    {algorithms} \
+    {bound_files[op_dir]} \
+    {BINDING_THRESHOLD}\
+    {percentile_threshold}\
+    {allele_specific_binding_thresholds}\
+    {aggregate_inclusion_binding_threshold}\
+    {NET_CHOP_THRESHOLD}\
+    {PROBLEMATIC_AMINO_ACIDS}\
+    {FAST_SIZE}\
+    {NORMAL_COV}\
+    {TDNA_COV}\
+    {TRNA_COV}\
+    {NORMAL_VAF}\
+    {exclude_NAs}\
+    {maximum_transcript_support_level}"
+echo 'TMPDIR=/bindings/d2' > $TMPDIR/{snakemake.wildcards.tumor_library}.sh
+echo $cmd >> $TMPDIR/{snakemake.wildcards.tumor_library}.sh
+apptainer exec --home $PWD -B $TMPDIR:/bindings/d2 {bindings} {config[path_container]} bash /bindings/d2/{snakemake.wildcards.tumor_library}.sh
+
+md5 {snakemake.output.all_epitopes} > {snakemake.output.all_epitopes_md5}
+md5 {snakemake.output.filtered_epitopes} > {snakemake.output.filtered_epitopes_md5}
+"""
+)

From 96702927838805364ee1e151fcd93cd8b2c42bb8 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Wed, 17 Jul 2024 12:09:05 +0200
Subject: [PATCH 21/23] make lint happy

---
 .../workflows/somatic_neoepitope_prediction/__init__.py          | 1 -
 1 file changed, 1 deletion(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
index 7e485d0fe..9721a32f1 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py
@@ -32,7 +32,6 @@
 """
 
 from collections import OrderedDict
-import os
 import sys
 
 from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background

From 76e0c9735a4ea6676e5413503d7908e76d78e6c6 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Wed, 17 Jul 2024 12:19:00 +0200
Subject: [PATCH 22/23] Make linting satisfying

---
 .../workflows/somatic_neoepitope_prediction/model.py   |  2 +-
 snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py  | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
index 6788a2bb4..a17452d03 100644
--- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
+++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py
@@ -104,7 +104,7 @@ class SomaticNeoepitopePrediction(SnappyStepModel):
         str, Field(examples=["../somatic_neoepitope_prediction/work/containers/out/pvactools.simg"])
     ]
     """
-    Running somatic neoepitope prediction with pvactools 
+    Running somatic neoepitope prediction with pvactools
     is required,with the container
     """
     path_hla_typing: Annotated[str, Field(examples=["../hla_typing"])]
diff --git a/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py
index 1b9c3cb9f..ec4245fd1 100644
--- a/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py
+++ b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py
@@ -1,6 +1,6 @@
 import os
 
-from snakemake.shell import shell
+from snakemake import shell
 
 __author__ = "Pham Gia Cuong"
 __email__ = "pham.gia-cuong@bih-charite.de"
@@ -14,13 +14,13 @@
 BINDING_THRESHOLD = f"-b {snakemake.params.args["BINDING_THRESHOLD"]} "
 percentile_threshold = (
     f"--percentile-threshold {snakemake.params.args["percentile_threshold"]} "
-    if snakemake.params.args["percentile_threshold"] != None
+    if snakemake.params.args["percentile_threshold"] is not None
     else ""
 )
 
 
 allele_specific_binding_thresholds = (
-    f"--allele-specific-binding-thresholds "
+    "--allele-specific-binding-thresholds "
     if snakemake.params.args["allele_specific_binding_thresholds"]
     else ""
 )
@@ -29,7 +29,7 @@
     f"--aggregate-inclusion-binding-threshold {snakemake.params.args["aggregate_inclusion_binding_threshold"]} ",
 )
 
-netmhc_stab = f"--netmhc-stab " if snakemake.params.args["netmhc_stab"] else ""
+netmhc_stab = "--netmhc-stab " if snakemake.params.args["netmhc_stab"] else ""
 
 NET_CHOP_THRESHOLD = (f"--net-chop-threshold {snakemake.params.args["NET_CHOP_THRESHOLD"]} ",)
 
@@ -57,7 +57,7 @@
 
 tumor_purity = (
     f"--tumor-purity {snakemake.params.args["tumor_purity"]} "
-    if snakemake.params.args["tumor_purity"] != None
+    if snakemake.params.args["tumor_purity"] is not None
     else ""
 )
 

From 03c9714a9e8f642d225043bb013cad88ec487d75 Mon Sep 17 00:00:00 2001
From: giacuong171 <giacuong171@gmail.com>
Date: Wed, 17 Jul 2024 12:24:51 +0200
Subject: [PATCH 23/23] Reformat hla_typing snakefile

---
 snappy_pipeline/workflows/hla_typing/Snakefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/snappy_pipeline/workflows/hla_typing/Snakefile b/snappy_pipeline/workflows/hla_typing/Snakefile
index 5b07da66d..3dc829c5a 100644
--- a/snappy_pipeline/workflows/hla_typing/Snakefile
+++ b/snappy_pipeline/workflows/hla_typing/Snakefile
@@ -53,6 +53,7 @@ rule hla_typing_link_in_run:
 
 # Generic linking out ---------------------------------------------------------
 
+
 rule hla_typing_link_out_run:
     input:
         wf.get_input_files("link_out", "run"),
@@ -67,6 +68,7 @@ rule hla_typing_link_out_run:
 # OptiType --------------------------------------------------------------------
 
 if "optitype" in wf.w_config.step_config["hla_typing"].tools:
+
     rule hla_typing_optitype_run:
         input:
             **wf.get_input_files("optitype", "run"),
@@ -91,6 +93,7 @@ if "optitype" in wf.w_config.step_config["hla_typing"].tools:
 
 # NB: reference is updated in the installed package
 if "arcashla" in wf.w_config.step_config["hla_typing"].tools:
+
     rule hla_typing_arcashla_prepare_reference:
         output:
             touch("work/arcashla.prepare_reference/out/.done"),
@@ -99,7 +102,6 @@ if "arcashla" in wf.w_config.step_config["hla_typing"].tools:
         wrapper:
             wf.wrapper_path("arcashla/prepare_reference")
 
-
     rule hla_typing_arcashla_run:
         input:
             unpack(wf.get_input_files("arcashla", "run")),