From 72717b80642ac4a2ffc3d1afd5aa438539376bec Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Thu, 4 Apr 2024 11:07:11 +0200 Subject: [PATCH 01/23] Adding somatic neoepitope preparation workflow --- .../somatic_neoepitope_prediction/Snakefile | 81 ++++ .../somatic_neoepitope_prediction/__init__.py | 313 +++++++++++++- .../wrappers/pvactools/combining/comb_rna.py | 392 ++++++++++++++++++ .../pvactools/combining/environment.yaml | 10 + .../wrappers/pvactools/combining/wrapper.py | 85 ++++ .../wrappers/pvactools/environment.yaml | 0 .../wrappers/pvactools/pvacseq/wrapper.py | 0 .../wrappers/stringtie/environment.yaml | 5 + snappy_wrappers/wrappers/stringtie/wrapper.py | 47 +++ ...workflows_somatic_neoepitope_prediction.py | 218 ++++++++++ 10 files changed, 1149 insertions(+), 2 deletions(-) create mode 100644 snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile create mode 100644 snappy_wrappers/wrappers/pvactools/combining/comb_rna.py create mode 100644 snappy_wrappers/wrappers/pvactools/combining/environment.yaml create mode 100644 snappy_wrappers/wrappers/pvactools/combining/wrapper.py create mode 100644 snappy_wrappers/wrappers/pvactools/environment.yaml create mode 100644 snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py create mode 100644 snappy_wrappers/wrappers/stringtie/environment.yaml create mode 100644 snappy_wrappers/wrappers/stringtie/wrapper.py create mode 100644 tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile new file mode 100644 index 000000000..0d1870dd9 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +"""CUBI Pipeline tumor mutational burden step Snakefile""" + +import os + +from snappy_pipeline import expand_ref +from snappy_pipeline.workflows.somatic_neoepitope_prediction import ( + SomaticNeoepitopePredictionWorkflow, +) + +__author__ = "Pham Gia Cuong" + + +# Configuration =============================================================== + + +configfile: "config.yaml" + + +# Expand "$ref" JSON pointers in configuration (also works for YAML) +config, lookup_paths, config_paths = expand_ref("config.yaml", config) + +# WorkflowImpl Object Setup =================================================== +wf = SomaticNeoepitopePredictionWorkflow( + workflow, config, lookup_paths, config_paths, os.getcwd() +) + + +localrules: + # Linking files from work/ to output/ should be done locally + neoepitope_preparation_link_out_run, + + +rule all: + input: + wf.get_result_files(), + + +# Generic linking out --------------------------------------------------------- + +rule neoepitope_preparation_link_out_run: + input: + wf.get_input_files("link_out", "run"), + output: + wf.get_output_files("link_out", "run"), + run: + shell(wf.get_shell_cmd("link_out", "run", wildcards)) + +# rule rna_pileup: +# input: +# **wf.get_input_files("rna_pileup","run") +# output: +# **wf.get_output_files("rna_pileup","run") +# log: +# **wf.get_log_file("rna_pileup", "run"), +# threads: +# wf.get_resource("rna_pileup", "run", "threads"), +# resources: +# time=wf.get_resource("rna_pileup", "run", "time"), +# memory=wf.get_resource("rna_pileup", "run", "memory"), +# partition=wf.get_resource("rna_pileup", "run", "partition"), +# tmpdir=wf.get_resource("rna_pileup", "run", "tmpdir"), +# wrapper: +# wf.wrapper_path("pvactools/mpileup") + + +rule neoepitope_preparation: + input: + **wf.get_input_files("neoepitope_preparation", "run"), + output: + **wf.get_output_files("neoepitope_preparation", "run"), + log: + **wf.get_log_file("neoepitope_preparation", "run"), + threads:wf.get_resource("neoepitope_preparation", "run", "threads"), + resources: + time=wf.get_resource("neoepitope_preparation", "run", "time"), + memory=wf.get_resource("neoepitope_preparation", "run", "memory"), + partition=wf.get_resource("neoepitope_preparation", "run", "partition"), + tmpdir=wf.get_resource("neoepitope_preparation", "run", "tmpdir"), + wrapper: + wf.wrapper_path("pvactools/combining") diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py index 7486c2e22..69afb23e9 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -31,11 +31,320 @@ """ -__author__ = "Manuel Holtgrewe " +from collections import OrderedDict +import os +import sys + +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background +from snakemake.io import expand + +from snappy_pipeline.utils import dictify, listify +from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart +from snappy_pipeline.workflows.ngs_mapping import ( + READ_MAPPERS_RNA, + NgsMappingWorkflow, + ResourceUsage +) +from snappy_pipeline.workflows.somatic_variant_calling import ( + SOMATIC_VARIANT_CALLERS_MATCHED, + SomaticVariantCallingWorkflow, +) +from snappy_pipeline.workflows.somatic_variant_annotation import ( + ANNOTATION_TOOLS, + SomaticVariantAnnotationWorkflow +) + + +__author__ = "Pham Gia Cuong" +__email__ = "pham.gia-cuong@bih-charite.de" + +#: Extensions of files to create as main payload +EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5") #: Default configuration for the somatic_gene_fusion_calling step DEFAULT_CONFIG = r""" step_config: somatic_neoepitope_prediction: - path_somatic_variant_calling: REQUIRED # REQUIRED + path_somatic_variant_annotation: ../somatic_variant_annotation # REQUIRED + path_rna_ngs_mapping: ../ngs_mapping + tools_somatic_variant_annotation: [] + tools_rna_mapping: [] # deafult to those configured for ngs_mapping + tools_ngs_mapping: [] # deafult to those configured for ngs_mapping + tools_somatic_variant_calling: [] # deafult to those configured for somatic_variant_calling + max_depth: "4000" + preparation: + format: 'star' # REQUIRED - The file format of the expression file to process. (stringtie,kallisto,cufflinks,custom) + # Use `custom` to process file formats not explicitly supported. + # The `custom` option requires the use of the --id-column and --expression-column arguments. + path_features: '' + mode: 'gene' # REQUIRED + id-column: '' # The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format. + expression-column: 'fr' # REQUIRED + # The column header in the expression_file for + # the column containing expression data. + # Required when using the `custom` and `star` format. For `star`, there are + # only 3 options [unstranded,rf,fr] + ignore-ensembl-id-version: True #Ignore the ensemble id version """ + + +class NeoepitopePreparationStepPart(BaseStepPart): + """ + Preparation VCF file for pvactool + """ + #: Step name + name = "neoepitope_preparation" + actions = ("run",) + + def __init__(self, parent): + super().__init__(parent) + # Build shortcut from cancer bio sample name to matched cancer sample + self.tumor_ngs_library_to_sample_pair = OrderedDict() + for sheet in self.parent.shortcut_sheets: + self.tumor_ngs_library_to_sample_pair.update( + sheet.all_sample_pairs_by_tumor_dna_ngs_library + ) + # Build mapping from donor name to donor. + self.donors = OrderedDict() + for sheet in self.parent.shortcut_sheets: + for donor in sheet.donors: + self.donors[donor.name] = donor + + @dictify + def get_input_files(self, action): + """Return path to somatic variant annotated file""" + # It only works with vep now. + # Validate action + self._validate_action(action) + tpl = ( + "output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.{tumor_library}" + ) + #Need to change for work on many different tools + key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"} + variant_annotation = self.parent.sub_workflows["somatic_variant_annotation"] + ngs_mapping = self.parent.sub_workflows['ngs_mapping'] + for key, ext in key_ext.items(): + yield key, variant_annotation(tpl + ext) + #Getting appropriate rna library + for sheet in filter(is_not_background, self.parent.sheets): + for donor in sheet.bio_entities.values(): + for biosample in donor.bio_samples.values(): + if biosample.extra_infos["isTumor"]: + for test_sample in biosample.test_samples.values(): + if test_sample.extra_infos["extractionType"] == "RNA": + for lib in test_sample.ngs_libraries.values(): + rna_library = lib.name + rna_tpl = ( + "output/{mapper}.{library_name}/out/{mapper}.{library_name}" + ).format( + mapper="star", + library_name=rna_library, + ) + ext = {"expression","bam","bai"} + yield 'expression',ngs_mapping(rna_tpl + ".GeneCounts.tab") + yield 'bam',ngs_mapping(rna_tpl + ".bam") + yield 'bai',ngs_mapping(rna_tpl + ".bam.bai") + + + @dictify + def get_output_files(self, action): + """Return output files """ + # Validate action + # Need to add step for adding RAD also + self._validate_action(action) + if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene": + prefix = ( + "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + ) + elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript": + prefix = ( + "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" + ) + key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"} + for key, ext in key_ext.items(): + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + @dictify + def _get_log_file(self, action): + """Return mapping of log files.""" + # Validate action + self._validate_action(action) + if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene": + prefix = ( + "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" + "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + ) + elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript": + prefix = ( + "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/" + "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" + ) + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + + def get_resource_usage(self, action): + self._validate_action(action) + mem_mb = 6 * 1024 # 6GB + return ResourceUsage( + threads=2, + time="1:00:00", # 1 hour + memory=f"{mem_mb}M", + ) + + @listify + def get_result_files(self): + callers = set( + self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_somatic_variant_calling"] + ) + anno_callers = set( + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_somatic_variant_annotation" + ] + ) + if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene": + name_pattern = "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library.name}" + elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript": + name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}" + + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"], + var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + anno_caller=anno_callers, + ext=EXT_VALUES + ) + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"], + var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + anno_caller=anno_callers, + ext=( + ".log", + ".log.md5", + ".conda_info.txt", + ".conda_info.txt.md5", + ".conda_list.txt", + ".conda_list.txt.md5", + ), + ) + + def _yield_result_files_matched(self, tpl, **kwargs): + """Build output paths from path template and extension list. + + This function returns the results from the matched somatic variant callers such as + Mutect. + """ + for sheet in filter(is_not_background, self.parent.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if ( + not sample_pair.tumor_sample.dna_ngs_library + or not sample_pair.normal_sample.dna_ngs_library + or not sample_pair.tumor_sample.rna_ngs_library + ): + msg = ( + "INFO: sample pair for cancer bio sample {} has is missing primary" + "normal or primary cancer NGS library" + ) + print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) + continue + yield from expand( + tpl, + tumor_library=[sample_pair.tumor_sample.dna_ngs_library], + **kwargs, + ) + + + +class SomaticNeoepitopePredictionWorkflow(BaseStep): + """Perform neoepitope prediction workflow""" + + name = "neoepitope_prediction" + sheet_shortcut_class = CancerCaseSheet + sheet_shortcut_kwargs = { + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) + } + + @classmethod + def default_config_yaml(cls): + """Return default config YAML, to be overwritten by project-specific one.""" + return DEFAULT_CONFIG + + def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + (SomaticVariantCallingWorkflow, SomaticVariantAnnotationWorkflow, NgsMappingWorkflow), + ) + # Register sub step classes so the sub steps are available + self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart)) + self.register_sub_workflow( + "somatic_variant_annotation", + self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_somatic_variant_annotation"], + ) + self.register_sub_workflow( + "ngs_mapping", + self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"] + ) + # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here + if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]: + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_ngs_mapping" + ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] + if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_somatic_variant_calling" + ]: + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_somatic_variant_calling" + ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"] + if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_somatic_variant_annotation" + ]: + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_somatic_variant_annotation" + ] = ["vep"] + if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_rna_mapping" + ]: + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_rna_mapping" + ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"] + if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ + "path_features" + ]: + self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ + "path_features" + ] = self.w_config["static_data_config"]["features"]["path"] + + def get_result_files(self): + for sub_step in self.sub_steps.values(): + if sub_step.name not in (LinkOutStepPart.name,): + yield from sub_step.get_result_files() + + def check_config(self): + """Check that the path to the NGS mapping is present""" + self.ensure_w_config( + ("step_config", "somatic_neoepitope_prediction", "path_somatic_variant_annotation"), + "Path to variant (directory of vcf files) not configured but required for somatic neoepitope prediction", + ) + + self.ensure_w_config( + ("step_config", "somatic_neoepitope_prediction", "preparation","mode"), + "The mode is required for adding gene expression data to somatic variant annotated file", + ) + + self.ensure_w_config( + ("step_config", "somatic_neoepitope_prediction", "preparation","format"), + "Format is required for adding ", + ) diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py new file mode 100644 index 000000000..e941e12d2 --- /dev/null +++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py @@ -0,0 +1,392 @@ +import argparse +import sys +import vcfpy +import pyranges as pr, pandas as pd +import numpy as np +import re +from collections import OrderedDict +import logging + +########################## Reading vcf files ########################## +def create_vcf_reader(args): + vcf_reader = vcfpy.Reader.from_path(args.input_vcf) + if args.rna_vcf : + rna_vcf_reader = vcfpy.Reader.from_path(args.rna_vcf) + else : rna_vcf_reader=None + #expression part + is_multi_sample = len(vcf_reader.header.samples.names) > 1 + if is_multi_sample and args.sample_name is None: + vcf_reader.close() + raise Exception("ERROR: VCF {} contains more than one sample. Please use the -s option to specify which sample to annotate.".format(args.input_vcf)) + elif is_multi_sample and args.sample_name not in vcf_reader.header.samples.names: + vcf_reader.close() + raise Exception("ERROR: VCF {} does not contain a sample column for sample {}.".format(args.input_vcf, args.sample_name)) + if 'CSQ' not in vcf_reader.header.info_ids(): + vcf_reader.close() + raise Exception("ERROR: VCF {} is not VEP-annotated. Please annotate the VCF with VEP before running this tool.".format(args.input_vcf)) + if args.mode == 'gene' and 'GX' in vcf_reader.header.format_ids(): + vcf_reader.close() + raise Exception("ERROR: VCF {} is already gene expression annotated. GX format header already exists.".format(args.input_vcf)) + elif args.mode == 'transcript' and 'TX' in vcf_reader.header.format_ids(): + vcf_reader.close() + raise Exception("ERROR: VCF {} is already transcript expression annotated. TX format header already exists.".format(args.input_vcf)) + #snv part + + return vcf_reader, rna_vcf_reader, is_multi_sample +########################## Adding extra fields into Format ########################## +def create_vcf_writer(args, vcf_reader): + (head, sep, tail) = args.input_vcf.rpartition('.vcf') + new_header = vcf_reader.header.copy() + if args.mode == 'gene': + new_header.add_format_line(vcfpy.OrderedDict([('ID', 'GX'), ('Number', '.'), ('Type', 'String'), ('Description', 'Gene Expressions')])) + output_file = ('').join([head, '.gx.vcf', tail]) + elif args.mode == 'transcript': + new_header.add_format_line(vcfpy.OrderedDict([('ID', 'TX'), ('Number', '.'), ('Type', 'String'), ('Description', 'Transcript Expressions')])) + output_file = ('').join([head, '.tx.vcf', tail]) + if (args.rna_vcf) : + new_header.add_format_line(vcfpy.OrderedDict([ + ('ID','RAD'),('Number', '.'), ('Type', 'String'),('Description', 'Allelic depths from mRNA sequencing data of the same sample') + ])) + new_header.add_format_line(vcfpy.OrderedDict([ + ('ID','ROT'),('Number', '.'), ('Type', 'String'),('Description', 'Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample') + ])) + (head,sep,tail) = output_file.rpartition('.vcf') + output_file = ('').join([head, '.pileup.vcf', tail]) + if args.output_vcf: + output_file = args.output_vcf + return vcfpy.Writer.from_path(output_file, new_header) + +########################## Auto assign gene id column based on different tools ########################## +def resolve_id_column(args): + if args.format == 'cufflinks': + return 'tracking_id' + elif args.format == 'kallisto': + if args.mode == 'gene': + return 'gene' + elif args.mode == 'transcript': + return 'target_id' + elif args.format == 'stringtie': + if args.mode == 'gene': + return 'Gene ID' + elif args.format == 'star': + return "gene_id" # NEED change + elif args.format == 'custom': + if args.id_column is None: + raise Exception("ERROR: `--id-column` option is required when using the `custom` format") + else: + return args.id_column +########################## Auto assign expression column based on different tools ########################## +def resolve_expression_column(args): + if args.format == 'cufflinks': + return 'FPKM' + elif args.format == 'kallisto': + if args.mode == 'gene': + return 'abundance' + elif args.mode == 'transcript': + return 'tpm' + elif args.format == 'stringtie': + return 'TPM' + elif args.format == 'star': + if args.expression_column is None: + raise Exception("ERROR: `--expression-column` option is required when using the `star` format") + else: + return 'TPM_' + args.expression_column + elif (args.format == 'custom'): + if args.expression_column is None: + raise Exception("ERROR: `--expression-column` option is required when using the `custom` format") + else: + return args.expression_column + +def to_array(dictionary): + array = [] + for key, value in dictionary.items(): + array.append("{}|{}".format(key, value)) + return sorted(array) + +def resolve_stringtie_id_column(args, headers): + if 'gene_id' in headers: + return 'gene_id' + else: + return 'transcript_id' + +#Star TPM calculation +#If there is genecount file, GENECODE file is required to calculate TPM +##########################Star TPM Calculation########################## +def genecount_reader(genecount_path): + col_name=["gene_id","unstranded","rf","fr"] + star_genecount = pd.read_csv(genecount_path, + skiprows=4, + sep="\t", + names=col_name, + header=None, + ).sort_values(by=["gene_id"]) + #star_genecount["Ensembl_gene_identifier"] = star_genecount.Ensembl_gene_identifier.map(lambda x: x.split(".")[0]) + star_genecount.unstranded=star_genecount.unstranded.astype(int,copy=None) + star_genecount.rf=star_genecount.rf.astype(int,copy=None) + star_genecount.fr=star_genecount.fr.astype(int,copy=None) + #star_genecount.set_index("gene_id") + return star_genecount + +def genecode_reader(genecode_path,star_genecount,expression_column): + gc = pr.read_gtf(genecode_path,as_df=True) + gc = gc[gc["gene_id"].isin(star_genecount["gene_id"])] + gc = gc[(gc.Feature=="gene")] + exon = gc[["Chromosome","Start","End","gene_id","gene_name"]] + # Convert columns to proper types. + exon.Start = exon.Start.astype(int) + exon.End = exon.End.astype(int) + # Sort in place. + exon.sort_values(by=['Chromosome','Start','End'], inplace=True) + # Group the rows by the Ensembl gene identifier (with version numbers.) + groups = exon.groupby('gene_id') + + lengths = groups.apply(count_bp) + # Create a new DataFrame with gene lengths and EnsemblID. + ensembl_no_version = lengths.index + ldf = pd.DataFrame({'length': lengths.values, + 'gene_id': ensembl_no_version}) + star_tpm = pd.merge(ldf,star_genecount, how='inner',on='gene_id') + star_tpm["TPM_" + expression_column]= calculate_TPM(star_tpm,expression_column) + return star_tpm + +def calculate_TPM(df,strand): + effLen = (df["length"])/1000 + rate = df[strand]/effLen + denom = sum(rate)/(10**6) + return rate/denom + +def count_bp(df): + """Given a DataFrame with the exon coordinates from Gencode for a single + gene, return the total number of coding bases in that gene. + Example: + >>> import numpy as np + >>> n = 3 + >>> r = lambda x: np.random.sample(x) * 10 + >>> d = pd.DataFrame([np.sort([a,b]) for a,b in zip(r(n), r(n))], columns=['start','end']).astype(int) + >>> d + start end + 0 6 9 + 1 3 4 + 2 4 9 + >>> count_bp(d) + 7 + Here is a visual representation of the 3 exons and the way they are added: + 123456789 Length + 0 ---- 4 + 1 -- 2 + 2 ------ 6 + ======= 7 + """ + start = df.Start.min() + end = df.End.max() + bp = [False] * (end - start + 1) + for i in range(df.shape[0]): + s = df.iloc[i]['Start'] - start + e = df.iloc[i]['End'] - start + 1 + bp[s:e] = [True] * (e - s) + return sum(bp) + +##########################Passing expression file########################## +def parse_expression_file(args, vcf_reader, vcf_writer): + if args.format == 'stringtie' and args.mode == 'transcript': + df_all = pr.read_gtf(args.expression_file) + df = df_all[df_all["feature"] == "transcript"] + id_column = resolve_stringtie_id_column(args, df.columns.values) + elif args.format == "star" : + id_column = resolve_id_column(args) + star_genecount = genecount_reader(args.expression_file) + df = genecode_reader(args.genecode,star_genecount,args.expression_column) + else: + id_column = resolve_id_column(args) + df = pd.read_csv(args.expression_file, sep='\t') + if args.ignore_ensembl_id_version: + df['transcript_without_version'] = df[id_column].apply(lambda x: re.sub(r'\.[0-9]+$', '', x)) + expression_column = resolve_expression_column(args) + if expression_column not in df.columns.values: + vcf_reader.close() + vcf_writer.close() + raise Exception("ERROR: expression_column header {} does not exist in expression_file {}".format(expression_column, args.expression_file)) + if id_column not in df.columns.values: + vcf_reader.close() + vcf_writer.close() + raise Exception("ERROR: id_column header {} does not exist in expression_file {}".format(id_column, args.expression_file)) + return df, id_column, expression_column + +##########################Adding RNA AD ########################## +def add_AD_rna_file(entry,is_multi_sample, sample_name,rna_vcf): + RAD_temp=[] + ROT = [] + for rna_record in rna_vcf.fetch(entry.CHROM,entry.affected_start,entry.affected_end): + rna_alt= [alt.value for alt in rna_record.ALT] + rna_AD = [c.data.get('AD') for c in rna_record.calls][0] + ROT_temp=sum(rna_AD) - rna_AD[0] + if not rna_alt: + continue + else: + rna_AD = [c.data.get('AD') for c in rna_record.calls][0] + for dna_alt in entry.ALT: + if dna_alt.value in rna_alt: + index=rna_record.ALT.index(dna_alt) + RAD_temp.append(rna_AD[0]) #AD of REF + RAD_temp.append(rna_AD[index+1]) #AD of ALT + ROT_temp = ROT_temp - rna_AD[index+1] + ROT.append(ROT_temp) + if is_multi_sample: + entry.FORMAT += ['RAD'] + entry.call_for_sample[sample_name].data['RAD'] = RAD_temp + entry.FORMAT += ['ROT'] + entry.call_for_sample[sample_name].data['ROT'] = ROT + else: + entry.add_format('RAD', RAD_temp) + entry.add_format('ROT', ROT) + return entry + +##########################Adding Expression file ########################## +def add_expressions(entry, is_multi_sample, sample_name, df, items, tag, id_column, expression_column, ignore_ensembl_id_version, missing_expressions_count, entry_count): + expressions = {} + for item in items: + entry_count += 1 + if ignore_ensembl_id_version: + subset = df.loc[df['transcript_without_version'] == re.sub(r'\.[0-9]+$', '', item)] + else: + subset = df.loc[df[id_column] == item] + if len(subset) > 0: + expressions[item] = subset[expression_column].sum() + else: + missing_expressions_count += 1 + if is_multi_sample: + entry.FORMAT += [tag] + entry.call_for_sample[sample_name].data[tag] = to_array(expressions) + else: + entry.add_format(tag, to_array(expressions)) + return (entry, missing_expressions_count, entry_count) + +########################## Define tool parameters ########################## +def define_parser(): + parser = argparse.ArgumentParser( + "comb_rna.py", + description = "A tool that will add the data from several expression tools' output files" + + "and allelic depths from mRNA sequencing data for snvs" + + "to the VCF INFO column. Supported tools are StringTie, Kallisto, Star" + + "and Cufflinks. There also is a ``custom`` option to annotate with data " + + "from any tab-delimited file." + ) + + parser.add_argument( + "--input_vcf", + help="A VEP-annotated VCF file" + ) + parser.add_argument( + "--expression_file", + help="A TSV file containing expression estimates" + ) + parser.add_argument( + "--genecode", + help="A genecode file for calculate TPM from star gene count" + ) + parser.add_argument( + "--rna-vcf", + help="A VCF file of RNA-seq data", + ) + parser.add_argument( + "--format", + choices=['kallisto','stringtie','cufflinks','star','custom'], + help="The file format of the expression file to process. " + +"Use `custom` to process file formats not explicitly supported. " + +"The `custom` option requires the use of the --id-column and --expression-column arguments." + ) + parser.add_argument( + "--mode", + choices=['gene', 'transcript'], + help="The type of expression data in the expression_file" + ) + parser.add_argument( + "-i", "--id-column", + help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format." + ) + parser.add_argument( + "-e", "--expression-column", + help="The column header in the expression_file for the column containing expression data. Required when using the `custom` and `star` format." + ) + parser.add_argument( + "-s", "--sample-name", + help="If the input_vcf contains multiple samples, the name of the sample to annotate." + ) + parser.add_argument( + "-o", "--output-vcf", + help="Path to write the output VCF file. If not provided, the output VCF file will be " + +"written next to the input VCF file with a .tx.vcf or .gx.vcf file ending." + ) + parser.add_argument( + "--ignore-ensembl-id-version", + help='Assumes that the final period and number denotes the Ensembl ID version and ignores it (i.e. for "ENST00001234.3" - ignores the ".3").', + action="store_true" + ) + + return parser + +def main(args_input = sys.argv[1:]): + parser = define_parser() + args = parser.parse_args(args_input) + CONTIGS = ['chr{}'.format(i) for i in range(1, 23)] + ['chrX', 'chrY'] + if args.format == 'custom': + if args.id_column is None: + raise Exception("--id-column is not set. This is required when using the `custom` format.") + if args.expression_column is None: + raise Exception("--expression-column is not set. This is required when using the `custom` format.") + if args.format == 'star': + if args.genecode is None: + raise Exception("--genecode is not set. This is required when using the `star` format") + exp_col = ["unstranded","rf","fr"] + if args.expression_column not in exp_col: + raise Exception(""" + --expression-column is not found. Please choose : + unstranded - Assumes a unstranded library. + rf - Assumes a stranded library fr-firststrand. + fr - Assumes a stranded library fr-secondstrand. + """) + (vcf_reader,rna_vcf_reader, is_multi_sample) = create_vcf_reader(args) + format_pattern = re.compile('Format: (.*)') + csq_format = format_pattern.search(vcf_reader.header.get_info_field_info('CSQ').description).group(1).split('|') + + vcf_writer = create_vcf_writer(args, vcf_reader) + (df, id_column, expression_column) = parse_expression_file(args, vcf_reader, vcf_writer) + missing_expressions_count = 0 + entry_count = 0 + for entry in vcf_reader: + #Add expression data + transcript_ids = set() + genes = set() + if 'CSQ' not in entry.INFO: + logging.warning("Variant is missing VEP annotation. INFO column doesn't contain CSQ field for variant {}".format(entry)) + vcf_writer.write_record(entry) + continue + for transcript in entry.INFO['CSQ']: + for key, value in zip(csq_format, transcript.split('|')): + if key == 'Feature' and value != '' and not value.startswith('ENSR'): + transcript_ids.add(value) + if key == 'Gene' and value != '': + genes.add(value) + + if args.mode == 'gene': + genes = list(genes) + if len(genes) > 0: + (entry, missing_expressions_count, entry_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, genes, 'GX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count, entry_count) + elif args.mode == 'transcript': + transcript_ids = list(transcript_ids) + if len(transcript_ids) > 0: + (entry, missing_expressions_count, entry_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, transcript_ids, 'TX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count, entry_count) + #Add RAD and ROT + if rna_vcf_reader is not None: + if (entry.CHROM in CONTIGS) and (entry.is_snv()): + entry = add_AD_rna_file(entry, is_multi_sample, args.sample_name,rna_vcf_reader) + vcf_writer.write_record(entry) + + vcf_reader.close() + vcf_writer.close() + + if missing_expressions_count > 0: + logging.warning("{} of {} {}s did not have an expression entry for their {} id.".format(missing_expressions_count, entry_count, args.mode, args.mode)) + +if __name__ == '__main__': + main() diff --git a/snappy_wrappers/wrappers/pvactools/combining/environment.yaml b/snappy_wrappers/wrappers/pvactools/combining/environment.yaml new file mode 100644 index 000000000..2ea441ab8 --- /dev/null +++ b/snappy_wrappers/wrappers/pvactools/combining/environment.yaml @@ -0,0 +1,10 @@ +channels: + - conda-forge + - bioconda +dependencies: + - vcfpy + - numpy + - pyranges + - pandas + - python + - bcftools diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py new file mode 100644 index 000000000..1dd23a267 --- /dev/null +++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +""" +Wrapper for preparation the vcf file for somatic neoepitope prediction +""" + +import os +from snakemake.shell import shell + +__author__ = "Pham Gia Cuong" +__email__ = "pham.gia-cuong@bih-charite.de" + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step] + +exp_format = config['preparation']['format'] +preparation_vcf = os.path.join( + os.path.dirname(__file__), + "comb_rna.py", +) +ensemble_id = ( + "--ignore-ensembl-id-version" + if config['preparation']['ignore-ensembl-id-version'] + else "" +) + +shell( + r""" +# ----------------------------------------------------------------------------- +# Redirect stderr to log file by default and enable printing executed commands +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi +# ----------------------------------------------------------------------------- +# Create auto-cleaned temporary directory +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT + +set -x +# ----------------------------------------------------------------------------- + +# Write out information about conda installation +conda list > {snakemake.log.conda_list} +conda info > {snakemake.log.conda_info} + +# Getting region of SNVs only +bcftools filter -i 'TYPE="snp"' {snakemake.input.vcf} | bcftools query -f '%CHROM\t%POS0\t%END\n' > $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed + +# Getting snvs from RNA sequencing data +bcftools mpileup -Ov \ + --annotate FORMAT/AD,FORMAT/DP \ + -R $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed \ + -f {snakemake.config[static_data_config][reference][path]} \ + --per-sample-mF \ + --max-depth 4000 \ + --redo-BAQ -Oz \ + -o $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \ + {snakemake.input.bam} +tabix -f $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz + +#Need to add option for RNA vcf as well +python3 -W ignore {preparation_vcf} \ + --genecode {snakemake.config[static_data_config][features][path]} \ + --input_vcf {snakemake.input.vcf} --format {config[preparation][format]} \ + --rna-vcf $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \ + --expression_file {snakemake.input.expression} \ + --mode {config[preparation][mode]} \ + -e {config[preparation][expression-column]} \ + -s {snakemake.wildcards.tumor_library} \ + -o /dev/stdout \ + "{ensemble_id}" \ +| bgzip -c > {snakemake.output.vcf} +tabix -f {snakemake.output.vcf} + +pushd $(dirname {snakemake.output.vcf}) +md5sum $(basename {snakemake.output.vcf}) >$(basename {snakemake.output.vcf}).md5 +md5sum $(basename {snakemake.output.vcf_tbi}) >$(basename {snakemake.output.vcf_tbi}).md5 +""" +) diff --git a/snappy_wrappers/wrappers/pvactools/environment.yaml b/snappy_wrappers/wrappers/pvactools/environment.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py new file mode 100644 index 000000000..e69de29bb diff --git a/snappy_wrappers/wrappers/stringtie/environment.yaml b/snappy_wrappers/wrappers/stringtie/environment.yaml new file mode 100644 index 000000000..b79f83c81 --- /dev/null +++ b/snappy_wrappers/wrappers/stringtie/environment.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - stringtie==2.2.1 \ No newline at end of file diff --git a/snappy_wrappers/wrappers/stringtie/wrapper.py b/snappy_wrappers/wrappers/stringtie/wrapper.py new file mode 100644 index 000000000..f71436713 --- /dev/null +++ b/snappy_wrappers/wrappers/stringtie/wrapper.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" +Wrapper for quantify gene and transcript expression +""" + +from snakemake.shell import shell + +__author__ = "Pham Gia Cuong" +__email__ = "pham.gia-cuong@bih-charite.de" + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step] + +shell( + r""" +# ----------------------------------------------------------------------------- +# Redirect stderr to log file by default and enable printing executed commands +exec 2> >(tee -a "{snakemake.log.log}") +set -x +# ----------------------------------------------------------------------------- + +# Write out information about conda installation +conda list > {snakemake.log.conda_list} +conda info > {snakemake.log.conda_info} + + +stringtie \ + -G {snakemake.config[static_data_config][features][path]} \ + -p 4 \ + -v \ + -o {snakemake.output.expression} \ + {snakemake.input.rna_bam} + +pushd $(dirname {snakemake.output.expression}) +md5sum $(basename {snakemake.output.expression}) > $(basename {snakemake.output.expression_md5}) +""" +) + + +# Compute MD5 sums of logs +shell( + r""" +md5sum {snakemake.log.log} > {snakemake.log.log_md5} +md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5} +""" +) \ No newline at end of file diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py new file mode 100644 index 000000000..eac6138d7 --- /dev/null +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py @@ -0,0 +1,218 @@ +# -*- coding: utf-8 -*- +"""Tests for the somatic_variant_calling workflow module code""" + +import textwrap + +import pytest +import ruamel.yaml as ruamel_yaml + +from snappy_pipeline.workflows.somatic_neoepitope_prediction import ( + SomaticNeoepitopePredictionWorkflow, +) + +from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict +from .conftest import patch_module_fs + + +# Test tumor mutational burden calculation with vcf file from somatic variant calling step +@pytest.fixture(scope="module") # otherwise: performance issues +def minimal_config(): + """Return YAML parsing result for configuration""" + yaml = ruamel_yaml.YAML() + return yaml.load( + textwrap.dedent( + r""" + static_data_config: + reference: + path: /path/to/ref.fa + cosmic: + path: /path/to/cosmic.vcf.gz + dbsnp: + path: /path/to/dbsnp.vcf.gz + features: + path: /path/to/genecode.gtf + + step_config: + ngs_mapping: + tools: + dna: ['bwa'] + rna: ['star'] + star: + path_index: /path/to/index + path_features: + bwa: + path_index: /path/to/bwa/index.fa + + somatic_variant_calling: + tools: + - mutect2 + - scalpel + scalpel: + path_target_regions: /path/to/target/regions.bed + + somatic_variant_annotation: + tools: ["vep"] + vep: + path_dir_cache: /path/to/dir/cache + + somatic_neoepitope_prediction: + path_somatic_variant_annotation: ../somatic_variant_annotation + path_rna_ngs_mapping: ../ngs_mapping + tools_somatic_variant_annotation: [] + tools_rna_mapping: [] + tools_ngs_mapping: [] + tools_somatic_variant_calling: [] + max_depth: "4000" + preparation: + format: 'star' + path_features: '' + mode: 'gene' + id-column: '' + expression-column: 'fr' + ignore-ensembl-id-version: True + + data_sets: + first_batch: + file: sheet.tsv + search_patterns: + - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} + search_paths: ['/path'] + type: matched_cancer + naming_scheme: only_secondary_id + """ + ).lstrip() + ) + +@pytest.fixture +def somatic_neoepitope_prediction_workflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + work_dir, + config_paths, + cancer_sheet_fake_fs, + mocker, +): + # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we + # can obtain paths from the function as if we really had a NGSMappingPipelineStep there + dummy_workflow.globals = { + "ngs_mapping": lambda x: "NGS_MAPPING/" + x, + "somatic_variant_annotation": lambda x: "SOMATIC_VARIANT_ANNOTATION/" + x, + } + # Construct the workflow object + return SomaticNeoepitopePredictionWorkflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + config_paths, + work_dir, + ) + +def test_neoepitope_preparation_step_part_get_input_files(somatic_neoepitope_prediction_workflow): + # Define expected + vcf_base_out = ( + "SOMATIC_VARIANT_ANNOTATION/output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.{tumor_library}" + ) + ngs_mapping_base_out = "NGS_MAPPING/output/star.P002-T2-RNA1-mRNA_seq1/out/" + expected = { + "vcf": vcf_base_out + ".full.vcf.gz", + "vcf_tbi": vcf_base_out + ".full.vcf.gz.tbi", + "expression": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.GeneCounts.tab", + "bam": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.bam", + "bai": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.bam.bai", + } + + # Get actual + actual = somatic_neoepitope_prediction_workflow.get_input_files("neoepitope_preparation", "run") + assert actual == expected + +def test_neoepitope_preparation_step_part_get_output_files(somatic_neoepitope_prediction_workflow): + base_out = ( + "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + ) + expected = get_expected_output_vcf_files_dict(base_out) + actual = somatic_neoepitope_prediction_workflow.get_output_files("neoepitope_preparation", "run") + assert actual == expected + +def test_neoepitope_preparation_step_part_get_log_files(somatic_neoepitope_prediction_workflow): + base_out = ( + "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" + "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + ) + expected = get_expected_log_files_dict(base_out=base_out) + actual = somatic_neoepitope_prediction_workflow.get_log_file("neoepitope_preparation", "run") + assert actual == expected + +def test_neoepitope_preparation_step_part_get_resource_usage(somatic_neoepitope_prediction_workflow): + # Define expected + expected_dict = {"threads": 2, "time": "1:00:00", "memory": "6144M"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_neoepitope_prediction_workflow.get_resource("neoepitope_preparation", "run", resource) + assert actual == expected, msg_error + +def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_workflow): + """Test simple functionality of the workflow""" + # Check created sub steps + expected = ["link_out", "neoepitope_preparation"] + actual = list(sorted(somatic_neoepitope_prediction_workflow.sub_steps.keys())) + assert actual == expected + + # Check result file construction + tpl = ( + "output/{mapper}.{var_caller}.{anno_caller}.GX.P00{i}-T{t}-DNA1-WGS1/{dir_}/" + "{mapper}.{var_caller}.{anno_caller}.GX.P00{i}-T{t}-DNA1-WGS1.{ext}" + ) + expected = [ + tpl.format( + mapper=mapper, + var_caller=var_caller, + anno_caller=anno_caller, + i=i, + t=t, + ext=ext, + dir_="out", + ) + for i,t in ((1, 1), (2, 2)) + for ext in ( + "vcf.gz", + "vcf.gz.tbi", + "vcf.gz.md5", + "vcf.gz.tbi.md5" + ) + for mapper in("bwa",) + for var_caller in ("mutect2", "scalpel") + for anno_caller in ("vep",) + ] + + expected += [ + tpl.format( + mapper=mapper, + var_caller=var_caller, + anno_caller=anno_caller, + i=i, + t=t, + ext=ext, + dir_="log", + ) + for i,t in ((1, 1), (2, 2)) + for ext in ( + "conda_info.txt", + "conda_list.txt", + "log", + "conda_info.txt.md5", + "conda_list.txt.md5", + "log.md5", + ) + for mapper in("bwa",) + for var_caller in ("mutect2", "scalpel") + for anno_caller in ("vep",) + ] + expected = list(sorted(expected)) + actual = list(sorted(somatic_neoepitope_prediction_workflow.get_result_files())) + assert expected == actual From 4eab17a80b7dfedf26ffe013d811a6dc25d60228 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Thu, 4 Apr 2024 13:20:58 +0200 Subject: [PATCH 02/23] Satisfying code format --- .../somatic_neoepitope_prediction/Snakefile | 20 +- .../somatic_neoepitope_prediction/__init__.py | 135 +++-- .../wrappers/pvactools/combining/comb_rna.py | 521 ++++++++++++------ .../wrappers/pvactools/combining/wrapper.py | 4 +- 4 files changed, 438 insertions(+), 242 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile index 0d1870dd9..0fcffcce3 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile @@ -38,6 +38,7 @@ rule all: # Generic linking out --------------------------------------------------------- + rule neoepitope_preparation_link_out_run: input: wf.get_input_files("link_out", "run"), @@ -46,23 +47,6 @@ rule neoepitope_preparation_link_out_run: run: shell(wf.get_shell_cmd("link_out", "run", wildcards)) -# rule rna_pileup: -# input: -# **wf.get_input_files("rna_pileup","run") -# output: -# **wf.get_output_files("rna_pileup","run") -# log: -# **wf.get_log_file("rna_pileup", "run"), -# threads: -# wf.get_resource("rna_pileup", "run", "threads"), -# resources: -# time=wf.get_resource("rna_pileup", "run", "time"), -# memory=wf.get_resource("rna_pileup", "run", "memory"), -# partition=wf.get_resource("rna_pileup", "run", "partition"), -# tmpdir=wf.get_resource("rna_pileup", "run", "tmpdir"), -# wrapper: -# wf.wrapper_path("pvactools/mpileup") - rule neoepitope_preparation: input: @@ -71,7 +55,7 @@ rule neoepitope_preparation: **wf.get_output_files("neoepitope_preparation", "run"), log: **wf.get_log_file("neoepitope_preparation", "run"), - threads:wf.get_resource("neoepitope_preparation", "run", "threads"), + threads: wf.get_resource("neoepitope_preparation", "run", "threads") resources: time=wf.get_resource("neoepitope_preparation", "run", "time"), memory=wf.get_resource("neoepitope_preparation", "run", "memory"), diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py index 69afb23e9..6366c5577 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -35,23 +35,25 @@ import os import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background +from biomedsheets.shortcuts import ( + CancerCaseSheet, + CancerCaseSheetOptions, + is_not_background, +) from snakemake.io import expand from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart from snappy_pipeline.workflows.ngs_mapping import ( - READ_MAPPERS_RNA, NgsMappingWorkflow, - ResourceUsage + ResourceUsage, ) from snappy_pipeline.workflows.somatic_variant_calling import ( SOMATIC_VARIANT_CALLERS_MATCHED, SomaticVariantCallingWorkflow, ) from snappy_pipeline.workflows.somatic_variant_annotation import ( - ANNOTATION_TOOLS, - SomaticVariantAnnotationWorkflow + SomaticVariantAnnotationWorkflow, ) @@ -92,6 +94,7 @@ class NeoepitopePreparationStepPart(BaseStepPart): """ Preparation VCF file for pvactool """ + #: Step name name = "neoepitope_preparation" actions = ("run",) @@ -120,13 +123,13 @@ def get_input_files(self, action): "output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.{tumor_library}" ) - #Need to change for work on many different tools + # Need to change for work on many different tools key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"} variant_annotation = self.parent.sub_workflows["somatic_variant_annotation"] - ngs_mapping = self.parent.sub_workflows['ngs_mapping'] + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] for key, ext in key_ext.items(): yield key, variant_annotation(tpl + ext) - #Getting appropriate rna library + # Getting appropriate rna library for sheet in filter(is_not_background, self.parent.sheets): for donor in sheet.bio_entities.values(): for biosample in donor.bio_samples.values(): @@ -141,24 +144,35 @@ def get_input_files(self, action): mapper="star", library_name=rna_library, ) - ext = {"expression","bam","bai"} - yield 'expression',ngs_mapping(rna_tpl + ".GeneCounts.tab") - yield 'bam',ngs_mapping(rna_tpl + ".bam") - yield 'bai',ngs_mapping(rna_tpl + ".bam.bai") - + ext = {"expression", "bam", "bai"} + yield "expression", ngs_mapping( + rna_tpl + ".GeneCounts.tab" + ) + yield "bam", ngs_mapping(rna_tpl + ".bam") + yield "bai", ngs_mapping(rna_tpl + ".bam.bai") @dictify def get_output_files(self, action): - """Return output files """ + """Return output files""" # Validate action # Need to add step for adding RAD also self._validate_action(action) - if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene": + if ( + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "preparation" + ]["mode"] + == "gene" + ): prefix = ( "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" ) - elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript": + elif ( + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "preparation" + ]["mode"] + == "transcript" + ): prefix = ( "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" @@ -173,12 +187,22 @@ def _get_log_file(self, action): """Return mapping of log files.""" # Validate action self._validate_action(action) - if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene": + if ( + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "preparation" + ]["mode"] + == "gene" + ): prefix = ( "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" ) - elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript": + elif ( + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "preparation" + ]["mode"] + == "transcript" + ): prefix = ( "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/" "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" @@ -203,28 +227,44 @@ def get_resource_usage(self, action): @listify def get_result_files(self): callers = set( - self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_somatic_variant_calling"] + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_somatic_variant_calling" + ] ) anno_callers = set( self.w_config["step_config"]["somatic_neoepitope_prediction"][ "tools_somatic_variant_annotation" ] ) - if self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene": + if ( + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "preparation" + ]["mode"] + == "gene" + ): name_pattern = "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library.name}" - elif self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript": + elif ( + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "preparation" + ]["mode"] + == "transcript" + ): name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}" yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"], + mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_ngs_mapping" + ], var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), anno_caller=anno_callers, - ext=EXT_VALUES + ext=EXT_VALUES, ) yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"], + mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_ngs_mapping" + ], var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), anno_caller=anno_callers, ext=( @@ -263,14 +303,15 @@ def _yield_result_files_matched(self, tpl, **kwargs): ) - class SomaticNeoepitopePredictionWorkflow(BaseStep): """Perform neoepitope prediction workflow""" name = "neoepitope_prediction" sheet_shortcut_class = CancerCaseSheet sheet_shortcut_kwargs = { - "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) + "options": CancerCaseSheetOptions( + allow_missing_normal=True, allow_missing_tumor=True + ) } @classmethod @@ -285,20 +326,30 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (SomaticVariantCallingWorkflow, SomaticVariantAnnotationWorkflow, NgsMappingWorkflow), + ( + SomaticVariantCallingWorkflow, + SomaticVariantAnnotationWorkflow, + NgsMappingWorkflow, + ), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart)) self.register_sub_workflow( "somatic_variant_annotation", - self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_somatic_variant_annotation"], + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "path_somatic_variant_annotation" + ], ) self.register_sub_workflow( "ngs_mapping", - self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"] + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "path_rna_ngs_mapping" + ], ) # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]: + if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_ngs_mapping" + ]: self.w_config["step_config"]["somatic_neoepitope_prediction"][ "tools_ngs_mapping" ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] @@ -318,14 +369,14 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) "tools_rna_mapping" ]: self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_rna_mapping" + "tools_rna_mapping" ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"] - if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ - "path_features" - ]: - self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ - "path_features" - ] = self.w_config["static_data_config"]["features"]["path"] + if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "preparation" + ]["path_features"]: + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "preparation" + ]["path_features"] = self.w_config["static_data_config"]["features"]["path"] def get_result_files(self): for sub_step in self.sub_steps.values(): @@ -335,16 +386,20 @@ def get_result_files(self): def check_config(self): """Check that the path to the NGS mapping is present""" self.ensure_w_config( - ("step_config", "somatic_neoepitope_prediction", "path_somatic_variant_annotation"), + ( + "step_config", + "somatic_neoepitope_prediction", + "path_somatic_variant_annotation", + ), "Path to variant (directory of vcf files) not configured but required for somatic neoepitope prediction", ) self.ensure_w_config( - ("step_config", "somatic_neoepitope_prediction", "preparation","mode"), + ("step_config", "somatic_neoepitope_prediction", "preparation", "mode"), "The mode is required for adding gene expression data to somatic variant annotated file", ) self.ensure_w_config( - ("step_config", "somatic_neoepitope_prediction", "preparation","format"), + ("step_config", "somatic_neoepitope_prediction", "preparation", "format"), "Format is required for adding ", ) diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py index e941e12d2..ef40f0151 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py +++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py @@ -1,159 +1,232 @@ import argparse import sys import vcfpy -import pyranges as pr, pandas as pd -import numpy as np +import pyranges as pr +import pandas as pd import re -from collections import OrderedDict import logging + ########################## Reading vcf files ########################## def create_vcf_reader(args): vcf_reader = vcfpy.Reader.from_path(args.input_vcf) - if args.rna_vcf : + if args.rna_vcf: rna_vcf_reader = vcfpy.Reader.from_path(args.rna_vcf) - else : rna_vcf_reader=None - #expression part + else: + rna_vcf_reader = None + # expression part is_multi_sample = len(vcf_reader.header.samples.names) > 1 if is_multi_sample and args.sample_name is None: vcf_reader.close() - raise Exception("ERROR: VCF {} contains more than one sample. Please use the -s option to specify which sample to annotate.".format(args.input_vcf)) + raise Exception( + "ERROR: VCF {} contains more than one sample. Please use the -s option to specify which sample to annotate.".format( + args.input_vcf + ) + ) elif is_multi_sample and args.sample_name not in vcf_reader.header.samples.names: vcf_reader.close() - raise Exception("ERROR: VCF {} does not contain a sample column for sample {}.".format(args.input_vcf, args.sample_name)) - if 'CSQ' not in vcf_reader.header.info_ids(): + raise Exception( + "ERROR: VCF {} does not contain a sample column for sample {}.".format( + args.input_vcf, args.sample_name + ) + ) + if "CSQ" not in vcf_reader.header.info_ids(): vcf_reader.close() - raise Exception("ERROR: VCF {} is not VEP-annotated. Please annotate the VCF with VEP before running this tool.".format(args.input_vcf)) - if args.mode == 'gene' and 'GX' in vcf_reader.header.format_ids(): + raise Exception( + "ERROR: VCF {} is not VEP-annotated. Please annotate the VCF with VEP before running this tool.".format( + args.input_vcf + ) + ) + if args.mode == "gene" and "GX" in vcf_reader.header.format_ids(): vcf_reader.close() - raise Exception("ERROR: VCF {} is already gene expression annotated. GX format header already exists.".format(args.input_vcf)) - elif args.mode == 'transcript' and 'TX' in vcf_reader.header.format_ids(): + raise Exception( + "ERROR: VCF {} is already gene expression annotated. GX format header already exists.".format( + args.input_vcf + ) + ) + elif args.mode == "transcript" and "TX" in vcf_reader.header.format_ids(): vcf_reader.close() - raise Exception("ERROR: VCF {} is already transcript expression annotated. TX format header already exists.".format(args.input_vcf)) - #snv part + raise Exception( + "ERROR: VCF {} is already transcript expression annotated. TX format header already exists.".format( + args.input_vcf + ) + ) + # snv part return vcf_reader, rna_vcf_reader, is_multi_sample + + ########################## Adding extra fields into Format ########################## def create_vcf_writer(args, vcf_reader): - (head, sep, tail) = args.input_vcf.rpartition('.vcf') + (head, sep, tail) = args.input_vcf.rpartition(".vcf") new_header = vcf_reader.header.copy() - if args.mode == 'gene': - new_header.add_format_line(vcfpy.OrderedDict([('ID', 'GX'), ('Number', '.'), ('Type', 'String'), ('Description', 'Gene Expressions')])) - output_file = ('').join([head, '.gx.vcf', tail]) - elif args.mode == 'transcript': - new_header.add_format_line(vcfpy.OrderedDict([('ID', 'TX'), ('Number', '.'), ('Type', 'String'), ('Description', 'Transcript Expressions')])) - output_file = ('').join([head, '.tx.vcf', tail]) - if (args.rna_vcf) : - new_header.add_format_line(vcfpy.OrderedDict([ - ('ID','RAD'),('Number', '.'), ('Type', 'String'),('Description', 'Allelic depths from mRNA sequencing data of the same sample') - ])) - new_header.add_format_line(vcfpy.OrderedDict([ - ('ID','ROT'),('Number', '.'), ('Type', 'String'),('Description', 'Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample') - ])) - (head,sep,tail) = output_file.rpartition('.vcf') - output_file = ('').join([head, '.pileup.vcf', tail]) + if args.mode == "gene": + new_header.add_format_line( + vcfpy.OrderedDict( + [ + ("ID", "GX"), + ("Number", "."), + ("Type", "String"), + ("Description", "Gene Expressions"), + ] + ) + ) + output_file = ("").join([head, ".gx.vcf", tail]) + elif args.mode == "transcript": + new_header.add_format_line( + vcfpy.OrderedDict( + [ + ("ID", "TX"), + ("Number", "."), + ("Type", "String"), + ("Description", "Transcript Expressions"), + ] + ) + ) + output_file = ("").join([head, ".tx.vcf", tail]) + if args.rna_vcf: + new_header.add_format_line( + vcfpy.OrderedDict( + [ + ("ID", "RAD"), + ("Number", "."), + ("Type", "String"), + ( + "Description", + "Allelic depths from mRNA sequencing data of the same sample", + ), + ] + ) + ) + new_header.add_format_line( + vcfpy.OrderedDict( + [ + ("ID", "ROT"), + ("Number", "."), + ("Type", "String"), + ( + "Description", + "Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample", + ), + ] + ) + ) + (head, sep, tail) = output_file.rpartition(".vcf") + output_file = ("").join([head, ".pileup.vcf", tail]) if args.output_vcf: output_file = args.output_vcf return vcfpy.Writer.from_path(output_file, new_header) + ########################## Auto assign gene id column based on different tools ########################## def resolve_id_column(args): - if args.format == 'cufflinks': - return 'tracking_id' - elif args.format == 'kallisto': - if args.mode == 'gene': - return 'gene' - elif args.mode == 'transcript': - return 'target_id' - elif args.format == 'stringtie': - if args.mode == 'gene': - return 'Gene ID' - elif args.format == 'star': - return "gene_id" # NEED change - elif args.format == 'custom': + if args.format == "cufflinks": + return "tracking_id" + elif args.format == "kallisto": + if args.mode == "gene": + return "gene" + elif args.mode == "transcript": + return "target_id" + elif args.format == "stringtie": + if args.mode == "gene": + return "Gene ID" + elif args.format == "star": + return "gene_id" # NEED change + elif args.format == "custom": if args.id_column is None: - raise Exception("ERROR: `--id-column` option is required when using the `custom` format") + raise Exception( + "ERROR: `--id-column` option is required when using the `custom` format" + ) else: return args.id_column + + ########################## Auto assign expression column based on different tools ########################## def resolve_expression_column(args): - if args.format == 'cufflinks': - return 'FPKM' - elif args.format == 'kallisto': - if args.mode == 'gene': - return 'abundance' - elif args.mode == 'transcript': - return 'tpm' - elif args.format == 'stringtie': - return 'TPM' - elif args.format == 'star': + if args.format == "cufflinks": + return "FPKM" + elif args.format == "kallisto": + if args.mode == "gene": + return "abundance" + elif args.mode == "transcript": + return "tpm" + elif args.format == "stringtie": + return "TPM" + elif args.format == "star": if args.expression_column is None: - raise Exception("ERROR: `--expression-column` option is required when using the `star` format") + raise Exception( + "ERROR: `--expression-column` option is required when using the `star` format" + ) else: - return 'TPM_' + args.expression_column - elif (args.format == 'custom'): + return "TPM_" + args.expression_column + elif args.format == "custom": if args.expression_column is None: - raise Exception("ERROR: `--expression-column` option is required when using the `custom` format") + raise Exception( + "ERROR: `--expression-column` option is required when using the `custom` format" + ) else: return args.expression_column + def to_array(dictionary): array = [] for key, value in dictionary.items(): array.append("{}|{}".format(key, value)) return sorted(array) + def resolve_stringtie_id_column(args, headers): - if 'gene_id' in headers: - return 'gene_id' + if "gene_id" in headers: + return "gene_id" else: - return 'transcript_id' + return "transcript_id" + -#Star TPM calculation -#If there is genecount file, GENECODE file is required to calculate TPM -##########################Star TPM Calculation########################## def genecount_reader(genecount_path): - col_name=["gene_id","unstranded","rf","fr"] - star_genecount = pd.read_csv(genecount_path, - skiprows=4, - sep="\t", - names=col_name, - header=None, - ).sort_values(by=["gene_id"]) - #star_genecount["Ensembl_gene_identifier"] = star_genecount.Ensembl_gene_identifier.map(lambda x: x.split(".")[0]) - star_genecount.unstranded=star_genecount.unstranded.astype(int,copy=None) - star_genecount.rf=star_genecount.rf.astype(int,copy=None) - star_genecount.fr=star_genecount.fr.astype(int,copy=None) - #star_genecount.set_index("gene_id") + col_name = ["gene_id", "unstranded", "rf", "fr"] + star_genecount = pd.read_csv( + genecount_path, + skiprows=4, + sep="\t", + names=col_name, + header=None, + ).sort_values(by=["gene_id"]) + # star_genecount["Ensembl_gene_identifier"] = star_genecount.Ensembl_gene_identifier.map(lambda x: x.split(".")[0]) + star_genecount.unstranded = star_genecount.unstranded.astype(int, copy=None) + star_genecount.rf = star_genecount.rf.astype(int, copy=None) + star_genecount.fr = star_genecount.fr.astype(int, copy=None) + # star_genecount.set_index("gene_id") return star_genecount -def genecode_reader(genecode_path,star_genecount,expression_column): - gc = pr.read_gtf(genecode_path,as_df=True) + +def genecode_reader(genecode_path, star_genecount, expression_column): + gc = pr.read_gtf(genecode_path, as_df=True) gc = gc[gc["gene_id"].isin(star_genecount["gene_id"])] - gc = gc[(gc.Feature=="gene")] - exon = gc[["Chromosome","Start","End","gene_id","gene_name"]] + gc = gc[(gc.Feature == "gene")] + exon = gc[["Chromosome", "Start", "End", "gene_id", "gene_name"]] # Convert columns to proper types. exon.Start = exon.Start.astype(int) exon.End = exon.End.astype(int) # Sort in place. - exon.sort_values(by=['Chromosome','Start','End'], inplace=True) + exon.sort_values(by=["Chromosome", "Start", "End"], inplace=True) # Group the rows by the Ensembl gene identifier (with version numbers.) - groups = exon.groupby('gene_id') + groups = exon.groupby("gene_id") lengths = groups.apply(count_bp) # Create a new DataFrame with gene lengths and EnsemblID. ensembl_no_version = lengths.index - ldf = pd.DataFrame({'length': lengths.values, - 'gene_id': ensembl_no_version}) - star_tpm = pd.merge(ldf,star_genecount, how='inner',on='gene_id') - star_tpm["TPM_" + expression_column]= calculate_TPM(star_tpm,expression_column) + ldf = pd.DataFrame({"length": lengths.values, "gene_id": ensembl_no_version}) + star_tpm = pd.merge(ldf, star_genecount, how="inner", on="gene_id") + star_tpm["TPM_" + expression_column] = calculate_TPM(star_tpm, expression_column) return star_tpm -def calculate_TPM(df,strand): - effLen = (df["length"])/1000 - rate = df[strand]/effLen - denom = sum(rate)/(10**6) - return rate/denom + +def calculate_TPM(df, strand): + effLen = (df["length"]) / 1000 + rate = df[strand] / effLen + denom = sum(rate) / (10**6) + return rate / denom + def count_bp(df): """Given a DataFrame with the exon coordinates from Gencode for a single @@ -181,73 +254,99 @@ def count_bp(df): end = df.End.max() bp = [False] * (end - start + 1) for i in range(df.shape[0]): - s = df.iloc[i]['Start'] - start - e = df.iloc[i]['End'] - start + 1 + s = df.iloc[i]["Start"] - start + e = df.iloc[i]["End"] - start + 1 bp[s:e] = [True] * (e - s) return sum(bp) -##########################Passing expression file########################## + def parse_expression_file(args, vcf_reader, vcf_writer): - if args.format == 'stringtie' and args.mode == 'transcript': + if args.format == "stringtie" and args.mode == "transcript": df_all = pr.read_gtf(args.expression_file) df = df_all[df_all["feature"] == "transcript"] id_column = resolve_stringtie_id_column(args, df.columns.values) - elif args.format == "star" : + elif args.format == "star": id_column = resolve_id_column(args) star_genecount = genecount_reader(args.expression_file) - df = genecode_reader(args.genecode,star_genecount,args.expression_column) + df = genecode_reader(args.genecode, star_genecount, args.expression_column) else: id_column = resolve_id_column(args) - df = pd.read_csv(args.expression_file, sep='\t') + df = pd.read_csv(args.expression_file, sep="\t") if args.ignore_ensembl_id_version: - df['transcript_without_version'] = df[id_column].apply(lambda x: re.sub(r'\.[0-9]+$', '', x)) + df["transcript_without_version"] = df[id_column].apply( + lambda x: re.sub(r"\.[0-9]+$", "", x) + ) expression_column = resolve_expression_column(args) if expression_column not in df.columns.values: vcf_reader.close() vcf_writer.close() - raise Exception("ERROR: expression_column header {} does not exist in expression_file {}".format(expression_column, args.expression_file)) + raise Exception( + "ERROR: expression_column header {} does not exist in expression_file {}".format( + expression_column, args.expression_file + ) + ) if id_column not in df.columns.values: vcf_reader.close() vcf_writer.close() - raise Exception("ERROR: id_column header {} does not exist in expression_file {}".format(id_column, args.expression_file)) + raise Exception( + "ERROR: id_column header {} does not exist in expression_file {}".format( + id_column, args.expression_file + ) + ) return df, id_column, expression_column -##########################Adding RNA AD ########################## -def add_AD_rna_file(entry,is_multi_sample, sample_name,rna_vcf): - RAD_temp=[] + +def add_AD_rna_file(entry, is_multi_sample, sample_name, rna_vcf): + RAD_temp = [] ROT = [] - for rna_record in rna_vcf.fetch(entry.CHROM,entry.affected_start,entry.affected_end): - rna_alt= [alt.value for alt in rna_record.ALT] - rna_AD = [c.data.get('AD') for c in rna_record.calls][0] - ROT_temp=sum(rna_AD) - rna_AD[0] + for rna_record in rna_vcf.fetch( + entry.CHROM, entry.affected_start, entry.affected_end + ): + rna_alt = [alt.value for alt in rna_record.ALT] + rna_AD = [c.data.get("AD") for c in rna_record.calls][0] + ROT_temp = sum(rna_AD) - rna_AD[0] if not rna_alt: continue else: - rna_AD = [c.data.get('AD') for c in rna_record.calls][0] + rna_AD = [c.data.get("AD") for c in rna_record.calls][0] for dna_alt in entry.ALT: if dna_alt.value in rna_alt: - index=rna_record.ALT.index(dna_alt) - RAD_temp.append(rna_AD[0]) #AD of REF - RAD_temp.append(rna_AD[index+1]) #AD of ALT - ROT_temp = ROT_temp - rna_AD[index+1] + index = rna_record.ALT.index(dna_alt) + RAD_temp.append(rna_AD[0]) # AD of REF + RAD_temp.append(rna_AD[index + 1]) # AD of ALT + ROT_temp = ROT_temp - rna_AD[index + 1] ROT.append(ROT_temp) if is_multi_sample: - entry.FORMAT += ['RAD'] - entry.call_for_sample[sample_name].data['RAD'] = RAD_temp - entry.FORMAT += ['ROT'] - entry.call_for_sample[sample_name].data['ROT'] = ROT + entry.FORMAT += ["RAD"] + entry.call_for_sample[sample_name].data["RAD"] = RAD_temp + entry.FORMAT += ["ROT"] + entry.call_for_sample[sample_name].data["ROT"] = ROT else: - entry.add_format('RAD', RAD_temp) - entry.add_format('ROT', ROT) + entry.add_format("RAD", RAD_temp) + entry.add_format("ROT", ROT) return entry -##########################Adding Expression file ########################## -def add_expressions(entry, is_multi_sample, sample_name, df, items, tag, id_column, expression_column, ignore_ensembl_id_version, missing_expressions_count, entry_count): + +def add_expressions( + entry, + is_multi_sample, + sample_name, + df, + items, + tag, + id_column, + expression_column, + ignore_ensembl_id_version, + missing_expressions_count, + entry_count, +): expressions = {} for item in items: entry_count += 1 if ignore_ensembl_id_version: - subset = df.loc[df['transcript_without_version'] == re.sub(r'\.[0-9]+$', '', item)] + subset = df.loc[ + df["transcript_without_version"] == re.sub(r"\.[0-9]+$", "", item) + ] else: subset = df.loc[df[id_column] == item] if len(subset) > 0: @@ -261,28 +360,24 @@ def add_expressions(entry, is_multi_sample, sample_name, df, items, tag, id_colu entry.add_format(tag, to_array(expressions)) return (entry, missing_expressions_count, entry_count) + ########################## Define tool parameters ########################## def define_parser(): parser = argparse.ArgumentParser( "comb_rna.py", - description = "A tool that will add the data from several expression tools' output files" + - "and allelic depths from mRNA sequencing data for snvs" + - "to the VCF INFO column. Supported tools are StringTie, Kallisto, Star" + - "and Cufflinks. There also is a ``custom`` option to annotate with data " + - "from any tab-delimited file." + description="A tool that will add the data from several expression tools' output files" + + "and allelic depths from mRNA sequencing data for snvs" + + "to the VCF INFO column. Supported tools are StringTie, Kallisto, Star" + + "and Cufflinks. There also is a ``custom`` option to annotate with data " + + "from any tab-delimited file.", ) + parser.add_argument("--input_vcf", help="A VEP-annotated VCF file") parser.add_argument( - "--input_vcf", - help="A VEP-annotated VCF file" - ) - parser.add_argument( - "--expression_file", - help="A TSV file containing expression estimates" + "--expression_file", help="A TSV file containing expression estimates" ) parser.add_argument( - "--genecode", - help="A genecode file for calculate TPM from star gene count" + "--genecode", help="A genecode file for calculate TPM from star gene count" ) parser.add_argument( "--rna-vcf", @@ -290,103 +385,165 @@ def define_parser(): ) parser.add_argument( "--format", - choices=['kallisto','stringtie','cufflinks','star','custom'], + choices=["kallisto", "stringtie", "cufflinks", "star", "custom"], help="The file format of the expression file to process. " - +"Use `custom` to process file formats not explicitly supported. " - +"The `custom` option requires the use of the --id-column and --expression-column arguments." + + "Use `custom` to process file formats not explicitly supported. " + + "The `custom` option requires the use of the --id-column and --expression-column arguments.", ) parser.add_argument( "--mode", - choices=['gene', 'transcript'], - help="The type of expression data in the expression_file" + choices=["gene", "transcript"], + help="The type of expression data in the expression_file", ) parser.add_argument( - "-i", "--id-column", - help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format." + "-i", + "--id-column", + help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format.", ) parser.add_argument( - "-e", "--expression-column", - help="The column header in the expression_file for the column containing expression data. Required when using the `custom` and `star` format." + "-e", + "--expression-column", + help="The column header in the expression_file for the column containing expression data. Required when using the `custom` and `star` format.", ) parser.add_argument( - "-s", "--sample-name", - help="If the input_vcf contains multiple samples, the name of the sample to annotate." + "-s", + "--sample-name", + help="If the input_vcf contains multiple samples, the name of the sample to annotate.", ) parser.add_argument( - "-o", "--output-vcf", + "-o", + "--output-vcf", help="Path to write the output VCF file. If not provided, the output VCF file will be " - +"written next to the input VCF file with a .tx.vcf or .gx.vcf file ending." + + "written next to the input VCF file with a .tx.vcf or .gx.vcf file ending.", ) parser.add_argument( "--ignore-ensembl-id-version", help='Assumes that the final period and number denotes the Ensembl ID version and ignores it (i.e. for "ENST00001234.3" - ignores the ".3").', - action="store_true" + action="store_true", ) return parser -def main(args_input = sys.argv[1:]): - parser = define_parser() - args = parser.parse_args(args_input) - CONTIGS = ['chr{}'.format(i) for i in range(1, 23)] + ['chrX', 'chrY'] - if args.format == 'custom': + +def args_check(args): + if args.format == "custom": if args.id_column is None: - raise Exception("--id-column is not set. This is required when using the `custom` format.") + raise Exception( + "--id-column is not set. This is required when using the `custom` format." + ) if args.expression_column is None: - raise Exception("--expression-column is not set. This is required when using the `custom` format.") - if args.format == 'star': + raise Exception( + "--expression-column is not set. This is required when using the `custom` format." + ) + if args.format == "star": if args.genecode is None: - raise Exception("--genecode is not set. This is required when using the `star` format") - exp_col = ["unstranded","rf","fr"] + raise Exception( + "--genecode is not set. This is required when using the `star` format" + ) + exp_col = ["unstranded", "rf", "fr"] if args.expression_column not in exp_col: - raise Exception(""" + raise Exception( + """ --expression-column is not found. Please choose : unstranded - Assumes a unstranded library. rf - Assumes a stranded library fr-firststrand. fr - Assumes a stranded library fr-secondstrand. - """) - (vcf_reader,rna_vcf_reader, is_multi_sample) = create_vcf_reader(args) - format_pattern = re.compile('Format: (.*)') - csq_format = format_pattern.search(vcf_reader.header.get_info_field_info('CSQ').description).group(1).split('|') + """ + ) + + +def adding_extra_information(args): + CONTIGS = ["chr{}".format(i) for i in range(1, 23)] + ["chrX", "chrY"] + (vcf_reader, rna_vcf_reader, is_multi_sample) = create_vcf_reader(args) + format_pattern = re.compile("Format: (.*)") + csq_format = ( + format_pattern.search(vcf_reader.header.get_info_field_info("CSQ").description) + .group(1) + .split("|") + ) vcf_writer = create_vcf_writer(args, vcf_reader) - (df, id_column, expression_column) = parse_expression_file(args, vcf_reader, vcf_writer) + (df, id_column, expression_column) = parse_expression_file( + args, vcf_reader, vcf_writer + ) missing_expressions_count = 0 entry_count = 0 for entry in vcf_reader: - #Add expression data + # Add expression data transcript_ids = set() genes = set() - if 'CSQ' not in entry.INFO: - logging.warning("Variant is missing VEP annotation. INFO column doesn't contain CSQ field for variant {}".format(entry)) + if "CSQ" not in entry.INFO: + logging.warning( + "Variant is missing VEP annotation. INFO column doesn't contain CSQ field for variant {}".format( + entry + ) + ) vcf_writer.write_record(entry) continue - for transcript in entry.INFO['CSQ']: - for key, value in zip(csq_format, transcript.split('|')): - if key == 'Feature' and value != '' and not value.startswith('ENSR'): + for transcript in entry.INFO["CSQ"]: + for key, value in zip(csq_format, transcript.split("|")): + if key == "Feature" and value != "" and not value.startswith("ENSR"): transcript_ids.add(value) - if key == 'Gene' and value != '': + if key == "Gene" and value != "": genes.add(value) - if args.mode == 'gene': + if args.mode == "gene": genes = list(genes) if len(genes) > 0: - (entry, missing_expressions_count, entry_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, genes, 'GX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count, entry_count) - elif args.mode == 'transcript': + (entry, missing_expressions_count, entry_count) = add_expressions( + entry, + is_multi_sample, + args.sample_name, + df, + genes, + "GX", + id_column, + expression_column, + args.ignore_ensembl_id_version, + missing_expressions_count, + entry_count, + ) + elif args.mode == "transcript": transcript_ids = list(transcript_ids) if len(transcript_ids) > 0: - (entry, missing_expressions_count, entry_count) = add_expressions(entry, is_multi_sample, args.sample_name, df, transcript_ids, 'TX', id_column, expression_column, args.ignore_ensembl_id_version, missing_expressions_count, entry_count) - #Add RAD and ROT - if rna_vcf_reader is not None: + (entry, missing_expressions_count, entry_count) = add_expressions( + entry, + is_multi_sample, + args.sample_name, + df, + transcript_ids, + "TX", + id_column, + expression_column, + args.ignore_ensembl_id_version, + missing_expressions_count, + entry_count, + ) + # Add RAD and ROT + if rna_vcf_reader is not None: if (entry.CHROM in CONTIGS) and (entry.is_snv()): - entry = add_AD_rna_file(entry, is_multi_sample, args.sample_name,rna_vcf_reader) + entry = add_AD_rna_file( + entry, is_multi_sample, args.sample_name, rna_vcf_reader + ) vcf_writer.write_record(entry) vcf_reader.close() vcf_writer.close() if missing_expressions_count > 0: - logging.warning("{} of {} {}s did not have an expression entry for their {} id.".format(missing_expressions_count, entry_count, args.mode, args.mode)) + logging.warning( + "{} of {} {}s did not have an expression entry for their {} id.".format( + missing_expressions_count, entry_count, args.mode, args.mode + ) + ) + + +def main(args_input=sys.argv[1:]): + parser = define_parser() + args = parser.parse_args(args_input) + args_check(args) + adding_extra_information(args) + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py index 1dd23a267..760b63bd6 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py +++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py @@ -12,14 +12,14 @@ step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step] -exp_format = config['preparation']['format'] +exp_format = config["preparation"]["format"] preparation_vcf = os.path.join( os.path.dirname(__file__), "comb_rna.py", ) ensemble_id = ( "--ignore-ensembl-id-version" - if config['preparation']['ignore-ensembl-id-version'] + if config["preparation"]["ignore-ensembl-id-version"] else "" ) From 4e08a50648efc4c6b9dff949cd01ac79c4189f92 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Thu, 4 Apr 2024 13:25:02 +0200 Subject: [PATCH 03/23] Satisfying black --- .../somatic_neoepitope_prediction/__init__.py | 56 ++++++------------- .../wrappers/pvactools/combining/comb_rna.py | 28 +++------- .../wrappers/pvactools/combining/wrapper.py | 4 +- snappy_wrappers/wrappers/stringtie/wrapper.py | 2 +- ...workflows_somatic_neoepitope_prediction.py | 41 ++++++++------ 5 files changed, 50 insertions(+), 81 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py index 6366c5577..af5776df3 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -145,9 +145,7 @@ def get_input_files(self, action): library_name=rna_library, ) ext = {"expression", "bam", "bai"} - yield "expression", ngs_mapping( - rna_tpl + ".GeneCounts.tab" - ) + yield "expression", ngs_mapping(rna_tpl + ".GeneCounts.tab") yield "bam", ngs_mapping(rna_tpl + ".bam") yield "bai", ngs_mapping(rna_tpl + ".bam.bai") @@ -158,9 +156,7 @@ def get_output_files(self, action): # Need to add step for adding RAD also self._validate_action(action) if ( - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "preparation" - ]["mode"] + self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene" ): prefix = ( @@ -168,9 +164,7 @@ def get_output_files(self, action): "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" ) elif ( - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "preparation" - ]["mode"] + self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript" ): prefix = ( @@ -188,9 +182,7 @@ def _get_log_file(self, action): # Validate action self._validate_action(action) if ( - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "preparation" - ]["mode"] + self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene" ): prefix = ( @@ -198,9 +190,7 @@ def _get_log_file(self, action): "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" ) elif ( - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "preparation" - ]["mode"] + self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript" ): prefix = ( @@ -237,16 +227,12 @@ def get_result_files(self): ] ) if ( - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "preparation" - ]["mode"] + self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "gene" ): name_pattern = "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library.name}" elif ( - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "preparation" - ]["mode"] + self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] == "transcript" ): name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}" @@ -309,9 +295,7 @@ class SomaticNeoepitopePredictionWorkflow(BaseStep): name = "neoepitope_prediction" sheet_shortcut_class = CancerCaseSheet sheet_shortcut_kwargs = { - "options": CancerCaseSheetOptions( - allow_missing_normal=True, allow_missing_tumor=True - ) + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) } @classmethod @@ -342,14 +326,10 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) self.register_sub_workflow( "ngs_mapping", - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "path_rna_ngs_mapping" - ], + self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"], ) # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_ngs_mapping" - ]: + if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]: self.w_config["step_config"]["somatic_neoepitope_prediction"][ "tools_ngs_mapping" ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] @@ -365,18 +345,16 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) self.w_config["step_config"]["somatic_neoepitope_prediction"][ "tools_somatic_variant_annotation" ] = ["vep"] - if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_rna_mapping" - ]: + if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_rna_mapping"]: self.w_config["step_config"]["somatic_neoepitope_prediction"][ "tools_rna_mapping" ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"] - if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "preparation" - ]["path_features"]: - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "preparation" - ]["path_features"] = self.w_config["static_data_config"]["features"]["path"] + if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ + "path_features" + ]: + self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ + "path_features" + ] = self.w_config["static_data_config"]["features"]["path"] def get_result_files(self): for sub_step in self.sub_steps.values(): diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py index ef40f0151..9b3a6956e 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py +++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py @@ -299,9 +299,7 @@ def parse_expression_file(args, vcf_reader, vcf_writer): def add_AD_rna_file(entry, is_multi_sample, sample_name, rna_vcf): RAD_temp = [] ROT = [] - for rna_record in rna_vcf.fetch( - entry.CHROM, entry.affected_start, entry.affected_end - ): + for rna_record in rna_vcf.fetch(entry.CHROM, entry.affected_start, entry.affected_end): rna_alt = [alt.value for alt in rna_record.ALT] rna_AD = [c.data.get("AD") for c in rna_record.calls][0] ROT_temp = sum(rna_AD) - rna_AD[0] @@ -344,9 +342,7 @@ def add_expressions( for item in items: entry_count += 1 if ignore_ensembl_id_version: - subset = df.loc[ - df["transcript_without_version"] == re.sub(r"\.[0-9]+$", "", item) - ] + subset = df.loc[df["transcript_without_version"] == re.sub(r"\.[0-9]+$", "", item)] else: subset = df.loc[df[id_column] == item] if len(subset) > 0: @@ -373,12 +369,8 @@ def define_parser(): ) parser.add_argument("--input_vcf", help="A VEP-annotated VCF file") - parser.add_argument( - "--expression_file", help="A TSV file containing expression estimates" - ) - parser.add_argument( - "--genecode", help="A genecode file for calculate TPM from star gene count" - ) + parser.add_argument("--expression_file", help="A TSV file containing expression estimates") + parser.add_argument("--genecode", help="A genecode file for calculate TPM from star gene count") parser.add_argument( "--rna-vcf", help="A VCF file of RNA-seq data", @@ -437,9 +429,7 @@ def args_check(args): ) if args.format == "star": if args.genecode is None: - raise Exception( - "--genecode is not set. This is required when using the `star` format" - ) + raise Exception("--genecode is not set. This is required when using the `star` format") exp_col = ["unstranded", "rf", "fr"] if args.expression_column not in exp_col: raise Exception( @@ -463,9 +453,7 @@ def adding_extra_information(args): ) vcf_writer = create_vcf_writer(args, vcf_reader) - (df, id_column, expression_column) = parse_expression_file( - args, vcf_reader, vcf_writer - ) + (df, id_column, expression_column) = parse_expression_file(args, vcf_reader, vcf_writer) missing_expressions_count = 0 entry_count = 0 for entry in vcf_reader: @@ -522,9 +510,7 @@ def adding_extra_information(args): # Add RAD and ROT if rna_vcf_reader is not None: if (entry.CHROM in CONTIGS) and (entry.is_snv()): - entry = add_AD_rna_file( - entry, is_multi_sample, args.sample_name, rna_vcf_reader - ) + entry = add_AD_rna_file(entry, is_multi_sample, args.sample_name, rna_vcf_reader) vcf_writer.write_record(entry) vcf_reader.close() diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py index 760b63bd6..9fbd6be11 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py +++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py @@ -18,9 +18,7 @@ "comb_rna.py", ) ensemble_id = ( - "--ignore-ensembl-id-version" - if config["preparation"]["ignore-ensembl-id-version"] - else "" + "--ignore-ensembl-id-version" if config["preparation"]["ignore-ensembl-id-version"] else "" ) shell( diff --git a/snappy_wrappers/wrappers/stringtie/wrapper.py b/snappy_wrappers/wrappers/stringtie/wrapper.py index f71436713..0ded26e3d 100644 --- a/snappy_wrappers/wrappers/stringtie/wrapper.py +++ b/snappy_wrappers/wrappers/stringtie/wrapper.py @@ -44,4 +44,4 @@ md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5} """ -) \ No newline at end of file +) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py index eac6138d7..960aff0e3 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py @@ -83,6 +83,7 @@ def minimal_config(): ).lstrip() ) + @pytest.fixture def somatic_neoepitope_prediction_workflow( dummy_workflow, @@ -110,6 +111,7 @@ def somatic_neoepitope_prediction_workflow( work_dir, ) + def test_neoepitope_preparation_step_part_get_input_files(somatic_neoepitope_prediction_workflow): # Define expected vcf_base_out = ( @@ -129,33 +131,43 @@ def test_neoepitope_preparation_step_part_get_input_files(somatic_neoepitope_pre actual = somatic_neoepitope_prediction_workflow.get_input_files("neoepitope_preparation", "run") assert actual == expected + def test_neoepitope_preparation_step_part_get_output_files(somatic_neoepitope_prediction_workflow): base_out = ( - "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" - "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" - ) + "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + ) expected = get_expected_output_vcf_files_dict(base_out) - actual = somatic_neoepitope_prediction_workflow.get_output_files("neoepitope_preparation", "run") + actual = somatic_neoepitope_prediction_workflow.get_output_files( + "neoepitope_preparation", "run" + ) assert actual == expected + def test_neoepitope_preparation_step_part_get_log_files(somatic_neoepitope_prediction_workflow): base_out = ( "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" - "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" ) expected = get_expected_log_files_dict(base_out=base_out) actual = somatic_neoepitope_prediction_workflow.get_log_file("neoepitope_preparation", "run") assert actual == expected -def test_neoepitope_preparation_step_part_get_resource_usage(somatic_neoepitope_prediction_workflow): + +def test_neoepitope_preparation_step_part_get_resource_usage( + somatic_neoepitope_prediction_workflow, +): # Define expected expected_dict = {"threads": 2, "time": "1:00:00", "memory": "6144M"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_neoepitope_prediction_workflow.get_resource("neoepitope_preparation", "run", resource) + actual = somatic_neoepitope_prediction_workflow.get_resource( + "neoepitope_preparation", "run", resource + ) assert actual == expected, msg_error + def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_workflow): """Test simple functionality of the workflow""" # Check created sub steps @@ -178,14 +190,9 @@ def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_wo ext=ext, dir_="out", ) - for i,t in ((1, 1), (2, 2)) - for ext in ( - "vcf.gz", - "vcf.gz.tbi", - "vcf.gz.md5", - "vcf.gz.tbi.md5" - ) - for mapper in("bwa",) + for i, t in ((1, 1), (2, 2)) + for ext in ("vcf.gz", "vcf.gz.tbi", "vcf.gz.md5", "vcf.gz.tbi.md5") + for mapper in ("bwa",) for var_caller in ("mutect2", "scalpel") for anno_caller in ("vep",) ] @@ -200,7 +207,7 @@ def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_wo ext=ext, dir_="log", ) - for i,t in ((1, 1), (2, 2)) + for i, t in ((1, 1), (2, 2)) for ext in ( "conda_info.txt", "conda_list.txt", @@ -209,7 +216,7 @@ def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_wo "conda_list.txt.md5", "log.md5", ) - for mapper in("bwa",) + for mapper in ("bwa",) for var_caller in ("mutect2", "scalpel") for anno_caller in ("vep",) ] From 0c35c665bd2de1eea2b43da960e124dc18d5f168 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Thu, 4 Apr 2024 13:28:35 +0200 Subject: [PATCH 04/23] Satisfying snakefmt --- .../workflows/somatic_neoepitope_prediction/Snakefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile index 0fcffcce3..7fbb4bf6d 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile @@ -21,9 +21,7 @@ configfile: "config.yaml" config, lookup_paths, config_paths = expand_ref("config.yaml", config) # WorkflowImpl Object Setup =================================================== -wf = SomaticNeoepitopePredictionWorkflow( - workflow, config, lookup_paths, config_paths, os.getcwd() -) +wf = SomaticNeoepitopePredictionWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) localrules: From b4b36ba5678bd019bbff4cf5895855fe9d909721 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Thu, 4 Apr 2024 13:33:09 +0200 Subject: [PATCH 05/23] make isort comfort --- .../somatic_neoepitope_prediction/__init__.py | 16 +++------------- .../wrappers/pvactools/combining/comb_rna.py | 9 +++++---- .../wrappers/pvactools/combining/wrapper.py | 1 + 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py index af5776df3..898ce1c55 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -35,27 +35,17 @@ import os import sys -from biomedsheets.shortcuts import ( - CancerCaseSheet, - CancerCaseSheetOptions, - is_not_background, -) +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart -from snappy_pipeline.workflows.ngs_mapping import ( - NgsMappingWorkflow, - ResourceUsage, -) +from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage +from snappy_pipeline.workflows.somatic_variant_annotation import SomaticVariantAnnotationWorkflow from snappy_pipeline.workflows.somatic_variant_calling import ( SOMATIC_VARIANT_CALLERS_MATCHED, SomaticVariantCallingWorkflow, ) -from snappy_pipeline.workflows.somatic_variant_annotation import ( - SomaticVariantAnnotationWorkflow, -) - __author__ = "Pham Gia Cuong" __email__ = "pham.gia-cuong@bih-charite.de" diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py index 9b3a6956e..8169f2366 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py +++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py @@ -1,10 +1,11 @@ import argparse +import logging +import re import sys -import vcfpy -import pyranges as pr + import pandas as pd -import re -import logging +import pyranges as pr +import vcfpy ########################## Reading vcf files ########################## diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py index 9fbd6be11..b418ca3c8 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py +++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py @@ -4,6 +4,7 @@ """ import os + from snakemake.shell import shell __author__ = "Pham Gia Cuong" From f1af42ce56e4b2520b84fb9584d42181c16e788a Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Wed, 22 May 2024 13:08:28 +0200 Subject: [PATCH 06/23] update neoepitope prediction --- .../somatic_neoepitope_prediction/__init__.py | 143 +++++++++--------- .../wrappers/pvactools/combining/comb_rna.py | 92 ++++++----- .../wrappers/pvactools/combining/wrapper.py | 4 + 3 files changed, 117 insertions(+), 122 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py index 898ce1c55..a397b2401 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -204,6 +204,74 @@ def get_resource_usage(self, action): memory=f"{mem_mb}M", ) + +class SomaticNeoepitopePredictionWorkflow(BaseStep): + """Perform neoepitope prediction workflow""" + + name = "neoepitope_prediction" + sheet_shortcut_class = CancerCaseSheet + sheet_shortcut_kwargs = { + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) + } + + @classmethod + def default_config_yaml(cls): + """Return default config YAML, to be overwritten by project-specific one.""" + return DEFAULT_CONFIG + + def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + ( + SomaticVariantCallingWorkflow, + SomaticVariantAnnotationWorkflow, + NgsMappingWorkflow, + ), + ) + # Register sub step classes so the sub steps are available + self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart)) + self.register_sub_workflow( + "somatic_variant_annotation", + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "path_somatic_variant_annotation" + ], + ) + self.register_sub_workflow( + "ngs_mapping", + self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"], + ) + # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here + if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]: + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_ngs_mapping" + ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] + if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_somatic_variant_calling" + ]: + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_somatic_variant_calling" + ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"] + if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_somatic_variant_annotation" + ]: + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_somatic_variant_annotation" + ] = ["vep"] + if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_rna_mapping"]: + self.w_config["step_config"]["somatic_neoepitope_prediction"][ + "tools_rna_mapping" + ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"] + if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ + "path_features" + ]: + self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ + "path_features" + ] = self.w_config["static_data_config"]["features"]["path"] + @listify def get_result_files(self): callers = set( @@ -259,7 +327,7 @@ def _yield_result_files_matched(self, tpl, **kwargs): This function returns the results from the matched somatic variant callers such as Mutect. """ - for sheet in filter(is_not_background, self.parent.shortcut_sheets): + for sheet in filter(is_not_background, self.shortcut_sheets): for sample_pair in sheet.all_sample_pairs: if ( not sample_pair.tumor_sample.dna_ngs_library @@ -278,79 +346,6 @@ def _yield_result_files_matched(self, tpl, **kwargs): **kwargs, ) - -class SomaticNeoepitopePredictionWorkflow(BaseStep): - """Perform neoepitope prediction workflow""" - - name = "neoepitope_prediction" - sheet_shortcut_class = CancerCaseSheet - sheet_shortcut_kwargs = { - "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) - } - - @classmethod - def default_config_yaml(cls): - """Return default config YAML, to be overwritten by project-specific one.""" - return DEFAULT_CONFIG - - def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): - super().__init__( - workflow, - config, - config_lookup_paths, - config_paths, - workdir, - ( - SomaticVariantCallingWorkflow, - SomaticVariantAnnotationWorkflow, - NgsMappingWorkflow, - ), - ) - # Register sub step classes so the sub steps are available - self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart)) - self.register_sub_workflow( - "somatic_variant_annotation", - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "path_somatic_variant_annotation" - ], - ) - self.register_sub_workflow( - "ngs_mapping", - self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"], - ) - # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]: - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_ngs_mapping" - ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] - if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_somatic_variant_calling" - ]: - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_somatic_variant_calling" - ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"] - if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_somatic_variant_annotation" - ]: - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_somatic_variant_annotation" - ] = ["vep"] - if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_rna_mapping"]: - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_rna_mapping" - ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"] - if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ - "path_features" - ]: - self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ - "path_features" - ] = self.w_config["static_data_config"]["features"]["path"] - - def get_result_files(self): - for sub_step in self.sub_steps.values(): - if sub_step.name not in (LinkOutStepPart.name,): - yield from sub_step.get_result_files() - def check_config(self): """Check that the path to the NGS mapping is present""" self.ensure_w_config( diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py index 8169f2366..8a2d596ff 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py +++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py @@ -67,7 +67,7 @@ def create_vcf_writer(args, vcf_reader): [ ("ID", "GX"), ("Number", "."), - ("Type", "String"), + ("Type", "Float"), ("Description", "Gene Expressions"), ] ) @@ -79,7 +79,7 @@ def create_vcf_writer(args, vcf_reader): [ ("ID", "TX"), ("Number", "."), - ("Type", "String"), + ("Type", "Float"), ("Description", "Transcript Expressions"), ] ) @@ -91,7 +91,7 @@ def create_vcf_writer(args, vcf_reader): [ ("ID", "RAD"), ("Number", "."), - ("Type", "String"), + ("Type", "Integer"), ( "Description", "Allelic depths from mRNA sequencing data of the same sample", @@ -104,7 +104,7 @@ def create_vcf_writer(args, vcf_reader): [ ("ID", "ROT"), ("Number", "."), - ("Type", "String"), + ("Type", "Integer"), ( "Description", "Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample", @@ -154,12 +154,8 @@ def resolve_expression_column(args): elif args.format == "stringtie": return "TPM" elif args.format == "star": - if args.expression_column is None: - raise Exception( - "ERROR: `--expression-column` option is required when using the `star` format" - ) - else: - return "TPM_" + args.expression_column + args.expression_column = "read_count" + return "TPM_" + args.expression_column elif args.format == "custom": if args.expression_column is None: raise Exception( @@ -184,7 +180,7 @@ def resolve_stringtie_id_column(args, headers): def genecount_reader(genecount_path): - col_name = ["gene_id", "unstranded", "rf", "fr"] + col_name = ["gene_id", "read_count"] star_genecount = pd.read_csv( genecount_path, skiprows=4, @@ -193,14 +189,15 @@ def genecount_reader(genecount_path): header=None, ).sort_values(by=["gene_id"]) # star_genecount["Ensembl_gene_identifier"] = star_genecount.Ensembl_gene_identifier.map(lambda x: x.split(".")[0]) - star_genecount.unstranded = star_genecount.unstranded.astype(int, copy=None) - star_genecount.rf = star_genecount.rf.astype(int, copy=None) - star_genecount.fr = star_genecount.fr.astype(int, copy=None) + star_genecount.read_count = star_genecount.read_count.astype(int, copy=None) + # star_genecount.rf = star_genecount.rf.astype(int, copy=None) + # star_genecount.fr = star_genecount.fr.astype(int, copy=None) # star_genecount.set_index("gene_id") return star_genecount -def genecode_reader(genecode_path, star_genecount, expression_column): +def genecode_reader(genecode_path, star_genecount): + expression_column = "read_count" gc = pr.read_gtf(genecode_path, as_df=True) gc = gc[gc["gene_id"].isin(star_genecount["gene_id"])] gc = gc[(gc.Feature == "gene")] @@ -269,7 +266,7 @@ def parse_expression_file(args, vcf_reader, vcf_writer): elif args.format == "star": id_column = resolve_id_column(args) star_genecount = genecount_reader(args.expression_file) - df = genecode_reader(args.genecode, star_genecount, args.expression_column) + df = genecode_reader(args.genecode, star_genecount) else: id_column = resolve_id_column(args) df = pd.read_csv(args.expression_file, sep="\t") @@ -294,19 +291,36 @@ def parse_expression_file(args, vcf_reader, vcf_writer): id_column, args.expression_file ) ) - return df, id_column, expression_column + if (args.ignore_ensembl_id_version): + df[id_column] = df[id_column].apply( + lambda x: re.sub(r"\.[0-9]+$", "", x) + ) + exp_df = df[[id_column, expression_column]] + exp_dict = exp_df.set_index(id_column)[expression_column].to_dict() + return exp_dict def add_AD_rna_file(entry, is_multi_sample, sample_name, rna_vcf): RAD_temp = [] ROT = [] - for rna_record in rna_vcf.fetch(entry.CHROM, entry.affected_start, entry.affected_end): + try: + rna_records = rna_vcf.fetch(entry.CHROM, entry.affected_start, entry.affected_end) + rna_record = next(rna_records) + except (StopIteration, ValueError): + # if there is no snp. RAD and RoT will be set to 0 + if is_multi_sample: + entry.FORMAT += ["RAD"] + entry.call_for_sample[sample_name].data["RAD"] = 0 + entry.FORMAT += ["ROT"] + entry.call_for_sample[sample_name].data["ROT"] = 0 + else: + entry.add_format("RAD", 0) + entry.add_format("ROT", 0) + else: rna_alt = [alt.value for alt in rna_record.ALT] rna_AD = [c.data.get("AD") for c in rna_record.calls][0] ROT_temp = sum(rna_AD) - rna_AD[0] - if not rna_alt: - continue - else: + if rna_alt: rna_AD = [c.data.get("AD") for c in rna_record.calls][0] for dna_alt in entry.ALT: if dna_alt.value in rna_alt: @@ -330,11 +344,9 @@ def add_expressions( entry, is_multi_sample, sample_name, - df, + exp_dict, items, tag, - id_column, - expression_column, ignore_ensembl_id_version, missing_expressions_count, entry_count, @@ -343,11 +355,9 @@ def add_expressions( for item in items: entry_count += 1 if ignore_ensembl_id_version: - subset = df.loc[df["transcript_without_version"] == re.sub(r"\.[0-9]+$", "", item)] - else: - subset = df.loc[df[id_column] == item] - if len(subset) > 0: - expressions[item] = subset[expression_column].sum() + item = re.sub(r"\.[0-9]+$", "", item) + if item in exp_dict: + expressions[item] = exp_dict[item] else: missing_expressions_count += 1 if is_multi_sample: @@ -431,21 +441,12 @@ def args_check(args): if args.format == "star": if args.genecode is None: raise Exception("--genecode is not set. This is required when using the `star` format") - exp_col = ["unstranded", "rf", "fr"] - if args.expression_column not in exp_col: - raise Exception( - """ - --expression-column is not found. Please choose : - unstranded - Assumes a unstranded library. - rf - Assumes a stranded library fr-firststrand. - fr - Assumes a stranded library fr-secondstrand. - """ - ) def adding_extra_information(args): - CONTIGS = ["chr{}".format(i) for i in range(1, 23)] + ["chrX", "chrY"] (vcf_reader, rna_vcf_reader, is_multi_sample) = create_vcf_reader(args) + # Only get contigs from vcf input file + CONTIGS = [x.id for x in vcf_reader.header.get_lines("contig")] format_pattern = re.compile("Format: (.*)") csq_format = ( format_pattern.search(vcf_reader.header.get_info_field_info("CSQ").description) @@ -454,7 +455,7 @@ def adding_extra_information(args): ) vcf_writer = create_vcf_writer(args, vcf_reader) - (df, id_column, expression_column) = parse_expression_file(args, vcf_reader, vcf_writer) + exp_dict = parse_expression_file(args, vcf_reader, vcf_writer) missing_expressions_count = 0 entry_count = 0 for entry in vcf_reader: @@ -483,11 +484,9 @@ def adding_extra_information(args): entry, is_multi_sample, args.sample_name, - df, + exp_dict, genes, "GX", - id_column, - expression_column, args.ignore_ensembl_id_version, missing_expressions_count, entry_count, @@ -499,12 +498,9 @@ def adding_extra_information(args): entry, is_multi_sample, args.sample_name, - df, + exp_dict, transcript_ids, "TX", - id_column, - expression_column, - args.ignore_ensembl_id_version, missing_expressions_count, entry_count, ) diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py index b418ca3c8..aaa917a33 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py +++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py @@ -21,6 +21,9 @@ ensemble_id = ( "--ignore-ensembl-id-version" if config["preparation"]["ignore-ensembl-id-version"] else "" ) +id_column = ( + "--id-column " + str(config["preparation"]["id-column"]) if config["preparation"]["format"] == "custom" else "" +) shell( r""" @@ -73,6 +76,7 @@ -e {config[preparation][expression-column]} \ -s {snakemake.wildcards.tumor_library} \ -o /dev/stdout \ + "{id_column}" \ "{ensemble_id}" \ | bgzip -c > {snakemake.output.vcf} tabix -f {snakemake.output.vcf} From 2fcf1bfca70d28f5b98bdda0a7a156028f1c6f2d Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Wed, 22 May 2024 13:28:08 +0200 Subject: [PATCH 07/23] Make black satisfy --- snappy_wrappers/wrappers/pvactools/combining/comb_rna.py | 6 ++---- snappy_wrappers/wrappers/pvactools/combining/wrapper.py | 4 +++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py index 8a2d596ff..b28d3eedc 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py +++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py @@ -291,10 +291,8 @@ def parse_expression_file(args, vcf_reader, vcf_writer): id_column, args.expression_file ) ) - if (args.ignore_ensembl_id_version): - df[id_column] = df[id_column].apply( - lambda x: re.sub(r"\.[0-9]+$", "", x) - ) + if args.ignore_ensembl_id_version: + df[id_column] = df[id_column].apply(lambda x: re.sub(r"\.[0-9]+$", "", x)) exp_df = df[[id_column, expression_column]] exp_dict = exp_df.set_index(id_column)[expression_column].to_dict() return exp_dict diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py index aaa917a33..3d698637f 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py +++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py @@ -22,7 +22,9 @@ "--ignore-ensembl-id-version" if config["preparation"]["ignore-ensembl-id-version"] else "" ) id_column = ( - "--id-column " + str(config["preparation"]["id-column"]) if config["preparation"]["format"] == "custom" else "" + "--id-column " + str(config["preparation"]["id-column"]) + if config["preparation"]["format"] == "custom" + else "" ) shell( From cfa36acbfaa21a3080ddfd9bfbf9912ecd221353 Mon Sep 17 00:00:00 2001 From: CuongPham <50145360+giacuong171@users.noreply.github.com> Date: Tue, 4 Jun 2024 15:18:59 +0200 Subject: [PATCH 08/23] Update snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile Co-authored-by: Till Hartmann --- .../workflows/somatic_neoepitope_prediction/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile index 7fbb4bf6d..975acb83c 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""CUBI Pipeline tumor mutational burden step Snakefile""" +"""CUBI Pipeline somatic neoepitope prediction step Snakefile""" import os From 8d9155d4a66deee70a4b5815e0ccf62e7dbe3bf4 Mon Sep 17 00:00:00 2001 From: CuongPham <50145360+giacuong171@users.noreply.github.com> Date: Tue, 4 Jun 2024 15:19:18 +0200 Subject: [PATCH 09/23] Update snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py Co-authored-by: Till Hartmann --- .../workflows/somatic_neoepitope_prediction/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py index a397b2401..5048012a9 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -347,7 +347,7 @@ def _yield_result_files_matched(self, tpl, **kwargs): ) def check_config(self): - """Check that the path to the NGS mapping is present""" + """Check that path_somatic_variant_annotation, preparation/mode and preparation/format are present in the configuration""" self.ensure_w_config( ( "step_config", From 40cb936e2fac9854cc2cd830fdc474307d228c9c Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Tue, 18 Jun 2024 22:04:20 +0200 Subject: [PATCH 10/23] Update preparation for somatic neoepitope prediction --- .../somatic_neoepitope_prediction/Snakefile | 4 +- .../somatic_neoepitope_prediction/__init__.py | 184 +++-- .../wrappers/pvactools/combining/comb_rna.py | 645 +++++++++--------- .../pvactools/combining/environment.yaml | 12 +- .../wrappers/pvactools/combining/wrapper.py | 61 +- 5 files changed, 428 insertions(+), 478 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile index 7fbb4bf6d..326510b4e 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""CUBI Pipeline tumor mutational burden step Snakefile""" +"""CUBI Pipeline somatic neoepitope prediction step Snakefile""" import os @@ -59,5 +59,7 @@ rule neoepitope_preparation: memory=wf.get_resource("neoepitope_preparation", "run", "memory"), partition=wf.get_resource("neoepitope_preparation", "run", "partition"), tmpdir=wf.get_resource("neoepitope_preparation", "run", "tmpdir"), + params: + **{"args": wf.get_params("neoepitope_preparation", "run")}, wrapper: wf.wrapper_path("pvactools/combining") diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py index a397b2401..a252eb961 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -59,24 +59,22 @@ somatic_neoepitope_prediction: path_somatic_variant_annotation: ../somatic_variant_annotation # REQUIRED path_rna_ngs_mapping: ../ngs_mapping - tools_somatic_variant_annotation: [] + tools_somatic_variant_annotation: ["vep"] tools_rna_mapping: [] # deafult to those configured for ngs_mapping tools_ngs_mapping: [] # deafult to those configured for ngs_mapping tools_somatic_variant_calling: [] # deafult to those configured for somatic_variant_calling - max_depth: "4000" preparation: - format: 'star' # REQUIRED - The file format of the expression file to process. (stringtie,kallisto,cufflinks,custom) + format: 'snappy_custom' # REQUIRED - The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom) # Use `custom` to process file formats not explicitly supported. # The `custom` option requires the use of the --id-column and --expression-column arguments. - path_features: '' - mode: 'gene' # REQUIRED - id-column: '' # The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format. - expression-column: 'fr' # REQUIRED - # The column header in the expression_file for - # the column containing expression data. - # Required when using the `custom` and `star` format. For `star`, there are - # only 3 options [unstranded,rf,fr] - ignore-ensembl-id-version: True #Ignore the ensemble id version + expression_file: '' # expression file path. If the format is not snappy_custom + path_features: '' # Gencode file path, required for star and snappy format + mode: 'gene' # REQUIRED - Determine whether the expression file contains gene or transcript TPM values. + full_vep_annotation: True # The somatic_variant_annotation has been done in fully annotation mode + id_column: # Gene/Transcript id column name. Need when using the `custom` format. + expression_column: # Expression column name. Need when using the `custom` format. + ignore_ensembl_id_version: True #Ignore the ensemble id version + max_depth: 4000 """ @@ -113,50 +111,46 @@ def get_input_files(self, action): "output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.{tumor_library}" ) - # Need to change for work on many different tools - key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"} + if self.config["preparation"]["full_vep_annotation"]: + key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"} + else: + key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"} variant_annotation = self.parent.sub_workflows["somatic_variant_annotation"] ngs_mapping = self.parent.sub_workflows["ngs_mapping"] for key, ext in key_ext.items(): yield key, variant_annotation(tpl + ext) # Getting appropriate rna library - for sheet in filter(is_not_background, self.parent.sheets): - for donor in sheet.bio_entities.values(): - for biosample in donor.bio_samples.values(): - if biosample.extra_infos["isTumor"]: - for test_sample in biosample.test_samples.values(): - if test_sample.extra_infos["extractionType"] == "RNA": - for lib in test_sample.ngs_libraries.values(): - rna_library = lib.name - rna_tpl = ( - "output/{mapper}.{library_name}/out/{mapper}.{library_name}" - ).format( - mapper="star", - library_name=rna_library, - ) - ext = {"expression", "bam", "bai"} - yield "expression", ngs_mapping(rna_tpl + ".GeneCounts.tab") - yield "bam", ngs_mapping(rna_tpl + ".bam") - yield "bai", ngs_mapping(rna_tpl + ".bam.bai") + if self.config["preparation"]["format"] == "snappy_custom": + for sheet in filter(is_not_background, self.parent.sheets): + for donor in sheet.bio_entities.values(): + for biosample in donor.bio_samples.values(): + if biosample.extra_infos["isTumor"]: + for test_sample in biosample.test_samples.values(): + if test_sample.extra_infos["extractionType"] == "RNA": + for lib in test_sample.ngs_libraries.values(): + rna_library = lib.name + rna_tpl = ( + "output/{mapper}.{library_name}/out/{mapper}.{library_name}" + ).format( + mapper="star", + library_name=rna_library, + ) + ext = {"expression", "bam", "bai"} + yield "expression", ngs_mapping(rna_tpl + ".GeneCounts.tab") + yield "bam", ngs_mapping(rna_tpl + ".bam") + yield "bai", ngs_mapping(rna_tpl + ".bam.bai") @dictify def get_output_files(self, action): """Return output files""" # Validate action - # Need to add step for adding RAD also self._validate_action(action) - if ( - self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] - == "gene" - ): + if self.config["preparation"]["mode"] == "gene": prefix = ( "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" ) - elif ( - self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] - == "transcript" - ): + elif self.config["preparation"]["mode"] == "transcript": prefix = ( "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" @@ -171,18 +165,12 @@ def _get_log_file(self, action): """Return mapping of log files.""" # Validate action self._validate_action(action) - if ( - self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] - == "gene" - ): + if self.config["preparation"]["mode"] == "gene": prefix = ( "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" ) - elif ( - self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] - == "transcript" - ): + elif self.config["preparation"]["mode"] == "transcript": prefix = ( "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/" "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" @@ -204,11 +192,22 @@ def get_resource_usage(self, action): memory=f"{mem_mb}M", ) + def get_params(self, action): + self._validate_action(action) + return { + "max_depth": self.config["preparation"]["max_depth"], + "format": self.config["preparation"]["format"], + "expression_column": self.config["preparation"]["expression_column"], + "id_column": self.config["preparation"]["id_column"], + "ignore_ensembl_id_version": self.config["preparation"]["ignore_ensembl_id_version"], + "mode": self.config["preparation"]["mode"], + } + class SomaticNeoepitopePredictionWorkflow(BaseStep): """Perform neoepitope prediction workflow""" - name = "neoepitope_prediction" + name = "somatic_neoepitope_prediction" sheet_shortcut_class = CancerCaseSheet sheet_shortcut_kwargs = { "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) @@ -236,79 +235,50 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart)) self.register_sub_workflow( "somatic_variant_annotation", - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "path_somatic_variant_annotation" - ], + self.config["path_somatic_variant_annotation"], ) self.register_sub_workflow( "ngs_mapping", - self.w_config["step_config"]["somatic_neoepitope_prediction"]["path_rna_ngs_mapping"], + self.config["path_rna_ngs_mapping"], ) # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_ngs_mapping"]: - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_ngs_mapping" - ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] - if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_somatic_variant_calling" - ]: - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_somatic_variant_calling" - ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"] - if not self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_somatic_variant_annotation" - ]: - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_somatic_variant_annotation" - ] = ["vep"] - if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["tools_rna_mapping"]: - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_rna_mapping" - ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"] - if not self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ - "path_features" - ]: - self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"][ - "path_features" - ] = self.w_config["static_data_config"]["features"]["path"] + if not self.config["tools_ngs_mapping"]: + self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ + "dna" + ] + if not self.config["tools_somatic_variant_calling"]: + self.config["tools_somatic_variant_calling"] = self.w_config["step_config"][ + "somatic_variant_calling" + ]["tools"] + if not self.config["tools_somatic_variant_annotation"]: + self.config["tools_somatic_variant_annotation"] = ["vep"] + if not self.config["tools_rna_mapping"]: + self.config["tools_rna_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ + "rna" + ] + if not self.config["preparation"]["path_features"]: + self.config["preparation"]["path_features"] = self.w_config["static_data_config"][ + "features" + ]["path"] @listify def get_result_files(self): - callers = set( - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_somatic_variant_calling" - ] - ) - anno_callers = set( - self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_somatic_variant_annotation" - ] - ) - if ( - self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] - == "gene" - ): + callers = set(self.config["tools_somatic_variant_calling"]) + anno_callers = set(self.config["tools_somatic_variant_annotation"]) + if self.config["preparation"]["mode"] == "gene": name_pattern = "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library.name}" - elif ( - self.w_config["step_config"]["somatic_neoepitope_prediction"]["preparation"]["mode"] - == "transcript" - ): + elif self.config["preparation"]["mode"] == "transcript": name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}" - yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_ngs_mapping" - ], + mapper=self.config["tools_ngs_mapping"], var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), anno_caller=anno_callers, ext=EXT_VALUES, ) yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["somatic_neoepitope_prediction"][ - "tools_ngs_mapping" - ], + mapper=self.config["tools_ngs_mapping"], var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), anno_caller=anno_callers, ext=( @@ -347,7 +317,7 @@ def _yield_result_files_matched(self, tpl, **kwargs): ) def check_config(self): - """Check that the path to the NGS mapping is present""" + """Check that path_somatic_variant_annotation, preparation/mode and preparation/format are present in the configuration""" self.ensure_w_config( ( "step_config", @@ -364,5 +334,5 @@ def check_config(self): self.ensure_w_config( ("step_config", "somatic_neoepitope_prediction", "preparation", "format"), - "Format is required for adding ", + "Format is required to determine which tool generated the TPM values. ", ) diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py index b28d3eedc..b8017b5d3 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py +++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py @@ -5,225 +5,148 @@ import pandas as pd import pyranges as pr -import vcfpy +from vcfpy import OrderedDict, Reader, Writer -########################## Reading vcf files ########################## -def create_vcf_reader(args): - vcf_reader = vcfpy.Reader.from_path(args.input_vcf) - if args.rna_vcf: - rna_vcf_reader = vcfpy.Reader.from_path(args.rna_vcf) - else: - rna_vcf_reader = None - # expression part - is_multi_sample = len(vcf_reader.header.samples.names) > 1 - if is_multi_sample and args.sample_name is None: - vcf_reader.close() - raise Exception( - "ERROR: VCF {} contains more than one sample. Please use the -s option to specify which sample to annotate.".format( - args.input_vcf - ) - ) - elif is_multi_sample and args.sample_name not in vcf_reader.header.samples.names: - vcf_reader.close() - raise Exception( - "ERROR: VCF {} does not contain a sample column for sample {}.".format( - args.input_vcf, args.sample_name - ) - ) - if "CSQ" not in vcf_reader.header.info_ids(): - vcf_reader.close() - raise Exception( - "ERROR: VCF {} is not VEP-annotated. Please annotate the VCF with VEP before running this tool.".format( - args.input_vcf - ) - ) - if args.mode == "gene" and "GX" in vcf_reader.header.format_ids(): - vcf_reader.close() - raise Exception( - "ERROR: VCF {} is already gene expression annotated. GX format header already exists.".format( - args.input_vcf - ) - ) - elif args.mode == "transcript" and "TX" in vcf_reader.header.format_ids(): - vcf_reader.close() - raise Exception( - "ERROR: VCF {} is already transcript expression annotated. TX format header already exists.".format( - args.input_vcf - ) - ) - # snv part - - return vcf_reader, rna_vcf_reader, is_multi_sample +########################## Define tool parameters ########################## +def define_parser(): + parser = argparse.ArgumentParser( + "comb_rna.py", + description="A tool that will add the data from several expression tools' output files" + + "and allelic depths from mRNA sequencing data for snvs" + + "to the VCF INFO column. Supported tools are StringTie, Kallisto, Star, snappy_custom" + + "and Cufflinks. There also is a ``custom`` option to annotate with data " + + "from any tab-delimited file.", + ) + parser.add_argument("--input-vcf", help="A VEP-annotated VCF file") + parser.add_argument("--expression-file", help="A expression file") + parser.add_argument("--genecode", help="A genecode file for calculate TPM from star gene count") + parser.add_argument( + "--rna-vcf", + help="A VCF file of RNA-seq data", + ) + parser.add_argument( + "--format", + choices=["kallisto", "stringtie", "cufflinks", "star", "snappy_custom", "custom"], + help="The file format of the expression file to process. " + + "Use `custom` to process file formats not explicitly supported. " + + "The `custom` option requires the use of the --id-column and --expression-column arguments.", + ) + parser.add_argument( + "--mode", + choices=["gene", "transcript"], + help="The type of expression data in the expression_file", + ) + parser.add_argument( + "-i", + "--id-column", + help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format.", + ) + parser.add_argument( + "-e", + "--expression-column", + help="The column header in the expression_file for the column containing expression data. Required when using the `custom` and `star` format.", + ) + parser.add_argument( + "-s", + "--sample-name", + help="If the input_vcf contains multiple samples, the name of the sample to annotate.", + ) + parser.add_argument( + "-o", + "--output-vcf", + help="Path to write the output VCF file. If not provided, the output VCF file will be " + + "written next to the input VCF file with a .tx.vcf or .gx.vcf file ending.", + ) + parser.add_argument( + "--ignore-ensembl-id-version", + help='Assumes that the final period and number denotes the Ensembl ID version and ignores it (i.e. for "ENST00001234.3" - ignores the ".3").', + action="store_true", + ) -########################## Adding extra fields into Format ########################## -def create_vcf_writer(args, vcf_reader): - (head, sep, tail) = args.input_vcf.rpartition(".vcf") - new_header = vcf_reader.header.copy() - if args.mode == "gene": - new_header.add_format_line( - vcfpy.OrderedDict( - [ - ("ID", "GX"), - ("Number", "."), - ("Type", "Float"), - ("Description", "Gene Expressions"), - ] - ) - ) - output_file = ("").join([head, ".gx.vcf", tail]) - elif args.mode == "transcript": - new_header.add_format_line( - vcfpy.OrderedDict( - [ - ("ID", "TX"), - ("Number", "."), - ("Type", "Float"), - ("Description", "Transcript Expressions"), - ] - ) - ) - output_file = ("").join([head, ".tx.vcf", tail]) - if args.rna_vcf: - new_header.add_format_line( - vcfpy.OrderedDict( - [ - ("ID", "RAD"), - ("Number", "."), - ("Type", "Integer"), - ( - "Description", - "Allelic depths from mRNA sequencing data of the same sample", - ), - ] - ) - ) - new_header.add_format_line( - vcfpy.OrderedDict( - [ - ("ID", "ROT"), - ("Number", "."), - ("Type", "Integer"), - ( - "Description", - "Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample", - ), - ] - ) - ) - (head, sep, tail) = output_file.rpartition(".vcf") - output_file = ("").join([head, ".pileup.vcf", tail]) - if args.output_vcf: - output_file = args.output_vcf - return vcfpy.Writer.from_path(output_file, new_header) + return parser -########################## Auto assign gene id column based on different tools ########################## -def resolve_id_column(args): - if args.format == "cufflinks": - return "tracking_id" - elif args.format == "kallisto": - if args.mode == "gene": - return "gene" - elif args.mode == "transcript": - return "target_id" - elif args.format == "stringtie": - if args.mode == "gene": - return "Gene ID" - elif args.format == "star": - return "gene_id" # NEED change - elif args.format == "custom": +########################## Arguments check ########################## +def args_check(args): + if args.format == "custom": if args.id_column is None: raise Exception( - "ERROR: `--id-column` option is required when using the `custom` format" + "--id-column is not set. This is required when using the `custom` format." ) - else: - return args.id_column - - -########################## Auto assign expression column based on different tools ########################## -def resolve_expression_column(args): - if args.format == "cufflinks": - return "FPKM" - elif args.format == "kallisto": - if args.mode == "gene": - return "abundance" - elif args.mode == "transcript": - return "tpm" - elif args.format == "stringtie": - return "TPM" - elif args.format == "star": - args.expression_column = "read_count" - return "TPM_" + args.expression_column - elif args.format == "custom": if args.expression_column is None: raise Exception( - "ERROR: `--expression-column` option is required when using the `custom` format" + "--expression-column is not set. This is required when using the `custom` format." ) - else: - return args.expression_column - - -def to_array(dictionary): - array = [] - for key, value in dictionary.items(): - array.append("{}|{}".format(key, value)) - return sorted(array) + if args.format == "star": + if args.genecode is None: + raise Exception("--genecode is not set. This is required when using the `star` format") -def resolve_stringtie_id_column(args, headers): - if "gene_id" in headers: - return "gene_id" - else: - return "transcript_id" - - -def genecount_reader(genecount_path): - col_name = ["gene_id", "read_count"] - star_genecount = pd.read_csv( - genecount_path, - skiprows=4, - sep="\t", - names=col_name, - header=None, - ).sort_values(by=["gene_id"]) - # star_genecount["Ensembl_gene_identifier"] = star_genecount.Ensembl_gene_identifier.map(lambda x: x.split(".")[0]) - star_genecount.read_count = star_genecount.read_count.astype(int, copy=None) - # star_genecount.rf = star_genecount.rf.astype(int, copy=None) - # star_genecount.fr = star_genecount.fr.astype(int, copy=None) - # star_genecount.set_index("gene_id") - return star_genecount - - -def genecode_reader(genecode_path, star_genecount): - expression_column = "read_count" +########################## Get expected column names ########################## +def get_expected_column_names(format, mode, id_column, exp_column): + match (format, mode): + case ("star", _): + try: + exp = int(exp_column) + if 1 < exp and exp <= 4: + return [0, exp - 1] + else: + raise Exception("Expression column is invalid") + except ValueError: + raise ValueError("Expression column in star should be a number") + case ("snappy_custom", _): + return [0, 1] + case ("kallisto", "gene"): + return ["gene", "abundance"] + case ("kallisto", "transcript"): + return ["target_id", "tpm"] + case ("cufflinks", _): + return ["tracking_id", "FPKM"] + case ("stringtie", "gene"): + return ["Gene ID", "TPM"] + case ("stringtie", "transcript"): + return ["transcript_id", "TPM"] + case ("custom", _): + return [id_column, exp_column] + case _: + raise Exception("We don't support {format} with mode as {mode} yet.") + + +########################## Genecode for star parser ########################## +def count_to_TPM(genecode_path, exp_df, no_version=False): gc = pr.read_gtf(genecode_path, as_df=True) - gc = gc[gc["gene_id"].isin(star_genecount["gene_id"])] + if no_version: + gc.loc[:, "gene_id"] = gc.loc[:, "gene_id"].apply(lambda x: re.sub(r"\.[0-9]+$", "", x)) + gc = gc[gc["gene_id"].isin(exp_df.iloc[:, 0])] gc = gc[(gc.Feature == "gene")] exon = gc[["Chromosome", "Start", "End", "gene_id", "gene_name"]] # Convert columns to proper types. - exon.Start = exon.Start.astype(int) - exon.End = exon.End.astype(int) + exon.loc[:, "Start"] = exon.loc[:, "Start"].astype(int) + exon.loc[:, "End"] = exon.loc[:, "End"].astype(int) # Sort in place. - exon.sort_values(by=["Chromosome", "Start", "End"], inplace=True) - # Group the rows by the Ensembl gene identifier (with version numbers.) + exon = exon.sort_values(by=["Chromosome", "Start", "End"]) + # Group the rows by the Ensembl gene identifier. groups = exon.groupby("gene_id") - lengths = groups.apply(count_bp) # Create a new DataFrame with gene lengths and EnsemblID. - ensembl_no_version = lengths.index - ldf = pd.DataFrame({"length": lengths.values, "gene_id": ensembl_no_version}) - star_tpm = pd.merge(ldf, star_genecount, how="inner", on="gene_id") - star_tpm["TPM_" + expression_column] = calculate_TPM(star_tpm, expression_column) - return star_tpm + gene_id = lengths.index + if len(gene_id) != len(exp_df): + raise Exception("Duplicated gene id in expression file") + ldf = pd.DataFrame({"length": lengths.values, "gene_id": gene_id}).sort_values(by="gene_id") + # Calculate TPM + tpm = calculate_TPM(exp_df, ldf) + tpm_dict = {k: v for k, v in zip(exp_df.iloc[:, 0], tpm)} + return tpm_dict -def calculate_TPM(df, strand): - effLen = (df["length"]) / 1000 - rate = df[strand] / effLen - denom = sum(rate) / (10**6) - return rate / denom +def calculate_TPM(exp_df, ldf): + "TPM = (reads_transcript / transcript_length_kb) / scaled_total_mapped_reads" + transcript_length_kb = ldf["length"] / 1000 + RPK = exp_df.iloc[:, 1] / transcript_length_kb + scale_factor_permil = sum(RPK) / (10**6) + return RPK / scale_factor_permil def count_bp(df): @@ -258,47 +181,113 @@ def count_bp(df): return sum(bp) -def parse_expression_file(args, vcf_reader, vcf_writer): - if args.format == "stringtie" and args.mode == "transcript": - df_all = pr.read_gtf(args.expression_file) - df = df_all[df_all["feature"] == "transcript"] - id_column = resolve_stringtie_id_column(args, df.columns.values) - elif args.format == "star": - id_column = resolve_id_column(args) - star_genecount = genecount_reader(args.expression_file) - df = genecode_reader(args.genecode, star_genecount) +########################## Expression file parser ########################## +def colnames_check(names, exp_names): + return set(names).issubset(set(exp_names)) + + +def expression_file_parser( + path, format, mode, id_column, exp_column, no_version=False, genecode="" +): + col_names = get_expected_column_names(format, mode, id_column, exp_column) + if isinstance(col_names[0], int): + exp_df = pd.read_csv(path, skiprows=4, sep="\t", header=None) + exp_df = exp_df.iloc[:, col_names] + exp_df = exp_df.sort_values(by=0) + if no_version: + exp_df.iloc[:, 0] = exp_df.iloc[:, 0].apply(lambda x: re.sub(r"\.[0-9]+$", "", x)) + tpm_dict = count_to_TPM(genecode, exp_df, no_version) else: - id_column = resolve_id_column(args) - df = pd.read_csv(args.expression_file, sep="\t") - if args.ignore_ensembl_id_version: - df["transcript_without_version"] = df[id_column].apply( - lambda x: re.sub(r"\.[0-9]+$", "", x) + if format == "stringtie" and mode == "transcript": + exp_df = pr.read_gtf(path, as_df=True) + exp_df = exp_df[exp_df["feature"] == "transcript"] + else: + exp_df = pd.read_csv(path, sep="\t", header=0) + + if colnames_check(col_names, exp_df.columns): + exp_df = exp_df.loc[:, col_names] + else: + raise Exception( + "Gene id column or expression column is not presense in expression file" + ) + + if no_version: + exp_df.iloc[:, 0] = exp_df.iloc[:, 0].apply(lambda x: re.sub(r"\.[0-9]+$", "", x)) + tpm_dict = {k: v for k, v in zip(exp_df.iloc[:, 0], exp_df.iloc[:, 1])} + return tpm_dict + + +########################## Dna Vcf parser ########################## +def dna_vcf_parser(path, sample_name): + vcf_reader = Reader.from_path(path) + is_multi_sample = len(vcf_reader.header.samples.names) > 1 + if is_multi_sample and sample_name is None: + vcf_reader.close() + raise Exception( + "ERROR: VCF {} contains more than one sample. Please use the -s option to specify which sample to annotate.".format( + path + ) ) - expression_column = resolve_expression_column(args) - if expression_column not in df.columns.values: + elif is_multi_sample and sample_name not in vcf_reader.header.samples.names: vcf_reader.close() - vcf_writer.close() raise Exception( - "ERROR: expression_column header {} does not exist in expression_file {}".format( - expression_column, args.expression_file + "ERROR: VCF {} does not contain a sample column for sample {}.".format( + path, sample_name ) ) - if id_column not in df.columns.values: + if "CSQ" not in vcf_reader.header.info_ids(): vcf_reader.close() - vcf_writer.close() raise Exception( - "ERROR: id_column header {} does not exist in expression_file {}".format( - id_column, args.expression_file + "ERROR: VCF {} is not VEP-annotated. Please annotate the VCF with VEP before running this tool.".format( + path ) ) - if args.ignore_ensembl_id_version: - df[id_column] = df[id_column].apply(lambda x: re.sub(r"\.[0-9]+$", "", x)) - exp_df = df[[id_column, expression_column]] - exp_dict = exp_df.set_index(id_column)[expression_column].to_dict() - return exp_dict + return vcf_reader, is_multi_sample + + +########################## Rna Vcf parser ########################## +def rna_vcf_parser(path): + rna_vcf_reader = Reader.from_path(path) + return rna_vcf_reader + + +########################## Add expression values functions ########################## +def to_array(dictionary): + array = [] + for key, value in dictionary.items(): + array.append("{}|{}".format(key, value)) + return sorted(array) + + +def add_Expression_to_vcf( + entry, + is_multi_sample, + sample_name, + exp_dict, + genes, + tag, + no_version, + missing_expressions_count, + entry_count, +): + expressions = {} + for gene in genes: + entry_count += 1 + if no_version: + gene = re.sub(r"\.[0-9]+$", "", gene) + if gene in exp_dict: + expressions[gene] = exp_dict[gene] + else: + missing_expressions_count += 1 + if is_multi_sample: + entry.FORMAT += [tag] + entry.call_for_sample[sample_name].data[tag] = to_array(expressions) + else: + entry.add_format(tag, to_array(expressions)) + return (entry, missing_expressions_count, entry_count) -def add_AD_rna_file(entry, is_multi_sample, sample_name, rna_vcf): +def add_AD_to_vcf(entry, is_multi_sample, sample_name, rna_vcf): RAD_temp = [] ROT = [] try: @@ -331,132 +320,107 @@ def add_AD_rna_file(entry, is_multi_sample, sample_name, rna_vcf): entry.FORMAT += ["RAD"] entry.call_for_sample[sample_name].data["RAD"] = RAD_temp entry.FORMAT += ["ROT"] - entry.call_for_sample[sample_name].data["ROT"] = ROT + entry.call_for_sample[sample_name].data["ROT"] = ROT_temp else: entry.add_format("RAD", RAD_temp) - entry.add_format("ROT", ROT) + entry.add_format("ROT", ROT_temp) return entry -def add_expressions( - entry, - is_multi_sample, - sample_name, - exp_dict, - items, - tag, - ignore_ensembl_id_version, - missing_expressions_count, - entry_count, -): - expressions = {} - for item in items: - entry_count += 1 - if ignore_ensembl_id_version: - item = re.sub(r"\.[0-9]+$", "", item) - if item in exp_dict: - expressions[item] = exp_dict[item] - else: - missing_expressions_count += 1 - if is_multi_sample: - entry.FORMAT += [tag] - entry.call_for_sample[sample_name].data[tag] = to_array(expressions) - else: - entry.add_format(tag, to_array(expressions)) - return (entry, missing_expressions_count, entry_count) - - -########################## Define tool parameters ########################## -def define_parser(): - parser = argparse.ArgumentParser( - "comb_rna.py", - description="A tool that will add the data from several expression tools' output files" - + "and allelic depths from mRNA sequencing data for snvs" - + "to the VCF INFO column. Supported tools are StringTie, Kallisto, Star" - + "and Cufflinks. There also is a ``custom`` option to annotate with data " - + "from any tab-delimited file.", - ) - - parser.add_argument("--input_vcf", help="A VEP-annotated VCF file") - parser.add_argument("--expression_file", help="A TSV file containing expression estimates") - parser.add_argument("--genecode", help="A genecode file for calculate TPM from star gene count") - parser.add_argument( - "--rna-vcf", - help="A VCF file of RNA-seq data", - ) - parser.add_argument( - "--format", - choices=["kallisto", "stringtie", "cufflinks", "star", "custom"], - help="The file format of the expression file to process. " - + "Use `custom` to process file formats not explicitly supported. " - + "The `custom` option requires the use of the --id-column and --expression-column arguments.", - ) - parser.add_argument( - "--mode", - choices=["gene", "transcript"], - help="The type of expression data in the expression_file", - ) - parser.add_argument( - "-i", - "--id-column", - help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format.", - ) - parser.add_argument( - "-e", - "--expression-column", - help="The column header in the expression_file for the column containing expression data. Required when using the `custom` and `star` format.", - ) - parser.add_argument( - "-s", - "--sample-name", - help="If the input_vcf contains multiple samples, the name of the sample to annotate.", - ) - parser.add_argument( - "-o", - "--output-vcf", - help="Path to write the output VCF file. If not provided, the output VCF file will be " - + "written next to the input VCF file with a .tx.vcf or .gx.vcf file ending.", - ) - parser.add_argument( - "--ignore-ensembl-id-version", - help='Assumes that the final period and number denotes the Ensembl ID version and ignores it (i.e. for "ENST00001234.3" - ignores the ".3").', - action="store_true", - ) - - return parser - - -def args_check(args): - if args.format == "custom": - if args.id_column is None: - raise Exception( - "--id-column is not set. This is required when using the `custom` format." +########################## Add fields to header ########################## +def create_vcf_writer(path, vcf_reader, mode, rna_vcf, output_vcf): + (head, _, tail) = path.rpartition(".vcf") + new_header = vcf_reader.header.copy() + if mode == "gene": + new_header.add_format_line( + OrderedDict( + [ + ("ID", "GX"), + ("Number", "."), + ("Type", "Float"), + ("Description", "Gene Expressions"), + ] ) - if args.expression_column is None: - raise Exception( - "--expression-column is not set. This is required when using the `custom` format." + ) + output_file = ("").join([head, ".gx.vcf", tail]) + elif mode == "transcript": + new_header.add_format_line( + OrderedDict( + [ + ("ID", "TX"), + ("Number", "."), + ("Type", "Float"), + ("Description", "Transcript Expressions"), + ] ) - if args.format == "star": - if args.genecode is None: - raise Exception("--genecode is not set. This is required when using the `star` format") + ) + output_file = ("").join([head, ".tx.vcf", tail]) + if rna_vcf: + new_header.add_format_line( + OrderedDict( + [ + ("ID", "RAD"), + ("Number", "A"), + ("Type", "Integer"), + ( + "Description", + "Allelic depths from mRNA sequencing data of the same sample", + ), + ] + ) + ) + new_header.add_format_line( + OrderedDict( + [ + ("ID", "ROT"), + ("Number", "1"), + ("Type", "Integer"), + ( + "Description", + "Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample", + ), + ] + ) + ) + (head, _, tail) = output_file.rpartition(".vcf") + output_file = ("").join([head, ".pileup.vcf", tail]) + if output_vcf: + output_file = output_vcf + return Writer.from_path(output_file, new_header) +########################## Generate new vcf file ########################## def adding_extra_information(args): - (vcf_reader, rna_vcf_reader, is_multi_sample) = create_vcf_reader(args) - # Only get contigs from vcf input file - CONTIGS = [x.id for x in vcf_reader.header.get_lines("contig")] + dna_vcf, is_multi_sample = dna_vcf_parser(args.input_vcf, args.sample_name) + if args.rna_vcf: + rna_vcf = rna_vcf_parser(args.rna_vcf) + # CONTIGS = [x.id for x in dna_vcf.header.get_lines("contig")] format_pattern = re.compile("Format: (.*)") csq_format = ( - format_pattern.search(vcf_reader.header.get_info_field_info("CSQ").description) + format_pattern.search(dna_vcf.header.get_info_field_info("CSQ").description) .group(1) .split("|") ) + if args.ignore_ensembl_id_version: + no_version = True + + vcf_writer = create_vcf_writer( + args.input_vcf, dna_vcf, args.mode, args.rna_vcf, args.output_vcf + ) + exp_dict = expression_file_parser( + args.expression_file, + args.format, + args.mode, + args.id_column, + args.expression_column, + no_version, + args.genecode, + ) - vcf_writer = create_vcf_writer(args, vcf_reader) - exp_dict = parse_expression_file(args, vcf_reader, vcf_writer) missing_expressions_count = 0 entry_count = 0 - for entry in vcf_reader: + + for entry in dna_vcf: # Add expression data transcript_ids = set() genes = set() @@ -478,37 +442,38 @@ def adding_extra_information(args): if args.mode == "gene": genes = list(genes) if len(genes) > 0: - (entry, missing_expressions_count, entry_count) = add_expressions( + (entry, missing_expressions_count, entry_count) = add_Expression_to_vcf( entry, is_multi_sample, args.sample_name, exp_dict, genes, "GX", - args.ignore_ensembl_id_version, + no_version, missing_expressions_count, entry_count, ) elif args.mode == "transcript": transcript_ids = list(transcript_ids) if len(transcript_ids) > 0: - (entry, missing_expressions_count, entry_count) = add_expressions( + (entry, missing_expressions_count, entry_count) = add_Expression_to_vcf( entry, is_multi_sample, args.sample_name, exp_dict, transcript_ids, "TX", + no_version, missing_expressions_count, entry_count, ) # Add RAD and ROT - if rna_vcf_reader is not None: - if (entry.CHROM in CONTIGS) and (entry.is_snv()): - entry = add_AD_rna_file(entry, is_multi_sample, args.sample_name, rna_vcf_reader) + if rna_vcf is not None: + if entry.is_snv(): + entry = add_AD_to_vcf(entry, is_multi_sample, args.sample_name, rna_vcf) vcf_writer.write_record(entry) - vcf_reader.close() + dna_vcf.close() vcf_writer.close() if missing_expressions_count > 0: @@ -519,10 +484,10 @@ def adding_extra_information(args): ) +########################## MAIN ########################## def main(args_input=sys.argv[1:]): parser = define_parser() args = parser.parse_args(args_input) - args_check(args) adding_extra_information(args) diff --git a/snappy_wrappers/wrappers/pvactools/combining/environment.yaml b/snappy_wrappers/wrappers/pvactools/combining/environment.yaml index 2ea441ab8..47a97a8e3 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/environment.yaml +++ b/snappy_wrappers/wrappers/pvactools/combining/environment.yaml @@ -2,9 +2,9 @@ channels: - conda-forge - bioconda dependencies: - - vcfpy - - numpy - - pyranges - - pandas - - python - - bcftools + - vcfpy>=0.12.3 + - numpy>=1.25.2, <1.26 + - pyranges>=0.0.129, <0.1 + - pandas>=2.1.1, <2.2 + - python>=3.10.8, <3.11 + - bcftools>=1.17, <1.18 diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py index 3d698637f..86b15643f 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py +++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py @@ -19,13 +19,24 @@ "comb_rna.py", ) ensemble_id = ( - "--ignore-ensembl-id-version" if config["preparation"]["ignore-ensembl-id-version"] else "" + "--ignore-ensembl-id-version" if snakemake.params.args["ignore_ensembl_id_version"] else "" ) -id_column = ( - "--id-column " + str(config["preparation"]["id-column"]) - if config["preparation"]["format"] == "custom" +max_depth = snakemake.params.args["max_depth"] +format = snakemake.params.args["format"] +mode = snakemake.params.args["mode"] +expression_column = snakemake.params.args["expression_column"] +id_column = snakemake.params.args["id_column"] + +rna_vcf = ( + f"--rna-vcf $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz " + if format == "snappy_custom" else "" ) +expression_file = ( + f"--expression-file {snakemake.input.expression} " + if format == "snappy_custom" + else config["preparation"]["expression_file"] +) shell( r""" @@ -53,32 +64,34 @@ conda list > {snakemake.log.conda_list} conda info > {snakemake.log.conda_info} -# Getting region of SNVs only -bcftools filter -i 'TYPE="snp"' {snakemake.input.vcf} | bcftools query -f '%CHROM\t%POS0\t%END\n' > $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed +if [[ {format}=="snappy_custom" ]]; then + # Getting region of SNVs only + bcftools filter -i 'TYPE="snp"' {snakemake.input.vcf} | bcftools query -f '%CHROM\t%POS0\t%END\n' > $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed -# Getting snvs from RNA sequencing data -bcftools mpileup -Ov \ - --annotate FORMAT/AD,FORMAT/DP \ - -R $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed \ - -f {snakemake.config[static_data_config][reference][path]} \ - --per-sample-mF \ - --max-depth 4000 \ - --redo-BAQ -Oz \ - -o $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \ - {snakemake.input.bam} -tabix -f $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz + # Getting snvs from RNA sequencing data + bcftools mpileup -Ov \ + --annotate FORMAT/AD,FORMAT/DP \ + -R $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed \ + -f {snakemake.config[static_data_config][reference][path]} \ + --per-sample-mF \ + --max-depth {max_depth} \ + --redo-BAQ -Oz \ + -o $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \ + {snakemake.input.bam} + tabix -f $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz +fi #Need to add option for RNA vcf as well -python3 -W ignore {preparation_vcf} \ +python3 {preparation_vcf} \ --genecode {snakemake.config[static_data_config][features][path]} \ - --input_vcf {snakemake.input.vcf} --format {config[preparation][format]} \ - --rna-vcf $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \ - --expression_file {snakemake.input.expression} \ - --mode {config[preparation][mode]} \ - -e {config[preparation][expression-column]} \ + --input-vcf {snakemake.input.vcf} --format {format} \ + --mode {mode} \ + -e {expression_column} \ + -i {id_column} \ -s {snakemake.wildcards.tumor_library} \ -o /dev/stdout \ - "{id_column}" \ + {expression_file}\ + {rna_vcf}\ "{ensemble_id}" \ | bgzip -c > {snakemake.output.vcf} tabix -f {snakemake.output.vcf} From 23101e82ea3a462b560135eb73626da756c02c53 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Tue, 18 Jun 2024 22:12:48 +0200 Subject: [PATCH 11/23] Update test --- .../test_workflows_somatic_neoepitope_prediction.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py index 960aff0e3..eebb03de0 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py @@ -64,13 +64,8 @@ def minimal_config(): tools_somatic_variant_calling: [] max_depth: "4000" preparation: - format: 'star' - path_features: '' + format: 'snappy_custom' mode: 'gene' - id-column: '' - expression-column: 'fr' - ignore-ensembl-id-version: True - data_sets: first_batch: file: sheet.tsv From 0920803c1aa1b3414bb45d8359366dfc83f57374 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Mon, 1 Jul 2024 14:39:39 +0200 Subject: [PATCH 12/23] Adding plugins option to VEP --- .../somatic_variant_annotation/__init__.py | 2 ++ .../wrappers/pvactools/combining/comb_rna.py | 4 ++-- snappy_wrappers/wrappers/vep/run/wrapper.py | 13 +++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py index 9b599887d..f37050337 100644 --- a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py @@ -144,6 +144,8 @@ buffer_size: 1000 output_options: - everything + plugins: [] #Two required plugins for running pvactool are Frameshift and Wildtype + plugins_dir: "" """ diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py index b8017b5d3..8b3658c14 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py +++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py @@ -337,7 +337,7 @@ def create_vcf_writer(path, vcf_reader, mode, rna_vcf, output_vcf): [ ("ID", "GX"), ("Number", "."), - ("Type", "Float"), + ("Type", "String"), ("Description", "Gene Expressions"), ] ) @@ -349,7 +349,7 @@ def create_vcf_writer(path, vcf_reader, mode, rna_vcf, output_vcf): [ ("ID", "TX"), ("Number", "."), - ("Type", "Float"), + ("Type", "String"), ("Description", "Transcript Expressions"), ] ) diff --git a/snappy_wrappers/wrappers/vep/run/wrapper.py b/snappy_wrappers/wrappers/vep/run/wrapper.py index 82c5f1534..eb7f4bd44 100644 --- a/snappy_wrappers/wrappers/vep/run/wrapper.py +++ b/snappy_wrappers/wrappers/vep/run/wrapper.py @@ -12,6 +12,15 @@ vep_config = snakemake.config["step_config"][current_step]["vep"] pick_order = ",".join(vep_config["pick_order"]) script_output_options = " ".join(["--" + x for x in vep_config["output_options"]]) +if vep_config["plugins"]: + plugins = " ".join(["--plugin " + x for x in vep_config["plugins"]]) + if not vep_config["plugins_dir"]: + raise Exception("Please provide plugins directory if you want to use plugins") + else: + plugins_dir = "--dir_plugins " + vep_config["plugins_dir"] +else: + plugins = "" + plugins_dir = "" full = snakemake.output.full if "full" in snakemake.output.keys() else "" @@ -44,6 +53,8 @@ echo --dir_cache {vep_config[cache_dir]} fi) \ {script_output_options} \ + {plugins} \ + {plugins_dir} \ --{vep_config[tx_flag]} \ --fasta {snakemake.config[static_data_config][reference][path]} \ --input_file {snakemake.input.vcf} --format vcf \ @@ -64,6 +75,8 @@ echo --dir_cache {vep_config[cache_dir]} fi) \ {script_output_options} \ + {plugins} \ + {plugins_dir} \ --pick --pick_order {pick_order} \ --{vep_config[tx_flag]} \ --fasta {snakemake.config[static_data_config][reference][path]} \ From 0d95ea3652df8515894bab8bdf5b5090fb32b650 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Wed, 3 Jul 2024 16:02:29 +0200 Subject: [PATCH 13/23] Adding plugin options for vep --- snappy_pipeline/models/annotation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/snappy_pipeline/models/annotation.py b/snappy_pipeline/models/annotation.py index 468f0bf96..22d7d1a9c 100644 --- a/snappy_pipeline/models/annotation.py +++ b/snappy_pipeline/models/annotation.py @@ -35,4 +35,7 @@ class Vep(SnappyModel): ] num_threads: int = 8 buffer_size: int = 1000 + plugins: list[str] = [] + """To use this option in VEP, you should download the plugin repository from the link https://github.com/Ensembl/VEP_plugins""" + plugins_dir: str = "" output_options: list[str] = ["everything"] From 1d725626987a118b47922d9784ed2576dc1bd55c Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Wed, 3 Jul 2024 16:03:05 +0200 Subject: [PATCH 14/23] Reformat neoepitope prediction for new snappy version --- snappy_pipeline/workflow_model.py | 2 + .../somatic_neoepitope_prediction/__init__.py | 63 ++++++------------- .../somatic_neoepitope_prediction/model.py | 53 ++++++++++++++++ 3 files changed, 75 insertions(+), 43 deletions(-) create mode 100644 snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py diff --git a/snappy_pipeline/workflow_model.py b/snappy_pipeline/workflow_model.py index aeda7682e..24c04210e 100644 --- a/snappy_pipeline/workflow_model.py +++ b/snappy_pipeline/workflow_model.py @@ -36,6 +36,7 @@ from snappy_pipeline.workflows.somatic_variant_filtration.model import SomaticVariantFiltration from snappy_pipeline.workflows.somatic_variant_signatures.model import SomaticVariantSignatures from snappy_pipeline.workflows.somatic_wgs_cnv_calling.model import SomaticWgsCnvCalling +from snappy_pipeline.workflows.somatic_neoepitope_prediction.model import SomaticNeoepitopePrediction from snappy_pipeline.workflows.somatic_wgs_sv_calling.model import SomaticWgsSvCalling from snappy_pipeline.workflows.sv_calling_targeted.model import SvCallingTargeted from snappy_pipeline.workflows.sv_calling_wgs.model import SvCallingWgs @@ -113,6 +114,7 @@ class StepConfig(TypedDict, total=False): somatic_gene_fusion_calling: SomaticGeneFusionCalling somatic_hla_loh_calling: SomaticHlaLohCalling somatic_msi_calling: SomaticMsiCalling + somatic_neoepitope_prediction: SomaticNeoepitopePrediction somatic_purity_ploidy_estimate: SomaticPurityPloidyEstimate somatic_targeted_seq_cnv_calling: SomaticTargetedSeqCnvCalling somatic_variant_annotation: SomaticVariantAnnotation diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py index a252eb961..8bb322f27 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -46,6 +46,8 @@ SOMATIC_VARIANT_CALLERS_MATCHED, SomaticVariantCallingWorkflow, ) +from .model import SomaticNeoepitopePrediction as SomaticNeoepitopePredictionConfigModel + __author__ = "Pham Gia Cuong" __email__ = "pham.gia-cuong@bih-charite.de" @@ -54,28 +56,7 @@ EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5") #: Default configuration for the somatic_gene_fusion_calling step -DEFAULT_CONFIG = r""" -step_config: - somatic_neoepitope_prediction: - path_somatic_variant_annotation: ../somatic_variant_annotation # REQUIRED - path_rna_ngs_mapping: ../ngs_mapping - tools_somatic_variant_annotation: ["vep"] - tools_rna_mapping: [] # deafult to those configured for ngs_mapping - tools_ngs_mapping: [] # deafult to those configured for ngs_mapping - tools_somatic_variant_calling: [] # deafult to those configured for somatic_variant_calling - preparation: - format: 'snappy_custom' # REQUIRED - The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom) - # Use `custom` to process file formats not explicitly supported. - # The `custom` option requires the use of the --id-column and --expression-column arguments. - expression_file: '' # expression file path. If the format is not snappy_custom - path_features: '' # Gencode file path, required for star and snappy format - mode: 'gene' # REQUIRED - Determine whether the expression file contains gene or transcript TPM values. - full_vep_annotation: True # The somatic_variant_annotation has been done in fully annotation mode - id_column: # Gene/Transcript id column name. Need when using the `custom` format. - expression_column: # Expression column name. Need when using the `custom` format. - ignore_ensembl_id_version: True #Ignore the ensemble id version - max_depth: 4000 -""" +DEFAULT_CONFIG = SomaticNeoepitopePredictionConfigModel.default_config_yaml_string() class NeoepitopePreparationStepPart(BaseStepPart): @@ -183,7 +164,7 @@ def _get_log_file(self, action): for key, ext in key_ext: yield key, prefix + ext - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: self._validate_action(action) mem_mb = 6 * 1024 # 6GB return ResourceUsage( @@ -225,41 +206,37 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - ( + config_model_class=SomaticNeoepitopePredictionConfigModel, + previous_steps=( SomaticVariantCallingWorkflow, SomaticVariantAnnotationWorkflow, NgsMappingWorkflow, ), ) # Register sub step classes so the sub steps are available + config = self.config self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart)) self.register_sub_workflow( "somatic_variant_annotation", - self.config["path_somatic_variant_annotation"], + config["path_somatic_variant_annotation"], ) self.register_sub_workflow( "ngs_mapping", - self.config["path_rna_ngs_mapping"], + config["path_rna_ngs_mapping"], ) # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not self.config["tools_ngs_mapping"]: - self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "dna" - ] - if not self.config["tools_somatic_variant_calling"]: - self.config["tools_somatic_variant_calling"] = self.w_config["step_config"][ + if not config.tools_ngs_mapping: + config.tools_ngs_mapping = self.w_config["step_config"]["ngs_mapping"].tools.dna + if not config.tools_somatic_variant_calling: + config.tools_somatic_variant_calling = self.w_config["step_config"][ "somatic_variant_calling" - ]["tools"] - if not self.config["tools_somatic_variant_annotation"]: - self.config["tools_somatic_variant_annotation"] = ["vep"] - if not self.config["tools_rna_mapping"]: - self.config["tools_rna_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "rna" - ] - if not self.config["preparation"]["path_features"]: - self.config["preparation"]["path_features"] = self.w_config["static_data_config"][ - "features" - ]["path"] + ].tools + if not config.tools_somatic_variant_annotation: + config.tools_somatic_variant_annotation = ["vep"] + if not config.tools_rna_mapping: + config.tools_rna_mapping = self.w_config["step_config"]["ngs_mapping"].tools.rna + if not config.preparation.path_features: + config.preparation.path_features = self.w_config["static_data_config"].features.path @listify def get_result_files(self): diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py new file mode 100644 index 000000000..61b2ce8f1 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py @@ -0,0 +1,53 @@ +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import SnappyModel, SnappyStepModel + +# class Format: +# stringtie = "stringtie" +# snappy_custom = "snappy_custom" +# cufflinks = "cufflinks" +# kallisto = "kallisto" +# custom = "custom" + + +class Preparation(SnappyModel): + format: Annotated[ + str, Field(examples=["stringtie", "snappy_custom", "cufflinks", "kallisto", "custom"]) + ] = "snappy_custom" + # Literal["stringtie","snappy_custom","cufflinks","kallisto","custom"] + """The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom) + Use `custom` to process file formats not explicitly supported. + The `custom` option requires the use of the --id-column and --expression-column arguments.""" + path_features: str = "" + """Gencode file path, required for star and snappy format""" + mode: Annotated[str, Field(examples=["gene", "transcript"])] = "gene" + """Determine whether the expression file contains gene or transcript TPM values.""" + full_vep_annotation: bool = True + """The somatic_variant_annotation has been done in fully annotation mode""" + id_column: str = "" + """Gene/Transcript id column name. Need when using the `custom` format.""" + expression_column: str = "" + """Expression column name. Need when using the `custom` format.""" + ignore_ensembl_id_version: bool = True + """Ignore the ensemble id version""" + max_depth: int = 4000 + + +class SomaticNeoepitopePrediction(SnappyStepModel): + path_somatic_variant_annotation: Annotated[ + str, Field(examples=["../somatic_variant_annotation"]) + ] + path_rna_ngs_mapping: Annotated[str, Field(examples=["../ngs_mapping"])] + tools_somatic_variant_annotation: list[str] = ["vep"] + tools_ngs_mapping: list[str] = [] + tools_somatic_variant_calling: list[str] = [] + tools_rna_mapping: list[str] = [] + tools_rna_mapping: list[str] = [] + """Deafult to those configured for ngs_mapping""" + tools_ngs_mapping: list[str] = [] + """Deafult to those configured for ngs_mapping""" + tools_somatic_variant_calling: list[str] = [] + """Deafult to those configured for somatic_variant_calling""" + preparation: Preparation | None = None From 6d9257c3b18fce6376fa8a9e6b03d426e64cf63e Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Wed, 3 Jul 2024 16:07:40 +0200 Subject: [PATCH 15/23] Satisfies lint --- snappy_pipeline/workflow_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/snappy_pipeline/workflow_model.py b/snappy_pipeline/workflow_model.py index 24c04210e..1b1a7fefd 100644 --- a/snappy_pipeline/workflow_model.py +++ b/snappy_pipeline/workflow_model.py @@ -36,7 +36,9 @@ from snappy_pipeline.workflows.somatic_variant_filtration.model import SomaticVariantFiltration from snappy_pipeline.workflows.somatic_variant_signatures.model import SomaticVariantSignatures from snappy_pipeline.workflows.somatic_wgs_cnv_calling.model import SomaticWgsCnvCalling -from snappy_pipeline.workflows.somatic_neoepitope_prediction.model import SomaticNeoepitopePrediction +from snappy_pipeline.workflows.somatic_neoepitope_prediction.model import ( + SomaticNeoepitopePrediction, +) from snappy_pipeline.workflows.somatic_wgs_sv_calling.model import SomaticWgsSvCalling from snappy_pipeline.workflows.sv_calling_targeted.model import SvCallingTargeted from snappy_pipeline.workflows.sv_calling_wgs.model import SvCallingWgs From 999c6ea591a7402ec875f88c91bd23de9b93f486 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Fri, 5 Jul 2024 10:51:42 +0200 Subject: [PATCH 16/23] Adding test for somatic neoepitope prediction preparation substep --- .../somatic_neoepitope_prediction/model.py | 27 +++++++++---------- .../wrappers/pvactools/combining/wrapper.py | 11 +++++--- ...workflows_somatic_neoepitope_prediction.py | 20 +++++++------- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py index 61b2ce8f1..3ccf5080e 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py @@ -1,25 +1,24 @@ +import enum from typing import Annotated from pydantic import Field -from snappy_pipeline.models import SnappyModel, SnappyStepModel +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel -# class Format: -# stringtie = "stringtie" -# snappy_custom = "snappy_custom" -# cufflinks = "cufflinks" -# kallisto = "kallisto" -# custom = "custom" + +class Format(enum.StrEnum): + stringtie = "stringtie" + snappy_custom = "snappy_custom" + cufflinks = "cufflinks" + kallisto = "kallisto" + custom = "custom" class Preparation(SnappyModel): - format: Annotated[ - str, Field(examples=["stringtie", "snappy_custom", "cufflinks", "kallisto", "custom"]) - ] = "snappy_custom" - # Literal["stringtie","snappy_custom","cufflinks","kallisto","custom"] - """The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom) - Use `custom` to process file formats not explicitly supported. - The `custom` option requires the use of the --id-column and --expression-column arguments.""" + format: Annotated[Format, EnumField(Format)] = Format.snappy_custom + "The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom)" + "Use `custom` to process file formats not explicitly supported." + "The `custom` option requires the use of the --id-column and --expression-column arguments." path_features: str = "" """Gencode file path, required for star and snappy format""" mode: Annotated[str, Field(examples=["gene", "transcript"])] = "gene" diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py index 86b15643f..6e178ee51 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py +++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py @@ -24,8 +24,11 @@ max_depth = snakemake.params.args["max_depth"] format = snakemake.params.args["format"] mode = snakemake.params.args["mode"] -expression_column = snakemake.params.args["expression_column"] -id_column = snakemake.params.args["id_column"] +expression_column = ( + f"--expression-file {snakemake.params.args['expression_column']}" if format == "custom" else "" +) + +id_column = f"--expression-file {snakemake.params.args['id_column']}" if format == "custom" else "" rna_vcf = ( f"--rna-vcf $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz " @@ -86,8 +89,8 @@ --genecode {snakemake.config[static_data_config][features][path]} \ --input-vcf {snakemake.input.vcf} --format {format} \ --mode {mode} \ - -e {expression_column} \ - -i {id_column} \ + {expression_column} \ + {id_column} \ -s {snakemake.wildcards.tumor_library} \ -o /dev/stdout \ {expression_file}\ diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py index eebb03de0..057308663 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py @@ -38,8 +38,7 @@ def minimal_config(): dna: ['bwa'] rna: ['star'] star: - path_index: /path/to/index - path_features: + path_index: /path/to/star/index bwa: path_index: /path/to/bwa/index.fa @@ -47,22 +46,20 @@ def minimal_config(): tools: - mutect2 - scalpel + mutect2: {} scalpel: path_target_regions: /path/to/target/regions.bed somatic_variant_annotation: + path_somatic_variant_calling: ../somatic_variant_calling tools: ["vep"] vep: - path_dir_cache: /path/to/dir/cache + cache_dir: /path/to/dir/cache somatic_neoepitope_prediction: path_somatic_variant_annotation: ../somatic_variant_annotation path_rna_ngs_mapping: ../ngs_mapping - tools_somatic_variant_annotation: [] - tools_rna_mapping: [] - tools_ngs_mapping: [] - tools_somatic_variant_calling: [] - max_depth: "4000" + tools_somatic_variant_annotation: ["vep"] preparation: format: 'snappy_custom' mode: 'gene' @@ -87,12 +84,15 @@ def somatic_neoepitope_prediction_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping.model", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep there + dummy_workflow.globals = { "ngs_mapping": lambda x: "NGS_MAPPING/" + x, "somatic_variant_annotation": lambda x: "SOMATIC_VARIANT_ANNOTATION/" + x, @@ -149,7 +149,7 @@ def test_neoepitope_preparation_step_part_get_log_files(somatic_neoepitope_predi assert actual == expected -def test_neoepitope_preparation_step_part_get_resource_usage( +def test_neoepitope_preparation_step_part_get_resource( somatic_neoepitope_prediction_workflow, ): # Define expected @@ -159,7 +159,7 @@ def test_neoepitope_preparation_step_part_get_resource_usage( msg_error = f"Assertion error for resource '{resource}'." actual = somatic_neoepitope_prediction_workflow.get_resource( "neoepitope_preparation", "run", resource - ) + )() assert actual == expected, msg_error From a140e28ea4cb5b581f5fb109e4378f268c50177a Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Thu, 11 Jul 2024 10:29:47 +0200 Subject: [PATCH 17/23] Preparation for pvactool --- .../somatic_neoepitope_prediction/Snakefile | 18 +-- .../somatic_neoepitope_prediction/__init__.py | 106 ++++++++++++------ .../somatic_neoepitope_prediction/model.py | 2 +- .../wrappers/pvactools/combining/wrapper.py | 7 +- 4 files changed, 85 insertions(+), 48 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile index 326510b4e..de398ddc7 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile @@ -48,18 +48,18 @@ rule neoepitope_preparation_link_out_run: rule neoepitope_preparation: input: - **wf.get_input_files("neoepitope_preparation", "run"), + unpack(wf.get_input_files("pvacseq","prepare")), output: - **wf.get_output_files("neoepitope_preparation", "run"), + **wf.get_output_files("pvacseq","prepare"), log: - **wf.get_log_file("neoepitope_preparation", "run"), - threads: wf.get_resource("neoepitope_preparation", "run", "threads") + **wf.get_log_file("pvacseq","prepare"), + threads: wf.get_resource("pvacseq","prepare", "threads") resources: - time=wf.get_resource("neoepitope_preparation", "run", "time"), - memory=wf.get_resource("neoepitope_preparation", "run", "memory"), - partition=wf.get_resource("neoepitope_preparation", "run", "partition"), - tmpdir=wf.get_resource("neoepitope_preparation", "run", "tmpdir"), + time=wf.get_resource("pvacseq","prepare", "time"), + memory=wf.get_resource("pvacseq","prepare", "memory"), + partition=wf.get_resource("pvacseq","prepare", "partition"), + tmpdir=wf.get_resource("pvacseq","prepare", "tmpdir"), params: - **{"args": wf.get_params("neoepitope_preparation", "run")}, + **{"args": wf.get_params("pvacseq","prepare")}, wrapper: wf.wrapper_path("pvactools/combining") diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py index 8bb322f27..cfb3bb82c 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -59,14 +59,14 @@ DEFAULT_CONFIG = SomaticNeoepitopePredictionConfigModel.default_config_yaml_string() -class NeoepitopePreparationStepPart(BaseStepPart): +class PvacSeqStepPart(BaseStepPart): """ Preparation VCF file for pvactool """ #: Step name - name = "neoepitope_preparation" - actions = ("run",) + name = "pvacseq" + actions = ("prepare","predict") def __init__(self, parent): super().__init__(parent) @@ -82,44 +82,74 @@ def __init__(self, parent): for donor in sheet.donors: self.donors[donor.name] = donor - @dictify def get_input_files(self, action): + self._validate_action(action) + return self._get_input_files_prepare + + # @dictify + # def _get_input_files_predict(self,wildcards): + # prepare_tpl = ( + # "work/prepare/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}/out/" + # "{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.vcf.gz" + # ) + # yield "combine_vcf",prepare_tpl + # hla_typing = self.parent.sub_workflows["hla_typing"] + # hla_tpl = { + # "output/{hla_tool}.{ngs_library}/out/{hla_tool}.{ngs_library}.txt" + # } + # for sheet in filter(is_not_background, self.parent.shortcut_sheets): + # for sample_pair in sheet.all_sample_pairs: + # if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library: + # yield "hla_normal_dna", hla_typing((hla_tpl).format( + # hla_tool=self.w_config.step_config["hla_typing"].tools[0], + # ngs_library=sample_pair.normal_sample.dna_ngs_library.name,) + # ) + # yield "hla_tumor_dna", hla_typing((hla_tpl).format( + # hla_tool=self.w_config.step_config["hla_typing"].tools[0], + # ngs_library=sample_pair.tumor_sample.dna_ngs_library.name,) + # ) + # yield "hla_tumor_rna", hla_typing((hla_tpl).format( + # hla_tool=self.w_config.step_config["hla_typing"].tools[0], + # ngs_library=sample_pair.tumor_sample.rna_ngs_library.name,) + # ) + # pass + + @dictify + def _get_input_files_prepare(self, wildcards): """Return path to somatic variant annotated file""" # It only works with vep now. # Validate action - self._validate_action(action) + # self._validate_action(action) tpl = ( "output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.{tumor_library}" ) if self.config["preparation"]["full_vep_annotation"]: - key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"} + vep_key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"} else: - key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"} + vep_key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"} variant_annotation = self.parent.sub_workflows["somatic_variant_annotation"] - ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - for key, ext in key_ext.items(): - yield key, variant_annotation(tpl + ext) - # Getting appropriate rna library if self.config["preparation"]["format"] == "snappy_custom": - for sheet in filter(is_not_background, self.parent.sheets): - for donor in sheet.bio_entities.values(): - for biosample in donor.bio_samples.values(): - if biosample.extra_infos["isTumor"]: - for test_sample in biosample.test_samples.values(): - if test_sample.extra_infos["extractionType"] == "RNA": - for lib in test_sample.ngs_libraries.values(): - rna_library = lib.name - rna_tpl = ( - "output/{mapper}.{library_name}/out/{mapper}.{library_name}" - ).format( - mapper="star", - library_name=rna_library, + for key,ext in vep_key_ext.items(): + yield key,variant_annotation(tpl+ext) + + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + + star_key_ext = {"expression": ".GeneCounts.tab", + "bam": ".bam"} + for sheet in filter(is_not_background, self.parent.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library: + rna_tpl = ( + "output/{mapper}.{rna_library}/out/{mapper}.{rna_library}" + ) + for key,ext in star_key_ext.items(): + yield key, ngs_mapping((rna_tpl + ext).format( + mapper=self.w_config.step_config["ngs_mapping"].tools.rna[0], + rna_library=sample_pair.tumor_sample.rna_ngs_library.name,) ) - ext = {"expression", "bam", "bai"} - yield "expression", ngs_mapping(rna_tpl + ".GeneCounts.tab") - yield "bam", ngs_mapping(rna_tpl + ".bam") - yield "bai", ngs_mapping(rna_tpl + ".bam.bai") + + @dictify def get_output_files(self, action): @@ -128,12 +158,12 @@ def get_output_files(self, action): self._validate_action(action) if self.config["preparation"]["mode"] == "gene": prefix = ( - "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" + "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" ) elif self.config["preparation"]["mode"] == "transcript": prefix = ( - "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/out/" + "work/prepare/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" ) key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"} @@ -148,12 +178,12 @@ def _get_log_file(self, action): self._validate_action(action) if self.config["preparation"]["mode"] == "gene": prefix = ( - "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" + "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" ) elif self.config["preparation"]["mode"] == "transcript": prefix = ( - "work/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/" + "work/preprare/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/" "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" ) key_ext = ( @@ -175,7 +205,10 @@ def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: def get_params(self, action): self._validate_action(action) - return { + return getattr(self,"_get_params_run") + + def _get_params_run(self,wildcards): + d = { "max_depth": self.config["preparation"]["max_depth"], "format": self.config["preparation"]["format"], "expression_column": self.config["preparation"]["expression_column"], @@ -183,6 +216,7 @@ def get_params(self, action): "ignore_ensembl_id_version": self.config["preparation"]["ignore_ensembl_id_version"], "mode": self.config["preparation"]["mode"], } + return d class SomaticNeoepitopePredictionWorkflow(BaseStep): @@ -215,7 +249,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) # Register sub step classes so the sub steps are available config = self.config - self.register_sub_step_classes((NeoepitopePreparationStepPart, LinkOutStepPart)) + self.register_sub_step_classes((PvacSeqStepPart, LinkOutStepPart)) self.register_sub_workflow( "somatic_variant_annotation", config["path_somatic_variant_annotation"], @@ -247,14 +281,14 @@ def get_result_files(self): elif self.config["preparation"]["mode"] == "transcript": name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}" yield from self._yield_result_files_matched( - os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), + os.path.join("output/prepare", name_pattern, "out", name_pattern + "{ext}"), mapper=self.config["tools_ngs_mapping"], var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), anno_caller=anno_callers, ext=EXT_VALUES, ) yield from self._yield_result_files_matched( - os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), + os.path.join("output/prepare", name_pattern, "log", name_pattern + "{ext}"), mapper=self.config["tools_ngs_mapping"], var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), anno_caller=anno_callers, diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py index 3ccf5080e..7abe6815f 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py @@ -38,12 +38,12 @@ class SomaticNeoepitopePrediction(SnappyStepModel): path_somatic_variant_annotation: Annotated[ str, Field(examples=["../somatic_variant_annotation"]) ] + path_hla_typing : Annotated[str, Field(examples=["../hla_typing"])] path_rna_ngs_mapping: Annotated[str, Field(examples=["../ngs_mapping"])] tools_somatic_variant_annotation: list[str] = ["vep"] tools_ngs_mapping: list[str] = [] tools_somatic_variant_calling: list[str] = [] tools_rna_mapping: list[str] = [] - tools_rna_mapping: list[str] = [] """Deafult to those configured for ngs_mapping""" tools_ngs_mapping: list[str] = [] """Deafult to those configured for ngs_mapping""" diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py index 6e178ee51..f5a48fdea 100644 --- a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py +++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py @@ -30,13 +30,16 @@ id_column = f"--expression-file {snakemake.params.args['id_column']}" if format == "custom" else "" +bam_file = snakemake.input.bam +gencount_file = snakemake.input.expression + rna_vcf = ( f"--rna-vcf $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz " if format == "snappy_custom" else "" ) expression_file = ( - f"--expression-file {snakemake.input.expression} " + f"--expression-file {gencount_file} " if format == "snappy_custom" else config["preparation"]["expression_file"] ) @@ -80,7 +83,7 @@ --max-depth {max_depth} \ --redo-BAQ -Oz \ -o $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \ - {snakemake.input.bam} + {bam_file} tabix -f $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz fi From 03bd9e429272ad258a2aa88f1edd7886baabd1a9 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Thu, 11 Jul 2024 15:52:07 +0200 Subject: [PATCH 18/23] Fix HLA_typing pipeline --- .../workflows/hla_typing/Snakefile | 86 +++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/snappy_pipeline/workflows/hla_typing/Snakefile b/snappy_pipeline/workflows/hla_typing/Snakefile index 0f627c46d..5b07da66d 100644 --- a/snappy_pipeline/workflows/hla_typing/Snakefile +++ b/snappy_pipeline/workflows/hla_typing/Snakefile @@ -53,7 +53,6 @@ rule hla_typing_link_in_run: # Generic linking out --------------------------------------------------------- - rule hla_typing_link_out_run: input: wf.get_input_files("link_out", "run"), @@ -67,51 +66,52 @@ rule hla_typing_link_out_run: # OptiType -------------------------------------------------------------------- - -rule hla_typing_optitype_run: - input: - **wf.get_input_files("optitype", "run"), - output: - **wf.get_output_files("optitype", "run"), - threads: wf.get_resource("optitype", "run", "threads") - resources: - time=wf.get_resource("optitype", "run", "time"), - memory=wf.get_resource("optitype", "run", "memory"), - partition=wf.get_resource("optitype", "run", "partition"), - tmpdir=wf.get_resource("optitype", "run", "tmpdir"), - log: - **wf.get_log_file("optitype", "run"), - params: - args=wf.substep_dispatch("optitype", "get_args", "run"), - wrapper: - wf.wrapper_path("optitype") +if "optitype" in wf.w_config.step_config["hla_typing"].tools: + rule hla_typing_optitype_run: + input: + **wf.get_input_files("optitype", "run"), + output: + **wf.get_output_files("optitype", "run"), + threads: wf.get_resource("optitype", "run", "threads") + resources: + time=wf.get_resource("optitype", "run", "time"), + memory=wf.get_resource("optitype", "run", "memory"), + partition=wf.get_resource("optitype", "run", "partition"), + tmpdir=wf.get_resource("optitype", "run", "tmpdir"), + log: + **wf.get_log_file("optitype", "run"), + params: + args=wf.substep_dispatch("optitype", "get_args", "run"), + wrapper: + wf.wrapper_path("optitype") # arcasHLA -------------------------------------------------------------------- # NB: reference is updated in the installed package -rule hla_typing_arcashla_prepare_reference: - output: - touch("work/arcashla.prepare_reference/out/.done"), - log: - "work/arcashla.prepare_reference/log/arcashla.prepare_reference.log", - wrapper: - wf.wrapper_path("arcashla/prepare_reference") - - -rule hla_typing_arcashla_run: - input: - unpack(wf.get_input_files("arcashla", "run")), - output: - **wf.get_output_files("arcashla", "run"), - threads: wf.get_resource("arcashla", "run", "threads") - resources: - time=wf.get_resource("arcashla", "run", "time"), - memory=wf.get_resource("arcashla", "run", "memory"), - partition=wf.get_resource("arcashla", "run", "partition"), - tmpdir=wf.get_resource("arcashla", "run", "tmpdir"), - log: - wf.get_log_file("arcashla", "run"), - wrapper: - wf.wrapper_path("arcashla/run") +if "arcashla" in wf.w_config.step_config["hla_typing"].tools: + rule hla_typing_arcashla_prepare_reference: + output: + touch("work/arcashla.prepare_reference/out/.done"), + log: + "work/arcashla.prepare_reference/log/arcashla.prepare_reference.log", + wrapper: + wf.wrapper_path("arcashla/prepare_reference") + + + rule hla_typing_arcashla_run: + input: + unpack(wf.get_input_files("arcashla", "run")), + output: + **wf.get_output_files("arcashla", "run"), + threads: wf.get_resource("arcashla", "run", "threads") + resources: + time=wf.get_resource("arcashla", "run", "time"), + memory=wf.get_resource("arcashla", "run", "memory"), + partition=wf.get_resource("arcashla", "run", "partition"), + tmpdir=wf.get_resource("arcashla", "run", "tmpdir"), + log: + wf.get_log_file("arcashla", "run"), + wrapper: + wf.wrapper_path("arcashla/run") From d43a11b69f3af7378401a28bb390014cb6f6969a Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Mon, 15 Jul 2024 14:02:15 +0200 Subject: [PATCH 19/23] Adding test for neoepitope prediction --- ...workflows_somatic_neoepitope_prediction.py | 212 +++++++++++++----- 1 file changed, 157 insertions(+), 55 deletions(-) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py index 057308663..9eb3cc435 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py @@ -5,6 +5,7 @@ import pytest import ruamel.yaml as ruamel_yaml +from snakemake.io import Wildcards from snappy_pipeline.workflows.somatic_neoepitope_prediction import ( SomaticNeoepitopePredictionWorkflow, @@ -50,19 +51,38 @@ def minimal_config(): scalpel: path_target_regions: /path/to/target/regions.bed + hla_typing: + path_link_in: '' # OPTIONAL Override data set configuration search paths for FASTQ files + tools: [optitype] # REQUIRED - available: 'optitype' and 'arcashla' + optitype: + max_reads: 5000 + num_mapping_threads: 4 + somatic_variant_annotation: path_somatic_variant_calling: ../somatic_variant_calling tools: ["vep"] vep: - cache_dir: /path/to/dir/cache - + cache_dir: /path/to/dir/cache + species: homo_sapiens_merged + assembly: GRCh38 + cache_version: '102' + tx_flag: gencode_basic + num_threads: 8 + buffer_size: 500 + output_options: + - everything somatic_neoepitope_prediction: - path_somatic_variant_annotation: ../somatic_variant_annotation + path_somatic_variant_annotation: ../somatic_variant_annotation # REQUIRED + path_container: ../somatic_neoepitope_prediction/work/containers/out/pvactools.simg path_rna_ngs_mapping: ../ngs_mapping - tools_somatic_variant_annotation: ["vep"] + path_hla_typing: ../hla_typing + tools_somatic_variant_annotation: [vep] preparation: - format: 'snappy_custom' - mode: 'gene' + format: snappy_custom # REQUIRED - The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom) + mode: gene # REQUIRED - Determine whether the expression file contains gene or transcript TPM values. + prediction: + algorithms: ['MHCflurry','MHCnuggetsI'] + data_sets: first_batch: file: sheet.tsv @@ -96,6 +116,7 @@ def somatic_neoepitope_prediction_workflow( dummy_workflow.globals = { "ngs_mapping": lambda x: "NGS_MAPPING/" + x, "somatic_variant_annotation": lambda x: "SOMATIC_VARIANT_ANNOTATION/" + x, + "hla_typing": lambda x: "HLA_TYPING/" + x, } # Construct the workflow object return SomaticNeoepitopePredictionWorkflow( @@ -107,58 +128,146 @@ def somatic_neoepitope_prediction_workflow( ) -def test_neoepitope_preparation_step_part_get_input_files(somatic_neoepitope_prediction_workflow): +def test_somatic_neoepitope_preparation_step_part_get_input_files( + somatic_neoepitope_prediction_workflow, +): + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "var_caller": "mutect2", + "anno_caller": "vep", + "tumor_library": "P001-T1-DNA1-WGS1", + } + ) # Define expected vcf_base_out = ( "SOMATIC_VARIANT_ANNOTATION/output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.{tumor_library}" ) - ngs_mapping_base_out = "NGS_MAPPING/output/star.P002-T2-RNA1-mRNA_seq1/out/" + # vcf_base_out = ( + # "SOMATIC_VARIANT_ANNOTATION/output/bwa.mutect2.vep.P001-T1-DNA1-WES1/out/" + # "{mapper}.{var_caller}.{anno_caller}.{tumor_library}" + # ) + ngs_mapping_base_out = "NGS_MAPPING/output/star.P001-T1-RNA1-mRNA_seq1/out/" expected = { "vcf": vcf_base_out + ".full.vcf.gz", "vcf_tbi": vcf_base_out + ".full.vcf.gz.tbi", - "expression": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.GeneCounts.tab", - "bam": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.bam", - "bai": ngs_mapping_base_out + "star.P002-T2-RNA1-mRNA_seq1.bam.bai", + "expression": ngs_mapping_base_out + "star.P001-T1-RNA1-mRNA_seq1.GeneCounts.tab", + "bam": ngs_mapping_base_out + "star.P001-T1-RNA1-mRNA_seq1.bam", + } + + # Get actual + actual = somatic_neoepitope_prediction_workflow.get_input_files("pvacseq", "prepare")(wildcards) + assert actual == expected + + +def test_somatic_neoepitope_prediction_step_part_get_input_files( + somatic_neoepitope_prediction_workflow, +): + # Define expected + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "var_caller": "mutect2", + "anno_caller": "vep", + "tumor_library": "P001-T1-DNA1-WGS1", + } + ) + vcf_base_out = ( + "work/prepare/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}" + ) + hla_typing_base_out = "HLA_TYPING/output/" + expected = { + "container": "work/containers/out/pvactools.simg", + "combine_vcf": vcf_base_out + ".vcf.gz", + "hla_normal_dna": hla_typing_base_out + + "optitype.P001-N1-DNA1-WGS1/out/optitype.P001-N1-DNA1-WGS1.txt", + "hla_tumor_dna": hla_typing_base_out + + "optitype.P001-T1-DNA1-WGS1/out/optitype.P001-T1-DNA1-WGS1.txt", + "hla_tumor_rna": hla_typing_base_out + + "optitype.P001-T1-RNA1-mRNA_seq1/out/optitype.P001-T1-RNA1-mRNA_seq1.txt", } # Get actual - actual = somatic_neoepitope_prediction_workflow.get_input_files("neoepitope_preparation", "run") + actual = somatic_neoepitope_prediction_workflow.get_input_files("pvacseq", "predict")(wildcards) assert actual == expected -def test_neoepitope_preparation_step_part_get_output_files(somatic_neoepitope_prediction_workflow): +def test_somatic_neoepitope_preparation_step_part_get_output_files( + somatic_neoepitope_prediction_workflow, +): base_out = ( - "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" + "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" ) expected = get_expected_output_vcf_files_dict(base_out) - actual = somatic_neoepitope_prediction_workflow.get_output_files( - "neoepitope_preparation", "run" - ) + actual = somatic_neoepitope_prediction_workflow.get_output_files("pvacseq", "prepare") assert actual == expected -def test_neoepitope_preparation_step_part_get_log_files(somatic_neoepitope_prediction_workflow): +def test_somatic_neoepitope_preparation_step_part_get_log_files( + somatic_neoepitope_prediction_workflow, +): base_out = ( - "work/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" + "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" ) expected = get_expected_log_files_dict(base_out=base_out) - actual = somatic_neoepitope_prediction_workflow.get_log_file("neoepitope_preparation", "run") + actual = somatic_neoepitope_prediction_workflow.get_log_file("pvacseq", "prepare") assert actual == expected -def test_neoepitope_preparation_step_part_get_resource( +def test_somatic_neoepitope_preparation_step_part_get_resource( somatic_neoepitope_prediction_workflow, ): # Define expected - expected_dict = {"threads": 2, "time": "1:00:00", "memory": "6144M"} + expected_dict = {"threads": 1, "time": "01:00:00", "memory": "6G"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." actual = somatic_neoepitope_prediction_workflow.get_resource( - "neoepitope_preparation", "run", resource + "pvacseq", "prepare", resource + )() + assert actual == expected, msg_error + + +def test_somatic_neoepitope_prediction_step_part_get_output_files( + somatic_neoepitope_prediction_workflow, +): + base_out = "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/out/MHC_Class_I/{tumor_library}" + expected = { + "all_epitopes": base_out + ".all_epitopes.tsv", + "all_epitopes_md5": base_out + ".all_epitopes.tsv.md5", + "filtered_epitopes": base_out + ".filtered.tsv", + "filtered_epitopes_md5": base_out + ".filtered.tsv.md5", + } + actual = somatic_neoepitope_prediction_workflow.get_output_files("pvacseq", "predict") + assert actual == expected + + +def test_somatic_neoepitope_prediction_step_part_get_log_files( + somatic_neoepitope_prediction_workflow, +): + base_out = ( + "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/" + "log/{tumor_library}" + ) + expected = get_expected_log_files_dict(base_out=base_out) + actual = somatic_neoepitope_prediction_workflow.get_log_file("pvacseq", "predict") + assert actual == expected + + +def test_somatic_neoepitope_prediction_step_part_get_resource( + somatic_neoepitope_prediction_workflow, +): + # Define expected + expected_dict = {"threads": 4, "time": "4:00:00", "memory": "30G"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_neoepitope_prediction_workflow.get_resource( + "pvacseq", "predict", resource )() assert actual == expected, msg_error @@ -166,54 +275,47 @@ def test_neoepitope_preparation_step_part_get_resource( def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["link_out", "neoepitope_preparation"] + expected = ["link_out", "pvacseq"] actual = list(sorted(somatic_neoepitope_prediction_workflow.sub_steps.keys())) assert actual == expected # Check result file construction tpl = ( - "output/{mapper}.{var_caller}.{anno_caller}.GX.P00{i}-T{t}-DNA1-WGS1/{dir_}/" - "{mapper}.{var_caller}.{anno_caller}.GX.P00{i}-T{t}-DNA1-WGS1.{ext}" + "output/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.P00{i}-T{t}-DNA1-WGS1.epitopes/" + "out/MHC_Class_I/P00{i}-T{t}-DNA1-WGS1{ext}" + ) + log_tpl = ( + "output/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.P00{i}-T{t}-DNA1-WGS1.epitopes/" + "log/P00{i}-T{t}-DNA1-WGS1{ext}" ) expected = [ - tpl.format( - mapper=mapper, - var_caller=var_caller, - anno_caller=anno_caller, - i=i, - t=t, - ext=ext, - dir_="out", + log_tpl.format( + mapper="bwa", var_caller=var_caller, anno_caller="vep", i=i, t=t, ext=ext, mode="GX" ) - for i, t in ((1, 1), (2, 2)) - for ext in ("vcf.gz", "vcf.gz.tbi", "vcf.gz.md5", "vcf.gz.tbi.md5") - for mapper in ("bwa",) + for (i, t) in ((1, 1), (2, 2)) for var_caller in ("mutect2", "scalpel") - for anno_caller in ("vep",) + for ext in ( + ".log", + ".log.md5", + ".conda_info.txt", + ".conda_info.txt.md5", + ".conda_list.txt", + ".conda_list.txt.md5", + ) ] expected += [ tpl.format( - mapper=mapper, - var_caller=var_caller, - anno_caller=anno_caller, - i=i, - t=t, - ext=ext, - dir_="log", + mapper="bwa", var_caller=var_caller, anno_caller="vep", i=i, t=t, ext=ext, mode="GX" ) - for i, t in ((1, 1), (2, 2)) + for (i, t) in ((1, 1), (2, 2)) + for var_caller in ("mutect2", "scalpel") for ext in ( - "conda_info.txt", - "conda_list.txt", - "log", - "conda_info.txt.md5", - "conda_list.txt.md5", - "log.md5", + ".all_epitopes.tsv", + ".filtered.tsv", + ".all_epitopes.tsv.md5", + ".filtered.tsv.md5", ) - for mapper in ("bwa",) - for var_caller in ("mutect2", "scalpel") - for anno_caller in ("vep",) ] expected = list(sorted(expected)) actual = list(sorted(somatic_neoepitope_prediction_workflow.get_result_files())) From 73e08c6dce538bb03277c2e7a58f03ef5f61a2df Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Wed, 17 Jul 2024 11:48:51 +0200 Subject: [PATCH 20/23] Adding pvacseq pipeline --- .../somatic_neoepitope_prediction/Snakefile | 58 +++- .../somatic_neoepitope_prediction/__init__.py | 280 +++++++++++++----- .../somatic_neoepitope_prediction/model.py | 72 ++++- .../pvactools/pvacseq/environment.yaml | 15 + .../wrappers/pvactools/pvacseq/wrapper.py | 159 ++++++++++ 5 files changed, 491 insertions(+), 93 deletions(-) create mode 100644 snappy_wrappers/wrappers/pvactools/pvacseq/environment.yaml diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile index de398ddc7..7b18f7238 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile @@ -26,7 +26,7 @@ wf = SomaticNeoepitopePredictionWorkflow(workflow, config, lookup_paths, config_ localrules: # Linking files from work/ to output/ should be done locally - neoepitope_preparation_link_out_run, + somatic_neoepitope_preparation_link_out_run, rule all: @@ -37,7 +37,7 @@ rule all: # Generic linking out --------------------------------------------------------- -rule neoepitope_preparation_link_out_run: +rule somatic_neoepitope_preparation_link_out_run: input: wf.get_input_files("link_out", "run"), output: @@ -46,20 +46,54 @@ rule neoepitope_preparation_link_out_run: shell(wf.get_shell_cmd("link_out", "run", wildcards)) -rule neoepitope_preparation: +rule somatic_neoepitope_prediction_pvactools_install: + output: + **wf.get_output_files("pvacseq", "install"), + params: + container="docker://griffithlab/pvactools", + resources: + time=wf.get_resource("pvacseq", "install", "time"), + memory=wf.get_resource("pvacseq", "install", "memory"), + partition=wf.get_resource("pvacseq", "install", "partition"), + log: + **wf.get_log_file("pvacseq", "install"), + wrapper: + wf.wrapper_path("singularity") + + +rule somatic_neoepitope_preparation: input: - unpack(wf.get_input_files("pvacseq","prepare")), + unpack(wf.get_input_files("pvacseq", "prepare")), output: - **wf.get_output_files("pvacseq","prepare"), + **wf.get_output_files("pvacseq", "prepare"), log: - **wf.get_log_file("pvacseq","prepare"), - threads: wf.get_resource("pvacseq","prepare", "threads") + **wf.get_log_file("pvacseq", "prepare"), + threads: wf.get_resource("pvacseq", "prepare", "threads") resources: - time=wf.get_resource("pvacseq","prepare", "time"), - memory=wf.get_resource("pvacseq","prepare", "memory"), - partition=wf.get_resource("pvacseq","prepare", "partition"), - tmpdir=wf.get_resource("pvacseq","prepare", "tmpdir"), + time=wf.get_resource("pvacseq", "prepare", "time"), + memory=wf.get_resource("pvacseq", "prepare", "memory"), + partition=wf.get_resource("pvacseq", "prepare", "partition"), + tmpdir=wf.get_resource("pvacseq", "prepare", "tmpdir"), params: - **{"args": wf.get_params("pvacseq","prepare")}, + **{"args": wf.get_params("pvacseq", "prepare")}, wrapper: wf.wrapper_path("pvactools/combining") + + +rule somatic_neoepitope_prediction: + input: + unpack(wf.get_input_files("pvacseq", "predict")), + output: + **wf.get_output_files("pvacseq", "predict"), + log: + **wf.get_log_file("pvacseq", "predict"), + threads: wf.get_resource("pvacseq", "predict", "threads") + resources: + time=wf.get_resource("pvacseq", "predict", "time"), + memory=wf.get_resource("pvacseq", "predict", "memory"), + partition=wf.get_resource("pvacseq", "predict", "partition"), + tmpdir=wf.get_resource("pvacseq", "predict", "tmpdir"), + params: + **{"args": wf.get_params("pvacseq", "predict")}, + wrapper: + wf.wrapper_path("pvactools/pvacseq") diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py index cfb3bb82c..7e485d0fe 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -46,6 +46,7 @@ SOMATIC_VARIANT_CALLERS_MATCHED, SomaticVariantCallingWorkflow, ) +from snappy_pipeline.workflows.hla_typing import HlaTypingWorkflow from .model import SomaticNeoepitopePrediction as SomaticNeoepitopePredictionConfigModel @@ -53,8 +54,13 @@ __email__ = "pham.gia-cuong@bih-charite.de" #: Extensions of files to create as main payload -EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5") - +PREPARE_EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5") +PREDICT_EXT_VALUES = ( + ".all_epitopes.tsv", + ".filtered.tsv", + ".all_epitopes.tsv.md5", + ".filtered.tsv.md5", +) #: Default configuration for the somatic_gene_fusion_calling step DEFAULT_CONFIG = SomaticNeoepitopePredictionConfigModel.default_config_yaml_string() @@ -66,7 +72,28 @@ class PvacSeqStepPart(BaseStepPart): #: Step name name = "pvacseq" - actions = ("prepare","predict") + + #: Actions + actions = ("install", "prepare", "predict") + + #: Resources + resource_usage = { + "install": ResourceUsage( + threads=2, + time="03:00:00", + memory="6G", + ), + "prepare": ResourceUsage( + threads=1, + time="01:00:00", + memory="6G", + ), + "predict": ResourceUsage( + threads=4, + time="4:00:00", + memory="30G", + ), + } def __init__(self, parent): super().__init__(parent) @@ -84,35 +111,50 @@ def __init__(self, parent): def get_input_files(self, action): self._validate_action(action) - return self._get_input_files_prepare - - # @dictify - # def _get_input_files_predict(self,wildcards): - # prepare_tpl = ( - # "work/prepare/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}/out/" - # "{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.vcf.gz" - # ) - # yield "combine_vcf",prepare_tpl - # hla_typing = self.parent.sub_workflows["hla_typing"] - # hla_tpl = { - # "output/{hla_tool}.{ngs_library}/out/{hla_tool}.{ngs_library}.txt" - # } - # for sheet in filter(is_not_background, self.parent.shortcut_sheets): - # for sample_pair in sheet.all_sample_pairs: - # if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library: - # yield "hla_normal_dna", hla_typing((hla_tpl).format( - # hla_tool=self.w_config.step_config["hla_typing"].tools[0], - # ngs_library=sample_pair.normal_sample.dna_ngs_library.name,) - # ) - # yield "hla_tumor_dna", hla_typing((hla_tpl).format( - # hla_tool=self.w_config.step_config["hla_typing"].tools[0], - # ngs_library=sample_pair.tumor_sample.dna_ngs_library.name,) - # ) - # yield "hla_tumor_rna", hla_typing((hla_tpl).format( - # hla_tool=self.w_config.step_config["hla_typing"].tools[0], - # ngs_library=sample_pair.tumor_sample.rna_ngs_library.name,) - # ) - # pass + # if action == "install": + # return {"container": "work/containers/out/pvactools.simg"} + if action == "prepare": + return self._get_input_files_prepare + if action == "predict": + return self._get_input_files_predict + + @dictify + def _get_input_files_predict(self, wildcards): + yield "container", "work/containers/out/pvactools.simg" + prepare_tpl = ( + "work/prepare/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.vcf.gz" + ) + yield "combine_vcf", prepare_tpl + hla_typing = self.parent.sub_workflows["hla_typing"] + hla_tpl = "output/optitype.{ngs_library}/out/optitype.{ngs_library}.txt" + for sheet in filter(is_not_background, self.parent.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library: + yield ( + "hla_normal_dna", + hla_typing( + (hla_tpl).format( + ngs_library=sample_pair.normal_sample.dna_ngs_library.name, + ) + ), + ) + yield ( + "hla_tumor_dna", + hla_typing( + (hla_tpl).format( + ngs_library=sample_pair.tumor_sample.dna_ngs_library.name, + ) + ), + ) + yield ( + "hla_tumor_rna", + hla_typing( + (hla_tpl).format( + ngs_library=sample_pair.tumor_sample.rna_ngs_library.name, + ) + ), + ) @dictify def _get_input_files_prepare(self, wildcards): @@ -120,6 +162,7 @@ def _get_input_files_prepare(self, wildcards): # It only works with vep now. # Validate action # self._validate_action(action) + # yield "container","work/containers/out/pvactools.simg" tpl = ( "output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/" "{mapper}.{var_caller}.{anno_caller}.{tumor_library}" @@ -130,32 +173,42 @@ def _get_input_files_prepare(self, wildcards): vep_key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"} variant_annotation = self.parent.sub_workflows["somatic_variant_annotation"] if self.config["preparation"]["format"] == "snappy_custom": - for key,ext in vep_key_ext.items(): - yield key,variant_annotation(tpl+ext) + for key, ext in vep_key_ext.items(): + yield key, variant_annotation(tpl + ext) ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - - star_key_ext = {"expression": ".GeneCounts.tab", - "bam": ".bam"} + star_key_ext = {"expression": ".GeneCounts.tab", "bam": ".bam"} for sheet in filter(is_not_background, self.parent.shortcut_sheets): for sample_pair in sheet.all_sample_pairs: if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library: - rna_tpl = ( - "output/{mapper}.{rna_library}/out/{mapper}.{rna_library}" - ) - for key,ext in star_key_ext.items(): - yield key, ngs_mapping((rna_tpl + ext).format( - mapper=self.w_config.step_config["ngs_mapping"].tools.rna[0], - rna_library=sample_pair.tumor_sample.rna_ngs_library.name,) - ) - - + rna_tpl = "output/{mapper}.{rna_library}/out/{mapper}.{rna_library}" + for key, ext in star_key_ext.items(): + yield ( + key, + ngs_mapping( + (rna_tpl + ext).format( + mapper=self.w_config.step_config["ngs_mapping"].tools.rna[0], + rna_library=sample_pair.tumor_sample.rna_ngs_library.name, + ) + ), + ) @dictify def get_output_files(self, action): """Return output files""" # Validate action self._validate_action(action) + if action == "install": + yield "container", "work/containers/out/pvactools.simg" + if action == "prepare": + path_prepare = self._get_output_files_prepare() + yield from path_prepare.items() + if action == "predict": + path_predict = self._get_output_files_predict() + yield from path_predict.items() + + @dictify + def _get_output_files_prepare(self): if self.config["preparation"]["mode"] == "gene": prefix = ( "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" @@ -171,43 +224,68 @@ def get_output_files(self, action): yield key, prefix + ext yield key + "_md5", prefix + ext + ".md5" + @dictify + def _get_output_files_predict(self): + key_ext = {"all_epitopes": ".all_epitopes.tsv", "filtered_epitopes": ".filtered.tsv"} + prefix = "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/out/MHC_Class_I/{tumor_library}" + for key, ext in key_ext.items(): + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + @dictify def _get_log_file(self, action): """Return mapping of log files.""" # Validate action self._validate_action(action) - if self.config["preparation"]["mode"] == "gene": - prefix = ( - "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" - "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + if action == "install": + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), ) - elif self.config["preparation"]["mode"] == "transcript": - prefix = ( - "work/preprare/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/" - "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" + for key, ext in key_ext: + yield key, "work/containers/log/pvactool_install" + ext + + if action == "prepare": + if self.config["preparation"]["mode"] == "gene": + prefix = ( + "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" + "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + ) + elif self.config["preparation"]["mode"] == "transcript": + prefix = ( + "work/preprare/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/" + "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" + ) + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), ) - key_ext = ( - ("log", ".log"), - ("conda_info", ".conda_info.txt"), - ("conda_list", ".conda_list.txt"), - ) - for key, ext in key_ext: - yield key, prefix + ext + for key, ext in key_ext: + yield key, prefix + ext - def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: - self._validate_action(action) - mem_mb = 6 * 1024 # 6GB - return ResourceUsage( - threads=2, - time="1:00:00", # 1 hour - memory=f"{mem_mb}M", - ) + if action == "predict": + prefix = ( + "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/" + "log/{tumor_library}" + ) + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext def get_params(self, action): self._validate_action(action) - return getattr(self,"_get_params_run") - - def _get_params_run(self,wildcards): + if action == "prepare": + return getattr(self, "_get_params_run_prepare") + if action == "predict": + return getattr(self, "_get_params_run_predict") + + def _get_params_run_prepare(self, wildcards): d = { "max_depth": self.config["preparation"]["max_depth"], "format": self.config["preparation"]["format"], @@ -218,6 +296,42 @@ def _get_params_run(self,wildcards): } return d + def _get_params_run_predict(self, wildcards): + d = { + "epitope_length": ",".join( + map(str, self.config["prediction"]["CLASS_I_EPITOPE_LENGTH"]) + ), + "algorithms": " ".join(self.config["prediction"]["algorithms"]), + "BINDING_THRESHOLD": self.config["prediction"]["BINDING_THRESHOLD"], + "percentile_threshold": self.config["prediction"]["percentile_threshold"], + "allele_specific_binding_thresholds": self.config["prediction"][ + "allele_specific_binding_thresholds" + ], + "aggregate_inclusion_binding_threshold": self.config["prediction"][ + "aggregate_inclusion_binding_threshold" + ], + "netmhc_stab": self.config["prediction"]["netmhc_stab"], + "NET_CHOP_THRESHOLD": self.config["prediction"]["NET_CHOP_THRESHOLD"], + "PROBLEMATIC_AMINO_ACIDS": self.config["prediction"]["PROBLEMATIC_AMINO_ACIDS"], + # 'run_reference_proteome_similarity': self.config["prediction"]["run_reference_proteome_similarity"], + "FAST_SIZE": self.config["prediction"]["FAST_SIZE"], + "exclude_NAs": self.config["prediction"]["exclude_NAs"], + "NORMAL_COV": self.config["prediction"]["NORMAL_COV"], + "TDNA_COV": self.config["prediction"]["TDNA_COV"], + "TRNA_COV": self.config["prediction"]["TRNA_COV"], + "NORMAL_VAF": self.config["prediction"]["NORMAL_VAF"], + "maximum_transcript_support_level": self.config["prediction"][ + "maximum_transcript_support_level" + ], + "pass_only": self.config["prediction"]["pass_only"], + "tumor_purity": self.config["prediction"]["tumor_purity"], + } + for sheet in filter(is_not_background, self.parent.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library: + d["normal_sample"] = sample_pair.normal_sample.dna_ngs_library.name + return d + class SomaticNeoepitopePredictionWorkflow(BaseStep): """Perform neoepitope prediction workflow""" @@ -245,6 +359,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) SomaticVariantCallingWorkflow, SomaticVariantAnnotationWorkflow, NgsMappingWorkflow, + HlaTypingWorkflow, ), ) # Register sub step classes so the sub steps are available @@ -258,6 +373,10 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) "ngs_mapping", config["path_rna_ngs_mapping"], ) + self.register_sub_workflow( + "hla_typing", + config["path_hla_typing"], + ) # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here if not config.tools_ngs_mapping: config.tools_ngs_mapping = self.w_config["step_config"]["ngs_mapping"].tools.dna @@ -274,21 +393,22 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) @listify def get_result_files(self): + cfg_mode = self.config["preparation"]["mode"] callers = set(self.config["tools_somatic_variant_calling"]) anno_callers = set(self.config["tools_somatic_variant_annotation"]) - if self.config["preparation"]["mode"] == "gene": - name_pattern = "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library.name}" - elif self.config["preparation"]["mode"] == "transcript": - name_pattern = "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library.name}" + predict_tpl = "output/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library.name}.epitopes/" yield from self._yield_result_files_matched( - os.path.join("output/prepare", name_pattern, "out", name_pattern + "{ext}"), + predict_tpl + "out/MHC_Class_I/{tumor_library.name}" + "{ext}", mapper=self.config["tools_ngs_mapping"], var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), anno_caller=anno_callers, - ext=EXT_VALUES, + mode="GX" if cfg_mode == "gene" else "TX", + ext=PREDICT_EXT_VALUES, ) + yield from self._yield_result_files_matched( - os.path.join("output/prepare", name_pattern, "log", name_pattern + "{ext}"), + predict_tpl + "log/{tumor_library.name}" + "{ext}", + mode="GX" if cfg_mode == "gene" else "TX", mapper=self.config["tools_ngs_mapping"], var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), anno_caller=anno_callers, diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py index 7abe6815f..6788a2bb4 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py @@ -32,13 +32,82 @@ class Preparation(SnappyModel): ignore_ensembl_id_version: bool = True """Ignore the ensemble id version""" max_depth: int = 4000 + "max_depth option of bcftools mpileup" + + +class Algorithms(enum.StrEnum): + BigMHC_EL = "BigMHC_EL" + BigMHC_IM = "BigMHC_IM" + DeepImmuno = "DeepImmuno" + MHCflurry = "MHCflurry" + MHCflurryEL = "MHCflurryEL" + MHCnuggetsI = "MHCnuggetsI" + MHCnuggetsII = "MHCnuggetsII" + NNalign = "NNalign" + NetMHC = "NetMHC" + NetMHCIIpan = "NetMHCIIpan" + NetMHCIIpanEL = "NetMHCIIpanEL" + NetMHCcons = "NetMHCcons" + NetMHCpan = "NetMHCpan" + NetMHCpanEL = "NetMHCpanEL" + PickPocket = "PickPocket" + SMM = "SMM" + SMMPMBEC = "SMMPMBEC" + SMMalign = "SMMalign" + all_class_i = "all_class_i" + + +class Prediction(SnappyModel): + CLASS_I_EPITOPE_LENGTH: list[int] = [8, 9, 10, 11] + "Length of MHC Class I subpeptides (neoepitopes) to predict." + "Multiple epitope lengths can be specified using a comma-separated list." + "Typical epitope lengths vary between 8-15." + algorithms: Annotated[list[Algorithms], EnumField(Algorithms, [])] + "The epitope prediction algorithms to use." + "For running with all prediction assign to ['all_class_i']" + # Following parameter please follow the documentation of pvactools to know its function + BINDING_THRESHOLD: int = 500 + percentile_threshold: float | None = Field(default=None) + allele_specific_binding_thresholds: bool = False + aggregate_inclusion_binding_threshold: int = 5000 + netmhc_stab: bool = False + NET_CHOP_THRESHOLD: float = 0.5 + PROBLEMATIC_AMINO_ACIDS: list[str] | None = Field(default=None) + # top-score-metric + # net-chop-method + # net-chop-threshold + + # E.g amino_acid:position - G:2 would check for a Glycine at + # the second position of the epitope + # run_reference_proteome_similarity: bool = False + # blastp_path + # blastp-db + # peptide-fasta + FAST_SIZE: int = 200 + exclude_NAs: bool = False + NORMAL_COV: int = 5 + TDNA_COV: int = 10 + TRNA_COV: int = 10 + NORMAL_VAF: float = 0.02 + maximum_transcript_support_level: int = Field(gt=0, lt=6, default=1) + # allele-specific-anchors + # anchor_contribution_threshold: float=0.8 + pass_only: bool = False + tumor_purity: float | None = Field(default=None) class SomaticNeoepitopePrediction(SnappyStepModel): path_somatic_variant_annotation: Annotated[ str, Field(examples=["../somatic_variant_annotation"]) ] - path_hla_typing : Annotated[str, Field(examples=["../hla_typing"])] + path_container: Annotated[ + str, Field(examples=["../somatic_neoepitope_prediction/work/containers/out/pvactools.simg"]) + ] + """ + Running somatic neoepitope prediction with pvactools + is required,with the container + """ + path_hla_typing: Annotated[str, Field(examples=["../hla_typing"])] path_rna_ngs_mapping: Annotated[str, Field(examples=["../ngs_mapping"])] tools_somatic_variant_annotation: list[str] = ["vep"] tools_ngs_mapping: list[str] = [] @@ -50,3 +119,4 @@ class SomaticNeoepitopePrediction(SnappyStepModel): tools_somatic_variant_calling: list[str] = [] """Deafult to those configured for somatic_variant_calling""" preparation: Preparation | None = None + prediction: Prediction | None = None diff --git a/snappy_wrappers/wrappers/pvactools/pvacseq/environment.yaml b/snappy_wrappers/wrappers/pvactools/pvacseq/environment.yaml new file mode 100644 index 000000000..7ef7fd65f --- /dev/null +++ b/snappy_wrappers/wrappers/pvactools/pvacseq/environment.yaml @@ -0,0 +1,15 @@ +channels: + - conda-forge + - bioconda + - defaults + - default +dependencies: + - python==3.6.15 + # - pip==21.3.1 + # - pip: + # - pvactools==3.1.3 + # - mhcflurry==2.0.1 + # - mhcnuggets==2.3.3 + # - git+https://github.com/griffithlab/bigmhc.git#egg=bigmhc + # - git+https://github.com/griffithlab/deepimmuno.git#egg=deepimmuno + # - tensorflow==1.5.0 \ No newline at end of file diff --git a/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py index e69de29bb..1b9c3cb9f 100644 --- a/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py +++ b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py @@ -0,0 +1,159 @@ +import os + +from snakemake.shell import shell + +__author__ = "Pham Gia Cuong" +__email__ = "pham.gia-cuong@bih-charite.de" + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step] + +algorithms = snakemake.params.args["algorithms"] +epitope_length = snakemake.params.args["epitope_length"] +normal_smaple = snakemake.params.args["normal_sample"] +BINDING_THRESHOLD = f"-b {snakemake.params.args["BINDING_THRESHOLD"]} " +percentile_threshold = ( + f"--percentile-threshold {snakemake.params.args["percentile_threshold"]} " + if snakemake.params.args["percentile_threshold"] != None + else "" +) + + +allele_specific_binding_thresholds = ( + f"--allele-specific-binding-thresholds " + if snakemake.params.args["allele_specific_binding_thresholds"] + else "" +) + +aggregate_inclusion_binding_threshold = ( + f"--aggregate-inclusion-binding-threshold {snakemake.params.args["aggregate_inclusion_binding_threshold"]} ", +) + +netmhc_stab = f"--netmhc-stab " if snakemake.params.args["netmhc_stab"] else "" + +NET_CHOP_THRESHOLD = (f"--net-chop-threshold {snakemake.params.args["NET_CHOP_THRESHOLD"]} ",) + +PROBLEMATIC_AMINO_ACIDS = ( + f"--problematic-amino-acids {snakemake.params.args["PROBLEMATIC_AMINO_ACIDS"]} ", +) + +# if {snakemake.params.args["run_reference_proteome_similarity"]}: +# run_reference_proteome_similarity= f"--run-reference-proteome-similarity", + +FAST_SIZE = (f"-s {snakemake.params.args["FAST_SIZE"]} ",) + +exclude_NAs = "--exclude-NAs " if snakemake.params.args["exclude_NAs"] else "" + +NORMAL_COV = (f"--normal-cov {snakemake.params.args["NORMAL_COV"]} ",) +TDNA_COV = (f"--tdna-cov {snakemake.params.args["TDNA_COV"] }",) +TRNA_COV = (f"--trna-cov {snakemake.params.args["TRNA_COV"]} ",) +NORMAL_VAF = (f"--normal-vaf {snakemake.params.args["NORMAL_VAF"]} ",) + +maximum_transcript_support_level = ( + f"--maximum-transcript-support-level {snakemake.params.args["maximum_transcript_support_level"]} ", +) + +pass_only = "--pass-only " if snakemake.params.args["pass_only"] else "" + +tumor_purity = ( + f"--tumor-purity {snakemake.params.args["tumor_purity"]} " + if snakemake.params.args["tumor_purity"] != None + else "" +) + +op_dir = "/".join(snakemake.output.all_epitopes.split("/")[:-2]) + +files_to_bind = { + "combine_vcf": snakemake.input.combine_vcf, + "op_dir": op_dir, +} + +# TODO: Put the following in a function (decide where...) +# Replace with full absolute paths +files_to_bind = {k: os.path.realpath(v) for k, v in files_to_bind.items()} +# Directories that mut be bound +dirs_to_bind = {k: os.path.dirname(v) for k, v in files_to_bind.items()} +# List of unique directories to bind: on cluster: -> from container: /bindings/d) +bound_dirs = {e[1]: e[0] for e in enumerate(list(set(dirs_to_bind.values())))} +# Binding command +bindings = " ".join(["-B {}:/bindings/d{}".format(k, v) for k, v in bound_dirs.items()]) +bound_files = { + k: "/bindings/d{}/{}".format(bound_dirs[dirs_to_bind[k]], os.path.basename(v)) + for k, v in files_to_bind.items() +} + +shell.executable("/bin/bash") + +shell( + r""" +# ----------------------------------------------------------------------------- +# Redirect stderr to log file by default and enable printing executed commands +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +# Setup auto-cleaned tmpdir +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT + +# Compute md5 checksum +md5() {{ + fn=$1 + d=$(dirname $fn) + f=$(basename $fn) + pushd $d 1> /dev/null 2>&1 + checksum=$(md5sum $f) + popd 1> /dev/null 2>&1 + echo "$checksum" +}} + +set -x +# ----------------------------------------------------------------------------- + +# Write out information about conda installation +conda list > {snakemake.log.conda_list} +conda info > {snakemake.log.conda_info} + +hla_types=$(cat {snakemake.input.hla_normal_dna} {snakemake.input.hla_tumor_dna} {snakemake.input.hla_tumor_rna} | sed 's/^/HLA-/' | sort | uniq | tr '\n' ',' | sed 's/,$//') +cmd="pvacseq run --normal-sample-name {normal_smaple} \ + -e1 {epitope_length} \ + --iedb-install-directory /opt/iedb \ + -t {snakemake.threads} \ + {bound_files[combine_vcf]} \ + {snakemake.wildcards.tumor_library} \ + $hla_types \ + {algorithms} \ + {bound_files[op_dir]} \ + {BINDING_THRESHOLD}\ + {percentile_threshold}\ + {allele_specific_binding_thresholds}\ + {aggregate_inclusion_binding_threshold}\ + {NET_CHOP_THRESHOLD}\ + {PROBLEMATIC_AMINO_ACIDS}\ + {FAST_SIZE}\ + {NORMAL_COV}\ + {TDNA_COV}\ + {TRNA_COV}\ + {NORMAL_VAF}\ + {exclude_NAs}\ + {maximum_transcript_support_level}" +echo 'TMPDIR=/bindings/d2' > $TMPDIR/{snakemake.wildcards.tumor_library}.sh +echo $cmd >> $TMPDIR/{snakemake.wildcards.tumor_library}.sh +apptainer exec --home $PWD -B $TMPDIR:/bindings/d2 {bindings} {config[path_container]} bash /bindings/d2/{snakemake.wildcards.tumor_library}.sh + +md5 {snakemake.output.all_epitopes} > {snakemake.output.all_epitopes_md5} +md5 {snakemake.output.filtered_epitopes} > {snakemake.output.filtered_epitopes_md5} +""" +) From 96702927838805364ee1e151fcd93cd8b2c42bb8 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Wed, 17 Jul 2024 12:09:05 +0200 Subject: [PATCH 21/23] make lint happy --- .../workflows/somatic_neoepitope_prediction/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py index 7e485d0fe..9721a32f1 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -32,7 +32,6 @@ """ from collections import OrderedDict -import os import sys from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background From 76e0c9735a4ea6676e5413503d7908e76d78e6c6 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Wed, 17 Jul 2024 12:19:00 +0200 Subject: [PATCH 22/23] Make linting satisfying --- .../workflows/somatic_neoepitope_prediction/model.py | 2 +- snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py index 6788a2bb4..a17452d03 100644 --- a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py @@ -104,7 +104,7 @@ class SomaticNeoepitopePrediction(SnappyStepModel): str, Field(examples=["../somatic_neoepitope_prediction/work/containers/out/pvactools.simg"]) ] """ - Running somatic neoepitope prediction with pvactools + Running somatic neoepitope prediction with pvactools is required,with the container """ path_hla_typing: Annotated[str, Field(examples=["../hla_typing"])] diff --git a/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py index 1b9c3cb9f..ec4245fd1 100644 --- a/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py +++ b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py @@ -1,6 +1,6 @@ import os -from snakemake.shell import shell +from snakemake import shell __author__ = "Pham Gia Cuong" __email__ = "pham.gia-cuong@bih-charite.de" @@ -14,13 +14,13 @@ BINDING_THRESHOLD = f"-b {snakemake.params.args["BINDING_THRESHOLD"]} " percentile_threshold = ( f"--percentile-threshold {snakemake.params.args["percentile_threshold"]} " - if snakemake.params.args["percentile_threshold"] != None + if snakemake.params.args["percentile_threshold"] is not None else "" ) allele_specific_binding_thresholds = ( - f"--allele-specific-binding-thresholds " + "--allele-specific-binding-thresholds " if snakemake.params.args["allele_specific_binding_thresholds"] else "" ) @@ -29,7 +29,7 @@ f"--aggregate-inclusion-binding-threshold {snakemake.params.args["aggregate_inclusion_binding_threshold"]} ", ) -netmhc_stab = f"--netmhc-stab " if snakemake.params.args["netmhc_stab"] else "" +netmhc_stab = "--netmhc-stab " if snakemake.params.args["netmhc_stab"] else "" NET_CHOP_THRESHOLD = (f"--net-chop-threshold {snakemake.params.args["NET_CHOP_THRESHOLD"]} ",) @@ -57,7 +57,7 @@ tumor_purity = ( f"--tumor-purity {snakemake.params.args["tumor_purity"]} " - if snakemake.params.args["tumor_purity"] != None + if snakemake.params.args["tumor_purity"] is not None else "" ) From 03c9714a9e8f642d225043bb013cad88ec487d75 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Wed, 17 Jul 2024 12:24:51 +0200 Subject: [PATCH 23/23] Reformat hla_typing snakefile --- snappy_pipeline/workflows/hla_typing/Snakefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/snappy_pipeline/workflows/hla_typing/Snakefile b/snappy_pipeline/workflows/hla_typing/Snakefile index 5b07da66d..3dc829c5a 100644 --- a/snappy_pipeline/workflows/hla_typing/Snakefile +++ b/snappy_pipeline/workflows/hla_typing/Snakefile @@ -53,6 +53,7 @@ rule hla_typing_link_in_run: # Generic linking out --------------------------------------------------------- + rule hla_typing_link_out_run: input: wf.get_input_files("link_out", "run"), @@ -67,6 +68,7 @@ rule hla_typing_link_out_run: # OptiType -------------------------------------------------------------------- if "optitype" in wf.w_config.step_config["hla_typing"].tools: + rule hla_typing_optitype_run: input: **wf.get_input_files("optitype", "run"), @@ -91,6 +93,7 @@ if "optitype" in wf.w_config.step_config["hla_typing"].tools: # NB: reference is updated in the installed package if "arcashla" in wf.w_config.step_config["hla_typing"].tools: + rule hla_typing_arcashla_prepare_reference: output: touch("work/arcashla.prepare_reference/out/.done"), @@ -99,7 +102,6 @@ if "arcashla" in wf.w_config.step_config["hla_typing"].tools: wrapper: wf.wrapper_path("arcashla/prepare_reference") - rule hla_typing_arcashla_run: input: unpack(wf.get_input_files("arcashla", "run")),