From ad07ebfc98e6a024b61724fa35151024eed9f9b7 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Sat, 24 Jun 2023 21:19:16 +0200 Subject: [PATCH 01/17] Adding wrapper for somatic variants QC --- .../somatic_variants_qc/enviroment.yaml | 8 + .../somatic_variants_qc/summarize-vcf.py | 265 ++++++++++++++++++ .../wrappers/somatic_variants_qc/wrapper.py | 40 +++ 3 files changed, 313 insertions(+) create mode 100644 snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml create mode 100644 snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py create mode 100644 snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml b/snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml new file mode 100644 index 000000000..beb5b1f3f --- /dev/null +++ b/snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda +dependencies: + - cyvcf2 + - pip=20.0.2 + - pip: + - pytabix diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py new file mode 100644 index 000000000..fb8ad450f --- /dev/null +++ b/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py @@ -0,0 +1,265 @@ +# import packages + +import json # for writing out the json file +from cyvcf2 import VCF # for reading vcf file +import tabix +from optparse import OptionParser + + +parser = OptionParser() +parser.add_option("--rawvcf", help="raw vcf file") +parser.add_option("--filtered-vcf", help="filtered vcf file") +parser.add_option( + "--exom-bedfile", + help="exom regions bed file", +) +parser.add_option("--padding", help="it's padding", default=0) +parser.add_option( + "--repeated-region-bedfile", + help="bed file contains repeated or difficult to map regions", +) +parser.add_option( + "--output", + help="json file contains information about vcf file", +) +(options, args) = parser.parse_args() + +# from pybedtools import Interval,BedTool + +# I will use for loop here +# It could be better by using multi threading. +# TODO: +# - Number of variants called (from the `*.full.vcf.gz`file) [X] Get all variants from full file +# - Number of variants that pass all filters [X] Get all variants from fitlered file +# - Number of variants passing the filters that are inside & outside of the exome bed file (with padding) [X] +# - Number of variants in repeated or other difficult to map regions [ ] Need a bed file containing repeated +# - Number of SNVs & indels [X] +# - Indel lengths statistics [X] +# - Number of variants with more than 1 ALT allele (I _think_ that this value should be 0 for all variants passing the filters) [ ] +# - Number of SNVs with minimal support (only one read supporting the variant), possibly separating inside & outside exome bed [X] +# - Number of SNVs with limited support (5 reads or less supporting the variant), possibly separating inside & outside exome bed [X] +# - Number of SNVs with strong support (at least 10 reads, VAF greater or equal to 10%) [X] VAF need to be joined +# - Number of SNVs in each mutation class (C>A, C>G, C>T, T>A, T>C, T>G), for minimal, limited, average & strong support [X] not for minimal,limited,average yet +# - Metrics on strand bias could also be collected (F1R2 & F2R1 vcf FORMAT) [ ] PAUSE +# - the VAF distribution. Perhaps we can split the variants (both SNVs & indels) into those with very little support (<1%), subclonal (<10%), the main part (<40%), affected by CNV (>40%)[ ] + + +# Adding A>C G>T grouping +# Table for VAF +# Genotype check > report different genotypes +# Checking number multi allelic +# Support alternative allel in the normal +# #alt in normal + average ALT + max ALT +# snappy pipeline for this + +##### SNAPPY PIPELINE +## WRAPPER +# - Checking the configuration file +# - Checking input file +## PIPELINE +# - __init__ : + + +def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0): + # print(f'variant chrom: {v_chrom} - start: {v_start} - end:{v_end}') + record = bed_intervals.query(v_chrom, v_start - padding, v_end + padding) + if record: + return True + else: + return False + + +# def calculate_exoms_length(bed_intervals): +# contigs= ["chr" + str(i) for i in range(1, 23)] +# contigs.append("chrX") +# contigs.append("chrY") +# length = 0 +# for contig in contigs: +# records = bed_intervals.query(contig,0,250000000) +# for record in records: +# length = length + int(record[2]) - int(record[1]) +# return length + + +def get_variant_type(ref, alt): + if (len(alt) == 1) and (len(ref) == len(alt)): + return "snp" + elif len(alt) != len(ref): + return "indels" + + +def check_sp_read(variant): + if get_variant_type(variant.REF, variant.ALT[0]) == "snp": + dp = variant.INFO["DP"] + if dp == 1: + return "minimal" + elif (dp > 1) and (dp <= 5): + return "limited" + if dp >= 10: + return "strong" + + +def assign_class_snvs(variant, mt_mat): + temp = str(variant.REF) + ">" + str(variant.ALT[0]) + # Following result from plot-vcfstats with bcftools + match temp: + case "A>C" | "C>A": + mt_mat[0] += 1 + case "A>G" | "G>A": + mt_mat[1] += 1 + case "A>T" | "T>A": + mt_mat[2] += 1 + case "C>G" | "G>C": + mt_mat[3] += 1 + case "C>T" | "T>C": + mt_mat[4] += 1 + case "G>T" | "T>G": + mt_mat[5] += 1 + return mt_mat + + +def process_vcf_file(vcf_file, bed_file="", filter_file=False, padding=0): + total_v_count = 0 + if filter_file: + # VAF + vaf = [] + # numb of variants in and out of exoms + v_inside_exom = 0 + v_outside_exom = 0 + # numb of snps and indels + n_snps = 0 + n_indels = 0 + indels_length = [] + # support read informations + minimal_rp_snvs_exom = 0 + minimal_rp_snvs_nexom = 0 + limited_rp_snvs_exom = 0 + limited_rp_snvs_nexom = 0 + strong_rp_snvs_exom = 0 + strong_rp_snvs_nexom = 0 + # mutation classes + mt_classes = [0] * 6 + for variant in vcf_file: + vaf.append(variant.format("AF")[1]) + total_v_count += 1 + # Counting variants in or out of exom and variants with different support reads + if "chrUn" in variant.CHROM: + v_outside_exom += 1 + if check_sp_read(variant) == "minimal": + minimal_rp_snvs_nexom += 1 + elif check_sp_read(variant) == "limited": + limited_rp_snvs_nexom += 1 + else: + strong_rp_snvs_nexom += 1 + elif check_variant_in_bed( + variant.CHROM, variant.start, variant.end, bed_file, padding=0 + ): + v_inside_exom += 1 + if check_sp_read(variant) == "minimal": + minimal_rp_snvs_exom += 1 + elif check_sp_read(variant) == "limited": + limited_rp_snvs_exom += 1 + else: + strong_rp_snvs_exom += 1 + + else: + v_outside_exom += 1 + if check_sp_read(variant) == "minimal": + minimal_rp_snvs_nexom += 1 + elif check_sp_read(variant) == "limited": + limited_rp_snvs_nexom += 1 + else: + strong_rp_snvs_nexom += 1 + # get number of snvs and indels + # Need to check multi allelic. Users shouldn't input multi allelic vcf file. + if get_variant_type(variant.REF, variant.ALT[0]) == "snp": + n_snps += 1 + mt_classes = assign_class_snvs(variant, mt_classes) + else: + # More for indels + n_indels += 1 + indels_length.append(abs(len(variant.REF) - len(variant.ALT[0]))) + + return ( + total_v_count, + v_inside_exom, + v_outside_exom, + n_snps, + n_indels, + indels_length, + minimal_rp_snvs_exom, + minimal_rp_snvs_nexom, + limited_rp_snvs_exom, + limited_rp_snvs_nexom, + strong_rp_snvs_exom, + strong_rp_snvs_nexom, + mt_classes, + vaf, + ) + else: + for variant in vcf_file: + total_v_count += 1 + return total_v_count + + +def main(): + classes = ["A>C", "A>G", "A>T", "C>G", "C>T", "G>T"] + path_full_vcf = options.rawvcf + path_filtered_vcf = options.filtered_vcf + path_bedfile = options.exom_bedfile + outpath = options.output + full_vcf = VCF(path_full_vcf) + filter_vcf = VCF(path_filtered_vcf) + # header = filter_vcf.raw_header + sample = filter_vcf.samples[1] + # read bed file + bed_intervals = tabix.open(path_bedfile) + full_v_count = process_vcf_file(full_vcf) + ( + passed_v_count, + v_inside_exom, + v_outside_exom, + n_snps, + n_indels, + indels_length, + minimal_rp_snvs_exom, + minimal_rp_snvs_nexom, + limited_rp_snvs_exom, + limited_rp_snvs_nexom, + strong_rp_snvs_exom, + strong_rp_snvs_nexom, + mt_classes, + vaf, + ) = process_vcf_file(filter_vcf, bed_intervals, filter_file=True, padding=options.padding) + + # After one for loop through vcf file. The VCF will automatically be closed. + # length_exoms = calculate_exoms_length(bed_intervals) + # for variant in filter_vcf: + # print(variant.format("AF")) + summary = { + "sample": sample, + "number_full_variants": full_v_count, + "number_passed_variants": passed_v_count, + "number_snvs_inside_exom": v_inside_exom, + "number_snvs_outside_exom": v_outside_exom, + "number_snps": n_snps, + "number_indels": n_indels, + "indels_length": indels_length, + "minimal_rp_snvs_exom": minimal_rp_snvs_exom, + "minimal_rp_snvs_nexom": minimal_rp_snvs_nexom, + "limited_rp_snvs_exom": limited_rp_snvs_exom, + "limited_rp_snvs_nexom": limited_rp_snvs_nexom, + "strong_rp_snvs_exom": strong_rp_snvs_exom, + "strong_rp_snvs_nexom": strong_rp_snvs_nexom, + "mutation_classes": mt_classes, + "classes": classes, + "VAF": vaf, + } + json_object = json.dumps(summary, indent=4) + + with open(outpath, "w") as outfile: + outfile.write(json_object) + + +if __name__ == "__main__": + main() diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py new file mode 100644 index 000000000..197240cdf --- /dev/null +++ b/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +"""Wrapper for summarize information of vcf file after +somatic variant calling step +""" +from snakemake.shell import shell + +__author__ = "Pham Gia Cuong" +__email__ = "pham.gia-cuong@bih-charite.de" + +shell( + r""" +# ----------------------------------------------------------------------------- +# Redirect stderr to log file by default and enable printing executed commands +exec 2> >(tee -a "{snakemake.log.log}") +set -x +# ----------------------------------------------------------------------------- + +# Write out information about conda installation +conda list > {snakemake.log.conda_list} +conda info > {snakemake.log.conda_info} + +python summarize-vcf.py --rawvcf {snakemake.input.raw_vcf} \ + --filtered-vcf {snakemake.input.passed_vcf} \ + --exom-bedfile {snakemake.config[step_config][somatic_variants_QC][target_regions]} \ + --padding {snakemake.config[step_config][somatic_variants_QC][padding]} \ + --output {snakemake.output.json} + +pushd $(dirname {snakemake.output.json}) +md5sum $(basename {snakemake.output.json}) > $(basename {snakemake.output.json_md5}) + """ +) + +# Compute MD5 sums of logs +shell( + r""" +md5sum {snakemake.log.log} > {snakemake.log.log_md5} +md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5} +""" +) From 94cda2782217ad0948962358a3731772c36934ac Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Sun, 25 Jun 2023 20:36:27 +0200 Subject: [PATCH 02/17] Adding somatic variant qc pipeline --- snappy_pipeline/apps/snappy_snake.py | 2 + .../workflows/somatic_variant_qc/Snakefile | 64 +++++ .../workflows/somatic_variant_qc/__init__.py | 223 ++++++++++++++++++ .../tumor_mutational_burden/Snakefile | 3 +- .../wrappers/bcftools/TMB/wrapper.py | 2 +- .../{enviroment.yaml => environment.yaml} | 0 .../somatic_variants_qc/summarize-vcf.py | 84 +++---- .../wrappers/somatic_variants_qc/wrapper.py | 8 +- 8 files changed, 341 insertions(+), 45 deletions(-) create mode 100644 snappy_pipeline/workflows/somatic_variant_qc/Snakefile create mode 100644 snappy_pipeline/workflows/somatic_variant_qc/__init__.py rename snappy_wrappers/wrappers/somatic_variants_qc/{enviroment.yaml => environment.yaml} (100%) diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py index c47f50b94..9348adb36 100644 --- a/snappy_pipeline/apps/snappy_snake.py +++ b/snappy_pipeline/apps/snappy_snake.py @@ -44,6 +44,7 @@ somatic_variant_expression, somatic_variant_filtration, somatic_variant_signatures, + somatic_variant_qc, somatic_wgs_cnv_calling, somatic_wgs_sv_calling, sv_calling_targeted, @@ -99,6 +100,7 @@ "somatic_variant_expression": somatic_variant_expression, "somatic_variant_filtration": somatic_variant_filtration, "somatic_variant_signatures": somatic_variant_signatures, + "somatic_variant_qc": somatic_variant_qc, "somatic_wgs_cnv_calling": somatic_wgs_cnv_calling, "somatic_wgs_sv_calling": somatic_wgs_sv_calling, "sv_calling_targeted": sv_calling_targeted, diff --git a/snappy_pipeline/workflows/somatic_variant_qc/Snakefile b/snappy_pipeline/workflows/somatic_variant_qc/Snakefile new file mode 100644 index 000000000..2ac7ab432 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_variant_qc/Snakefile @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- +"""CUBI Pipeline tumor mutational burden step Snakefile""" + +import os + +from snappy_pipeline import expand_ref +from snappy_pipeline.workflows.somatic_variant_qc import ( + SomaticVariantQCWorkflow, +) + +__author__ = "Gia Cuong Pham" +__email__ = "pham.gia-cuong@bih-charite.de" + + +# Configuration =============================================================== + + +configfile: "config.yaml" + + +# Expand "$ref" JSON pointers in configuration (also works for YAML) +config, lookup_paths, config_paths = expand_ref("config.yaml", config) + +# WorkflowImpl Object Setup =================================================== +wf = SomaticVariantQCWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) + + +localrules: + # Linking files from work/ to output/ should be done locally + somatic_variant_qc_link_out_run, + + +rule all: + input: + wf.get_result_files(), + + +# Generic linking out --------------------------------------------------------- + + +rule somatic_variant_qc_link_out_run: + input: + wf.get_input_files("link_out", "run"), + output: + wf.get_output_files("link_out", "run"), + run: + shell(wf.get_shell_cmd("link_out", "run", wildcards)) + + +rule somatic_variant_qc: + input: + **wf.get_input_files("SVQC_step", "run"), + output: + **wf.get_output_files("SVQC_step", "run"), + threads: wf.get_resource("SVQC_step", "run", "threads") + resources: + time=wf.get_resource("SVQC_step", "run", "time"), + memory=wf.get_resource("SVQC_step", "run", "memory"), + partition=wf.get_resource("SVQC_step", "run", "partition"), + tmpdir=wf.get_resource("SVQC_step", "run", "tmpdir"), + log: + **wf.get_log_file("SVQC_step", "run"), + wrapper: + wf.wrapper_path("somatic_variants_qc") diff --git a/snappy_pipeline/workflows/somatic_variant_qc/__init__.py b/snappy_pipeline/workflows/somatic_variant_qc/__init__.py new file mode 100644 index 000000000..5ce9c4349 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_variant_qc/__init__.py @@ -0,0 +1,223 @@ +from collections import OrderedDict +import os +import sys + +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background +from snakemake.io import expand + +from snappy_pipeline.base import UnsupportedActionException +from snappy_pipeline.utils import dictify, listify +from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart +from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage +from snappy_pipeline.workflows.somatic_variant_calling import ( + SOMATIC_VARIANT_CALLERS_MATCHED, + SomaticVariantCallingWorkflow, +) + +#: Extensions of files to create as main payload +EXT_VALUES = (".json", ".json.md5") + +#: Names of the files to create for the extension +EXT_NAMES = ("json", "json_md5") + +#: Default configuration for the tmb calculation step +DEFAULT_CONFIG = r""" +step_config: + somatic_variant_qc: + path_somatic_variant_calling: ../somatic_variant_calling # REQUIRED + tools_ngs_mapping: [] # default to those configured for ngs_mapping + tools_somatic_variant_calling: [] # default to those configured for somatic_variant_calling + target_regions: # REQUIRED + padding: 0 #Used for count the number of variants outside of exom + padding + repeated_regions: [] #need to confirm +""" + + +class SomaticVariantQCStepPart(BaseStepPart): + """""" + + name = "SVQC_step" + + actions = ("run",) + + def __init__(self, parent): + super().__init__(parent) + self.tumor_ngs_library_to_sample_pair = OrderedDict() + for sheet in self.parent.shortcut_sheets: + # update function of OrderedDict + self.tumor_ngs_library_to_sample_pair.update( + sheet.all_sample_pairs_by_tumor_dna_ngs_library + ) + # Build mapping from donor name to donor. + self.donors = OrderedDict() + for sheet in self.parent.shortcut_sheets: + for donor in sheet.donors: + self.donors[donor.name] = donor + + @dictify + def get_input_files(self, action): + self._validate_action(action) + tpl = ( + "output/{mapper}.{var_caller}.{tumor_library}/out/" + "{mapper}.{var_caller}.{tumor_library}" + ) + key_ext = { + "full_vcf": ".full.vcf.gz", + "full_vcf_tbi": ".full.vcf.gz.tbi", + "passed_vcf": ".vcf.gz", + "passed_vcf_tbi": ".vcf.gz.tbi", + } + variant_calling = self.parent.sub_workflows["somatic_variant_calling"] + for key, ext in key_ext.items(): + yield key, variant_calling(tpl + ext) + + @dictify + def get_output_files(self, action): + # Validate action + self._validate_action(action) + prefix = ( + "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/out/" + "{mapper}.{var_caller}.variantsqc.{tumor_library}" + ) + key_ext = {"json": ".json"} + for key, ext in key_ext.items(): + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + @dictify + def _get_log_file(self, action): + assert action == "run" + prefix = ( + "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/" + "{mapper}.{var_caller}.variantsqc.{tumor_library}" + ) + + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + + def get_resource_usage(self, action): + """Get Resource Usage + :param action: Action (i.e., step) in the workflow, example: 'run'. + :type action: str + :return: Returns ResourceUsage for step. + :raises UnsupportedActionException: if action not in class defined list of valid actions. + """ + if action not in self.actions: + actions_str = ", ".join(self.actions) + error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" + raise UnsupportedActionException(error_message) + mem_mb = 4 * 1024 # 4GB + return ResourceUsage( + threads=4, + time="1:00:00", # 1 hour + memory=f"{mem_mb}M", + ) + + +class SomaticVariantQCWorkflow(BaseStep): + """Perform gathering variant information""" + + name = "somaticvariantqc" + sheet_shortcut_class = CancerCaseSheet + sheet_shortcut_kwargs = { + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) + } + + @classmethod + def default_config_yaml(cls): + """Return default config YAML, to be overwritten by project-specific one.""" + return DEFAULT_CONFIG + + def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + (SomaticVariantCallingWorkflow, NgsMappingWorkflow), + ) + # Register sub step classes so the sub steps are available + self.register_sub_step_classes((SomaticVariantQCStepPart, LinkOutStepPart)) + # Register sub workflows + self.register_sub_workflow( + "somatic_variant_calling", + self.w_config["step_config"]["somatic_variant_qc"]["path_somatic_variant_calling"], + ) + # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here + if not self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"]: + self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"] = self.w_config[ + "step_config" + ]["ngs_mapping"]["tools"]["dna"] + if not self.w_config["step_config"]["somatic_variant_qc"]["tools_somatic_variant_calling"]: + self.w_config["step_config"]["somatic_variant_qc"][ + "tools_somatic_variant_calling" + ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"] + + @listify + def get_result_files(self): + callers = set( + self.w_config["step_config"]["somatic_variant_qc"]["tools_somatic_variant_calling"] + ) + name_pattern = "{mapper}.{caller}.variantsqc.{tumor_library.name}" + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"], + caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + ext=EXT_VALUES, + ) + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"], + caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + ext=( + ".log", + ".log.md5", + ".conda_info.txt", + ".conda_info.txt.md5", + ".conda_list.txt", + ".conda_list.txt.md5", + ), + ) + + def _yield_result_files_matched(self, tpl, **kwargs): + """Build output paths from path template and extension list. + + This function returns the results from the matched somatic variant callers such as + Mutect. + """ + for sheet in filter(is_not_background, self.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if ( + not sample_pair.tumor_sample.dna_ngs_library + or not sample_pair.normal_sample.dna_ngs_library + ): + msg = ( + "INFO: sample pair for cancer bio sample {} has is missing primary" + "normal or primary cancer NGS library" + ) + print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) + continue + yield from expand( + tpl, + tumor_library=[sample_pair.tumor_sample.dna_ngs_library], + **kwargs, + ) + + def check_config(self): + """Check that the path to the NGS mapping is present""" + self.ensure_w_config( + ("step_config", "somatic_variant_qc", "path_somatic_variant_calling"), + "Path to variant calling not configured but required for somatic variant qc", + ) + + self.ensure_w_config( + ("step_config", "somatic_variant_qc", "target_regions"), + "Path to target_regions file (bed format)" + "not configured but required for somatic variant qc", + ) diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile b/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile index 27da7d775..3b3a2ee2b 100644 --- a/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile +++ b/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile @@ -8,7 +8,8 @@ from snappy_pipeline.workflows.tumor_mutational_burden import ( TumorMutationalBurdenCalculationWorkflow, ) -__author__ = "" +__author__ = "Gia Cuong Pham" +__email__ = "pham.gia-cuong@bih-charite.de" # Configuration =============================================================== diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py index 6bd7a88ba..e06829437 100644 --- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py @@ -46,10 +46,10 @@ "Number_snvs": $number_snvs, "Number_indels": $number_indels, "Total_regions_length": $total_exom_length +} EOF pushd $(dirname {snakemake.output.json}) md5sum $(basename {snakemake.output.json}) > $(basename {snakemake.output.json_md5}) -} """ ) diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml b/snappy_wrappers/wrappers/somatic_variants_qc/environment.yaml similarity index 100% rename from snappy_wrappers/wrappers/somatic_variants_qc/enviroment.yaml rename to snappy_wrappers/wrappers/somatic_variants_qc/environment.yaml diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py index fb8ad450f..19a3e9087 100644 --- a/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py +++ b/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py @@ -59,6 +59,10 @@ ## PIPELINE # - __init__ : +CONTIG = ["chr" + str(i) for i in range(1, 22)] +CONTIG.append("chrX") +CONTIG.append("chrY") + def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0): # print(f'variant chrom: {v_chrom} - start: {v_start} - end:{v_end}') @@ -70,7 +74,7 @@ def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0): # def calculate_exoms_length(bed_intervals): -# contigs= ["chr" + str(i) for i in range(1, 23)] +# contigs= ["chr" + str(i) for i in range(1, 22)] # contigs.append("chrX") # contigs.append("chrY") # length = 0 @@ -140,45 +144,46 @@ def process_vcf_file(vcf_file, bed_file="", filter_file=False, padding=0): # mutation classes mt_classes = [0] * 6 for variant in vcf_file: - vaf.append(variant.format("AF")[1]) - total_v_count += 1 - # Counting variants in or out of exom and variants with different support reads - if "chrUn" in variant.CHROM: - v_outside_exom += 1 - if check_sp_read(variant) == "minimal": - minimal_rp_snvs_nexom += 1 - elif check_sp_read(variant) == "limited": - limited_rp_snvs_nexom += 1 - else: - strong_rp_snvs_nexom += 1 - elif check_variant_in_bed( - variant.CHROM, variant.start, variant.end, bed_file, padding=0 - ): - v_inside_exom += 1 - if check_sp_read(variant) == "minimal": - minimal_rp_snvs_exom += 1 - elif check_sp_read(variant) == "limited": - limited_rp_snvs_exom += 1 - else: - strong_rp_snvs_exom += 1 + if variant.CHROM in CONTIG: + vaf.append(float(variant.format("AF")[1][0])) + total_v_count += 1 + # Counting variants in or out of exom and variants with different support reads + if "chrUn" in variant.CHROM: + v_outside_exom += 1 + if check_sp_read(variant) == "minimal": + minimal_rp_snvs_nexom += 1 + elif check_sp_read(variant) == "limited": + limited_rp_snvs_nexom += 1 + else: + strong_rp_snvs_nexom += 1 + elif check_variant_in_bed( + variant.CHROM, variant.start, variant.end, bed_file, padding=0 + ): + v_inside_exom += 1 + if check_sp_read(variant) == "minimal": + minimal_rp_snvs_exom += 1 + elif check_sp_read(variant) == "limited": + limited_rp_snvs_exom += 1 + else: + strong_rp_snvs_exom += 1 - else: - v_outside_exom += 1 - if check_sp_read(variant) == "minimal": - minimal_rp_snvs_nexom += 1 - elif check_sp_read(variant) == "limited": - limited_rp_snvs_nexom += 1 else: - strong_rp_snvs_nexom += 1 - # get number of snvs and indels - # Need to check multi allelic. Users shouldn't input multi allelic vcf file. - if get_variant_type(variant.REF, variant.ALT[0]) == "snp": - n_snps += 1 - mt_classes = assign_class_snvs(variant, mt_classes) - else: - # More for indels - n_indels += 1 - indels_length.append(abs(len(variant.REF) - len(variant.ALT[0]))) + v_outside_exom += 1 + if check_sp_read(variant) == "minimal": + minimal_rp_snvs_nexom += 1 + elif check_sp_read(variant) == "limited": + limited_rp_snvs_nexom += 1 + else: + strong_rp_snvs_nexom += 1 + # get number of snvs and indels + # Need to check multi allelic. Users shouldn't input multi allelic vcf file. + if get_variant_type(variant.REF, variant.ALT[0]) == "snp": + n_snps += 1 + mt_classes = assign_class_snvs(variant, mt_classes) + else: + # More for indels + n_indels += 1 + indels_length.append(abs(len(variant.REF) - len(variant.ALT[0]))) return ( total_v_count, @@ -231,7 +236,6 @@ def main(): mt_classes, vaf, ) = process_vcf_file(filter_vcf, bed_intervals, filter_file=True, padding=options.padding) - # After one for loop through vcf file. The VCF will automatically be closed. # length_exoms = calculate_exoms_length(bed_intervals) # for variant in filter_vcf: @@ -255,7 +259,7 @@ def main(): "classes": classes, "VAF": vaf, } - json_object = json.dumps(summary, indent=4) + json_object = json.dumps(summary) with open(outpath, "w") as outfile: outfile.write(json_object) diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py index 197240cdf..81dd9fa87 100644 --- a/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py +++ b/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py @@ -2,11 +2,13 @@ """Wrapper for summarize information of vcf file after somatic variant calling step """ +import os from snakemake.shell import shell __author__ = "Pham Gia Cuong" __email__ = "pham.gia-cuong@bih-charite.de" +base_dir = os.path.dirname(os.path.realpath(__file__)) shell( r""" # ----------------------------------------------------------------------------- @@ -19,10 +21,10 @@ conda list > {snakemake.log.conda_list} conda info > {snakemake.log.conda_info} -python summarize-vcf.py --rawvcf {snakemake.input.raw_vcf} \ +python {base_dir}/summarize-vcf.py --rawvcf {snakemake.input.full_vcf} \ --filtered-vcf {snakemake.input.passed_vcf} \ - --exom-bedfile {snakemake.config[step_config][somatic_variants_QC][target_regions]} \ - --padding {snakemake.config[step_config][somatic_variants_QC][padding]} \ + --exom-bedfile {snakemake.config[step_config][somatic_variant_qc][target_regions]} \ + --padding {snakemake.config[step_config][somatic_variant_qc][padding]} \ --output {snakemake.output.json} pushd $(dirname {snakemake.output.json}) From bd7b4113a93a63242502ead8d05544658a201f6b Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Fri, 30 Jun 2023 20:33:03 +0200 Subject: [PATCH 03/17] Update somatic_variant_checking --- snappy_pipeline/apps/snappy_snake.py | 2 - .../somatic_variant_checking/Snakefile | 64 +++++ .../somatic_variant_checking/__init__.py | 259 +++++++++++++---- .../wrappers/bcftools/TMB/wrapper.py | 4 + .../environment.yaml | 0 .../summarize-vcf.py | 254 +++++++++++++++++ .../wrapper.py | 7 +- .../somatic_variants_qc/summarize-vcf.py | 269 ------------------ 8 files changed, 538 insertions(+), 321 deletions(-) create mode 100644 snappy_pipeline/workflows/somatic_variant_checking/Snakefile rename snappy_wrappers/wrappers/{somatic_variants_qc => somatic_variants_checking}/environment.yaml (100%) create mode 100644 snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py rename snappy_wrappers/wrappers/{somatic_variants_qc => somatic_variants_checking}/wrapper.py (76%) delete mode 100644 snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py index 9348adb36..c47f50b94 100644 --- a/snappy_pipeline/apps/snappy_snake.py +++ b/snappy_pipeline/apps/snappy_snake.py @@ -44,7 +44,6 @@ somatic_variant_expression, somatic_variant_filtration, somatic_variant_signatures, - somatic_variant_qc, somatic_wgs_cnv_calling, somatic_wgs_sv_calling, sv_calling_targeted, @@ -100,7 +99,6 @@ "somatic_variant_expression": somatic_variant_expression, "somatic_variant_filtration": somatic_variant_filtration, "somatic_variant_signatures": somatic_variant_signatures, - "somatic_variant_qc": somatic_variant_qc, "somatic_wgs_cnv_calling": somatic_wgs_cnv_calling, "somatic_wgs_sv_calling": somatic_wgs_sv_calling, "sv_calling_targeted": sv_calling_targeted, diff --git a/snappy_pipeline/workflows/somatic_variant_checking/Snakefile b/snappy_pipeline/workflows/somatic_variant_checking/Snakefile new file mode 100644 index 000000000..b73514711 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_variant_checking/Snakefile @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- +"""CUBI Pipeline tumor mutational burden step Snakefile""" + +import os + +from snappy_pipeline import expand_ref +from snappy_pipeline.workflows.somatic_variant_checking import ( + SomaticVariantQCWorkflow, +) + +__author__ = "Gia Cuong Pham" +__email__ = "pham.gia-cuong@bih-charite.de" + + +# Configuration =============================================================== + + +configfile: "config.yaml" + + +# Expand "$ref" JSON pointers in configuration (also works for YAML) +config, lookup_paths, config_paths = expand_ref("config.yaml", config) + +# WorkflowImpl Object Setup =================================================== +wf = SomaticVariantQCWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) + + +localrules: + # Linking files from work/ to output/ should be done locally + somatic_variant_qc_link_out_run, + + +rule all: + input: + wf.get_result_files(), + + +# Generic linking out --------------------------------------------------------- + + +rule somatic_variant_qc_link_out_run: + input: + wf.get_input_files("link_out", "run"), + output: + wf.get_output_files("link_out", "run"), + run: + shell(wf.get_shell_cmd("link_out", "run", wildcards)) + + +rule somatic_variant_qc: + input: + **wf.get_input_files("SVQC_step", "run"), + output: + **wf.get_output_files("SVQC_step", "run"), + threads: wf.get_resource("SVQC_step", "run", "threads") + resources: + time=wf.get_resource("SVQC_step", "run", "time"), + memory=wf.get_resource("SVQC_step", "run", "memory"), + partition=wf.get_resource("SVQC_step", "run", "partition"), + tmpdir=wf.get_resource("SVQC_step", "run", "tmpdir"), + log: + **wf.get_log_file("SVQC_step", "run"), + wrapper: + wf.wrapper_path("somatic_variants_checking") diff --git a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py index aea5ebf23..5c1367333 100644 --- a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py @@ -1,56 +1,219 @@ -# -*- coding: utf-8 -*- -"""Implementation of the germline ``somatic_variant_checking`` step +import os +import sys -The ``somatic_variant_checking`` step takes as the input the results of the -``somatic_variant_annotation`` step. It then executes various tools computing statistics on the -result files and consistency checks with the pedigrees. +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background +from snakemake.io import expand -.. note:: +from snappy_pipeline.base import UnsupportedActionException +from snappy_pipeline.utils import dictify, listify +from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart +from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage +from snappy_pipeline.workflows.somatic_variant_calling import ( + SOMATIC_VARIANT_CALLERS_MATCHED, + SomaticVariantCallingWorkflow, +) - Status: not implemented yet +#: Extensions of files to create as main payload +EXT_VALUES = (".json", ".json.md5") -========== -Step Input -========== +#: Names of the files to create for the extension +EXT_NAMES = ("json", "json_md5") -The variant calling step uses Snakemake sub workflows for using the result of the -``somatic_variant_annotation`` step. - -=========== -Step Output -=========== - -.. note:: TODO - -==================== -Global Configuration -==================== - -.. note:: TODO - -===================== -Default Configuration -===================== - -The default configuration is as follows. - -.. include:: DEFAULT_CONFIG_somatic_variant_checking.rst - -======= -Reports -======= - -Currently, no reports are generated. -""" - -__author__ = "Manuel Holtgrewe " - -#: Available tools for checking variants -VARIANT_CHECKERS = ("bcftools_stats", "peddy") - -#: Default configuration for the somatic_gene_fusion_calling step +#: Default configuration for the tmb calculation step DEFAULT_CONFIG = r""" step_config: - somatic_variant_checking: - path_somatic_variant_calling: ../somatic_variant_calling # REQUIRED + somatic_variant_checking: + path_somatic_variant_calling: ../somatic_variant_calling # REQUIRED + tools_ngs_mapping: [] # default to those configured for ngs_mapping + tools_somatic_variant_calling: [] # default to those configured for somatic_variant_calling + target_regions: # REQUIRED + padding: 0 #Used for count the number of variants outside of exom + padding + ignore_regions: [] + minimal_support_read: 1 + limited_support_read: 5 """ + + +class SomaticVariantQCStepPart(BaseStepPart): + """""" + + name = "SVQC_step" + + actions = ("run",) + + def __init__(self, parent): + super().__init__(parent) + + @dictify + def get_input_files(self, action): + self._validate_action(action) + tpl = ( + "output/{mapper}.{var_caller}.{tumor_library}/out/" + "{mapper}.{var_caller}.{tumor_library}" + ) + key_ext = { + "full_vcf": ".full.vcf.gz", + "full_vcf_tbi": ".full.vcf.gz.tbi", + "passed_vcf": ".vcf.gz", + "passed_vcf_tbi": ".vcf.gz.tbi", + } + variant_calling = self.parent.sub_workflows["somatic_variant_calling"] + for key, ext in key_ext.items(): + yield key, variant_calling(tpl + ext) + + @dictify + def get_output_files(self, action): + # Validate action + self._validate_action(action) + prefix = ( + "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/out/" + "{mapper}.{var_caller}.variantsqc.{tumor_library}" + ) + key_ext = {"json": ".json"} + for key, ext in key_ext.items(): + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + @dictify + def _get_log_file(self, action): + assert action == "run" + prefix = ( + "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/" + "{mapper}.{var_caller}.variantsqc.{tumor_library}" + ) + + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + + def get_resource_usage(self, action): + """Get Resource Usage + :param action: Action (i.e., step) in the workflow, example: 'run'. + :type action: str + :return: Returns ResourceUsage for step. + :raises UnsupportedActionException: if action not in class defined list of valid actions. + """ + if action not in self.actions: + actions_str = ", ".join(self.actions) + error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" + raise UnsupportedActionException(error_message) + mem_mb = 4 * 1024 # 4GB + return ResourceUsage( + threads=2, + time="1:00:00", # 1 hour + memory=f"{mem_mb}M", + ) + + +class SomaticVariantQCWorkflow(BaseStep): + """Perform gathering variant information""" + + name = "somaticvariantchecking" + sheet_shortcut_class = CancerCaseSheet + sheet_shortcut_kwargs = { + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) + } + + @classmethod + def default_config_yaml(cls): + """Return default config YAML, to be overwritten by project-specific one.""" + return DEFAULT_CONFIG + + def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + (SomaticVariantCallingWorkflow, NgsMappingWorkflow), + ) + # Register sub step classes so the sub steps are available + self.register_sub_step_classes((SomaticVariantQCStepPart, LinkOutStepPart)) + # Register sub workflows + self.register_sub_workflow( + "somatic_variant_calling", + self.w_config["step_config"]["somatic_variant_checking"][ + "path_somatic_variant_calling" + ], + ) + # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here + if not self.w_config["step_config"]["somatic_variant_checking"]["tools_ngs_mapping"]: + self.w_config["step_config"]["somatic_variant_checking"][ + "tools_ngs_mapping" + ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] + if not self.w_config["step_config"]["somatic_variant_checking"][ + "tools_somatic_variant_calling" + ]: + self.w_config["step_config"]["somatic_variant_checking"][ + "tools_somatic_variant_calling" + ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"] + + @listify + def get_result_files(self): + callers = set( + self.w_config["step_config"]["somatic_variant_checking"][ + "tools_somatic_variant_calling" + ] + ) + name_pattern = "{mapper}.{caller}.variantsqc.{tumor_library.name}" + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["somatic_variant_checking"]["tools_ngs_mapping"], + caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + ext=EXT_VALUES, + ) + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["somatic_variant_checking"]["tools_ngs_mapping"], + caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + ext=( + ".log", + ".log.md5", + ".conda_info.txt", + ".conda_info.txt.md5", + ".conda_list.txt", + ".conda_list.txt.md5", + ), + ) + + def _yield_result_files_matched(self, tpl, **kwargs): + """Build output paths from path template and extension list. + + This function returns the results from the matched somatic variant callers such as + Mutect. + """ + for sheet in filter(is_not_background, self.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if ( + not sample_pair.tumor_sample.dna_ngs_library + or not sample_pair.normal_sample.dna_ngs_library + ): + msg = ( + "INFO: sample pair for cancer bio sample {} has is missing primary" + "normal or primary cancer NGS library" + ) + print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) + continue + yield from expand( + tpl, + tumor_library=[sample_pair.tumor_sample.dna_ngs_library], + **kwargs, + ) + + def check_config(self): + """Check that the path to the NGS mapping is present""" + self.ensure_w_config( + ("step_config", "somatic_variant_checking", "path_somatic_variant_calling"), + "Path to variant calling not configured but required for somatic variant qc", + ) + + self.ensure_w_config( + ("step_config", "somatic_variant_checking", "target_regions"), + "Path to target_regions file (bed format)" + "not configured but required for somatic variant qc", + ) diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py index e06829437..1408cff2a 100644 --- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py @@ -26,6 +26,10 @@ name_vcf=$(basename {snakemake.input.vcf}) vcf_md5=$(md5sum {snakemake.input.vcf} | awk '{{print $1}}') +cat_cmd=zcat #in case that bed file has gzip format +gzip -t $bed_file || cat_cmd=cat +total_exom_length=$($cat_cmd $bed_file | \ + awk '{{dis+=$3-$2}} END {{print dis}}') total_exom_length=$(zcat $bed_file | \ awk '{{dis+=$3-$2}} END {{print dis}}') #TMB_rounded=`printf "%.3f" $TMB` diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/environment.yaml b/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml similarity index 100% rename from snappy_wrappers/wrappers/somatic_variants_qc/environment.yaml rename to snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py new file mode 100644 index 000000000..09b8f9bfa --- /dev/null +++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py @@ -0,0 +1,254 @@ +# import packages + +import json # for writing out the json file +from cyvcf2 import VCF # for reading vcf file +import gzip +import tabix +import argparse + +parser = argparse.ArgumentParser(description="Gathering information of variants") +parser.add_argument( + "--rawvcf", + help="raw vcf file", +) +parser.add_argument( + "--filtered-vcf", + help="filtered vcf file", +) +parser.add_argument( + "--exom-bedfile", + help="exom regions bed file", +) +parser.add_argument("--padding", help="it's padding", default=0) +parser.add_argument( + "--ignore-regions", + help="bed file contains repeated or difficult to map regions", +) +parser.add_argument( + "-m", + "--minimal", + type=int, + default=1, + help="threshold for defining a variant that has minimal support reads", +) +parser.add_argument( + "-l", + "--limited", + type=int, + default=5, + help="threshold for defining a variant that has limited support reads", +) +parser.add_argument( + "--output", + help="json file contains information about vcf file", +) +args = parser.parse_args() + + +def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0): + try: + tmp = [] + records = bed_intervals.query(v_chrom, v_start - padding, v_end + padding) + for record in records: + tmp.append(record) + if len(tmp) >= 1: + return True + else: + return False + except OSError: + print("You should index the bed file with tabix first!") + + +def get_contigs_from_bed_file(bedfile): + contigs = [] + try: + with gzip.open(bedfile, "r") as intervals: + for interval in intervals: + line = interval.decode() + tmp = line.split("\t")[0] + if tmp not in contigs: + contigs.append(tmp) + return contigs + except OSError: + print("Error reading the GZIP file.") + + +def get_variant_type(ref, alt): + if (len(alt) == 1) and (len(ref) == len(alt)): + return "snp" + elif len(alt) != len(ref): + return "indels" + + +def check_sp_read(variant, minimal, limited): + dp = variant.INFO["DP"] + if dp <= minimal: + return "minimal" + elif (dp > minimal) and (dp <= limited): + return "limited" + elif dp >= limited: + return "strong" + + +def assign_class_snvs(variant, mt_mat): + temp = str(variant.REF) + ">" + str(variant.ALT[0]) + # Following result from plot-vcfstats with bcftools + if temp in ["A>C", "T>G"]: + mt_mat[0] += 1 + elif temp in ["A>G", "T>C"]: + mt_mat[1] += 1 + elif temp in ["A>T", "T>A"]: + mt_mat[2] += 1 + elif temp in ["C>G", "G>C"]: + mt_mat[3] += 1 + elif temp in ["C>T", "G>A"]: + mt_mat[4] += 1 + elif temp in ["G>T", "C>A"]: + mt_mat[5] += 1 + return mt_mat + + +def process_vcf_file( + vcf_file, + contigs=[], + hard_contigs=[], + minimal=1, + limited=5, + bed_file="", + hard_file="", + filter_file=False, + padding=0, +): + infor = { + "total_v_count": 0, + "v_inside_exom": 0, + "v_outside_exom": 0, + "v_in_hard_regions": 0, + "n_snps": 0, + "n_indels": 0, + "minimal_rp_snvs_exom": 0, + "minimal_rp_snvs_nexom": 0, + "limited_rp_snvs_exom": 0, + "limited_rp_snvs_nexom": 0, + "strong_rp_snvs_exom": 0, + "strong_rp_snvs_nexom": 0, + "indels_length": [], + "mt_classes": [0] * 6, + "vaf": [], + } + if filter_file: + for variant in vcf_file: + infor["total_v_count"] += 1 + # Gathering information of variants in comparison to interested regions + if variant.CHROM in contigs: + if len(variant.ALT) > 1: + raise ValueError( + "Multiallelic variants are not supported" + ) # checking for multiallelic + infor["vaf"].append(float(variant.format("AF")[1][0])) + # Counting variants in or out of exom and variants with different support reads + if check_variant_in_bed( + variant.CHROM, variant.start, variant.end, bed_file, padding=0 + ): + infor["v_inside_exom"] += 1 + infor[str(check_sp_read(variant, minimal, limited)) + "_rp_snvs_exom"] += 1 + else: + infor["v_outside_exom"] += 1 + infor[str(check_sp_read(variant, minimal, limited)) + "_rp_snvs_nexom"] += 1 + + # Need to check multi allelic. Users shouldn't input multi allelic vcf file. + if get_variant_type(variant.REF, variant.ALT[0]) == "snp": + infor["n_snps"] += 1 + infor["mt_classes"] = assign_class_snvs(variant, infor["mt_classes"]) + else: + # More for indels + infor["n_indels"] += 1 + infor["indels_length"].append(abs(len(variant.REF) - len(variant.ALT[0]))) + # Gathering information of variants in comparison to hard mapped regions + + if variant.CHROM in hard_contigs: + if check_variant_in_bed( + variant.CHROM, variant.start, variant.end, hard_file, padding=0 + ): + infor["v_in_hard_regions"] += 1 + return infor + else: + total_v_count = 0 + for variant in vcf_file: + total_v_count += 1 + return total_v_count + + +def main(): + classes = ["T>G", "T>C", "T>A", "C>G", "C>T", "C>A"] + # reading input + full_vcf = VCF(args.rawvcf) + filter_vcf = VCF(args.filtered_vcf) + sample = filter_vcf.samples[1] + ###################### + bed_intervals = tabix.open(args.exom_bedfile) + contigs = get_contigs_from_bed_file(args.exom_bedfile) + full_v_count = process_vcf_file(full_vcf) + # read bed file + if args.ignore_regions: + path_hard_regions = args.ignore_regions + hard_intervals = tabix.open(path_hard_regions) + hard_contigs = get_contigs_from_bed_file(path_hard_regions) + infor = process_vcf_file( + filter_vcf, + contigs, + hard_contigs, + args.minimal, + args.limited, + bed_intervals, + hard_intervals, + filter_file=True, + padding=args.padding, + ) + else: + infor = process_vcf_file( + filter_vcf, + contigs, + [], + args.minimal, + args.limited, + bed_intervals, + "", + filter_file=True, + padding=args.padding, + ) + + summary = { + "sample": sample, + "number_full_variants": full_v_count, + "number_passed_variants": infor["total_v_count"], + "number_snvs_inside_exom": infor["v_inside_exom"], + "number_snvs_outside_exom": infor["v_outside_exom"], + "number_snvs_in_hard_regions": infor["v_in_hard_regions"], + "number_snps": infor["n_snps"], + "number_indels": infor["n_indels"], + "minimal_rp_snvs_exom": infor["minimal_rp_snvs_exom"], + "minimal_rp_snvs_nexom": infor["minimal_rp_snvs_nexom"], + "limited_rp_snvs_exom": infor["limited_rp_snvs_exom"], + "limited_rp_snvs_nexom": infor["limited_rp_snvs_nexom"], + "strong_rp_snvs_exom": infor["strong_rp_snvs_exom"], + "strong_rp_snvs_nexom": infor["strong_rp_snvs_nexom"], + classes[0]: infor["mt_classes"][0], + classes[1]: infor["mt_classes"][1], + classes[2]: infor["mt_classes"][2], + classes[3]: infor["mt_classes"][3], + classes[4]: infor["mt_classes"][4], + classes[5]: infor["mt_classes"][5], + "indels_length": infor["indels_length"], + "VAF": infor["vaf"], + } + if not args.ignore_regions: + del summary["number_snvs_in_hard_regions"] + json_object = json.dumps(summary) + + with open(args.output, "w") as outfile: + outfile.write(json_object) + + +if __name__ == "__main__": + main() diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py similarity index 76% rename from snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py rename to snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py index 81dd9fa87..c7b0239cc 100644 --- a/snappy_wrappers/wrappers/somatic_variants_qc/wrapper.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py @@ -23,8 +23,11 @@ python {base_dir}/summarize-vcf.py --rawvcf {snakemake.input.full_vcf} \ --filtered-vcf {snakemake.input.passed_vcf} \ - --exom-bedfile {snakemake.config[step_config][somatic_variant_qc][target_regions]} \ - --padding {snakemake.config[step_config][somatic_variant_qc][padding]} \ + --exom-bedfile {snakemake.config[step_config][somatic_variant_checking][target_regions]} \ + --padding {snakemake.config[step_config][somatic_variant_checking][padding]} \ + --minimal {snakemake.config[step_config][somatic_variant_checking][minimal_support_read]} \ + --limited {snakemake.config[step_config][somatic_variant_checking][limited_support_read]} \ + --ignore-regions {snakemake.config[step_config][somatic_variant_checking][ignore_regions]} \ --output {snakemake.output.json} pushd $(dirname {snakemake.output.json}) diff --git a/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py deleted file mode 100644 index 19a3e9087..000000000 --- a/snappy_wrappers/wrappers/somatic_variants_qc/summarize-vcf.py +++ /dev/null @@ -1,269 +0,0 @@ -# import packages - -import json # for writing out the json file -from cyvcf2 import VCF # for reading vcf file -import tabix -from optparse import OptionParser - - -parser = OptionParser() -parser.add_option("--rawvcf", help="raw vcf file") -parser.add_option("--filtered-vcf", help="filtered vcf file") -parser.add_option( - "--exom-bedfile", - help="exom regions bed file", -) -parser.add_option("--padding", help="it's padding", default=0) -parser.add_option( - "--repeated-region-bedfile", - help="bed file contains repeated or difficult to map regions", -) -parser.add_option( - "--output", - help="json file contains information about vcf file", -) -(options, args) = parser.parse_args() - -# from pybedtools import Interval,BedTool - -# I will use for loop here -# It could be better by using multi threading. -# TODO: -# - Number of variants called (from the `*.full.vcf.gz`file) [X] Get all variants from full file -# - Number of variants that pass all filters [X] Get all variants from fitlered file -# - Number of variants passing the filters that are inside & outside of the exome bed file (with padding) [X] -# - Number of variants in repeated or other difficult to map regions [ ] Need a bed file containing repeated -# - Number of SNVs & indels [X] -# - Indel lengths statistics [X] -# - Number of variants with more than 1 ALT allele (I _think_ that this value should be 0 for all variants passing the filters) [ ] -# - Number of SNVs with minimal support (only one read supporting the variant), possibly separating inside & outside exome bed [X] -# - Number of SNVs with limited support (5 reads or less supporting the variant), possibly separating inside & outside exome bed [X] -# - Number of SNVs with strong support (at least 10 reads, VAF greater or equal to 10%) [X] VAF need to be joined -# - Number of SNVs in each mutation class (C>A, C>G, C>T, T>A, T>C, T>G), for minimal, limited, average & strong support [X] not for minimal,limited,average yet -# - Metrics on strand bias could also be collected (F1R2 & F2R1 vcf FORMAT) [ ] PAUSE -# - the VAF distribution. Perhaps we can split the variants (both SNVs & indels) into those with very little support (<1%), subclonal (<10%), the main part (<40%), affected by CNV (>40%)[ ] - - -# Adding A>C G>T grouping -# Table for VAF -# Genotype check > report different genotypes -# Checking number multi allelic -# Support alternative allel in the normal -# #alt in normal + average ALT + max ALT -# snappy pipeline for this - -##### SNAPPY PIPELINE -## WRAPPER -# - Checking the configuration file -# - Checking input file -## PIPELINE -# - __init__ : - -CONTIG = ["chr" + str(i) for i in range(1, 22)] -CONTIG.append("chrX") -CONTIG.append("chrY") - - -def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0): - # print(f'variant chrom: {v_chrom} - start: {v_start} - end:{v_end}') - record = bed_intervals.query(v_chrom, v_start - padding, v_end + padding) - if record: - return True - else: - return False - - -# def calculate_exoms_length(bed_intervals): -# contigs= ["chr" + str(i) for i in range(1, 22)] -# contigs.append("chrX") -# contigs.append("chrY") -# length = 0 -# for contig in contigs: -# records = bed_intervals.query(contig,0,250000000) -# for record in records: -# length = length + int(record[2]) - int(record[1]) -# return length - - -def get_variant_type(ref, alt): - if (len(alt) == 1) and (len(ref) == len(alt)): - return "snp" - elif len(alt) != len(ref): - return "indels" - - -def check_sp_read(variant): - if get_variant_type(variant.REF, variant.ALT[0]) == "snp": - dp = variant.INFO["DP"] - if dp == 1: - return "minimal" - elif (dp > 1) and (dp <= 5): - return "limited" - if dp >= 10: - return "strong" - - -def assign_class_snvs(variant, mt_mat): - temp = str(variant.REF) + ">" + str(variant.ALT[0]) - # Following result from plot-vcfstats with bcftools - match temp: - case "A>C" | "C>A": - mt_mat[0] += 1 - case "A>G" | "G>A": - mt_mat[1] += 1 - case "A>T" | "T>A": - mt_mat[2] += 1 - case "C>G" | "G>C": - mt_mat[3] += 1 - case "C>T" | "T>C": - mt_mat[4] += 1 - case "G>T" | "T>G": - mt_mat[5] += 1 - return mt_mat - - -def process_vcf_file(vcf_file, bed_file="", filter_file=False, padding=0): - total_v_count = 0 - if filter_file: - # VAF - vaf = [] - # numb of variants in and out of exoms - v_inside_exom = 0 - v_outside_exom = 0 - # numb of snps and indels - n_snps = 0 - n_indels = 0 - indels_length = [] - # support read informations - minimal_rp_snvs_exom = 0 - minimal_rp_snvs_nexom = 0 - limited_rp_snvs_exom = 0 - limited_rp_snvs_nexom = 0 - strong_rp_snvs_exom = 0 - strong_rp_snvs_nexom = 0 - # mutation classes - mt_classes = [0] * 6 - for variant in vcf_file: - if variant.CHROM in CONTIG: - vaf.append(float(variant.format("AF")[1][0])) - total_v_count += 1 - # Counting variants in or out of exom and variants with different support reads - if "chrUn" in variant.CHROM: - v_outside_exom += 1 - if check_sp_read(variant) == "minimal": - minimal_rp_snvs_nexom += 1 - elif check_sp_read(variant) == "limited": - limited_rp_snvs_nexom += 1 - else: - strong_rp_snvs_nexom += 1 - elif check_variant_in_bed( - variant.CHROM, variant.start, variant.end, bed_file, padding=0 - ): - v_inside_exom += 1 - if check_sp_read(variant) == "minimal": - minimal_rp_snvs_exom += 1 - elif check_sp_read(variant) == "limited": - limited_rp_snvs_exom += 1 - else: - strong_rp_snvs_exom += 1 - - else: - v_outside_exom += 1 - if check_sp_read(variant) == "minimal": - minimal_rp_snvs_nexom += 1 - elif check_sp_read(variant) == "limited": - limited_rp_snvs_nexom += 1 - else: - strong_rp_snvs_nexom += 1 - # get number of snvs and indels - # Need to check multi allelic. Users shouldn't input multi allelic vcf file. - if get_variant_type(variant.REF, variant.ALT[0]) == "snp": - n_snps += 1 - mt_classes = assign_class_snvs(variant, mt_classes) - else: - # More for indels - n_indels += 1 - indels_length.append(abs(len(variant.REF) - len(variant.ALT[0]))) - - return ( - total_v_count, - v_inside_exom, - v_outside_exom, - n_snps, - n_indels, - indels_length, - minimal_rp_snvs_exom, - minimal_rp_snvs_nexom, - limited_rp_snvs_exom, - limited_rp_snvs_nexom, - strong_rp_snvs_exom, - strong_rp_snvs_nexom, - mt_classes, - vaf, - ) - else: - for variant in vcf_file: - total_v_count += 1 - return total_v_count - - -def main(): - classes = ["A>C", "A>G", "A>T", "C>G", "C>T", "G>T"] - path_full_vcf = options.rawvcf - path_filtered_vcf = options.filtered_vcf - path_bedfile = options.exom_bedfile - outpath = options.output - full_vcf = VCF(path_full_vcf) - filter_vcf = VCF(path_filtered_vcf) - # header = filter_vcf.raw_header - sample = filter_vcf.samples[1] - # read bed file - bed_intervals = tabix.open(path_bedfile) - full_v_count = process_vcf_file(full_vcf) - ( - passed_v_count, - v_inside_exom, - v_outside_exom, - n_snps, - n_indels, - indels_length, - minimal_rp_snvs_exom, - minimal_rp_snvs_nexom, - limited_rp_snvs_exom, - limited_rp_snvs_nexom, - strong_rp_snvs_exom, - strong_rp_snvs_nexom, - mt_classes, - vaf, - ) = process_vcf_file(filter_vcf, bed_intervals, filter_file=True, padding=options.padding) - # After one for loop through vcf file. The VCF will automatically be closed. - # length_exoms = calculate_exoms_length(bed_intervals) - # for variant in filter_vcf: - # print(variant.format("AF")) - summary = { - "sample": sample, - "number_full_variants": full_v_count, - "number_passed_variants": passed_v_count, - "number_snvs_inside_exom": v_inside_exom, - "number_snvs_outside_exom": v_outside_exom, - "number_snps": n_snps, - "number_indels": n_indels, - "indels_length": indels_length, - "minimal_rp_snvs_exom": minimal_rp_snvs_exom, - "minimal_rp_snvs_nexom": minimal_rp_snvs_nexom, - "limited_rp_snvs_exom": limited_rp_snvs_exom, - "limited_rp_snvs_nexom": limited_rp_snvs_nexom, - "strong_rp_snvs_exom": strong_rp_snvs_exom, - "strong_rp_snvs_nexom": strong_rp_snvs_nexom, - "mutation_classes": mt_classes, - "classes": classes, - "VAF": vaf, - } - json_object = json.dumps(summary) - - with open(outpath, "w") as outfile: - outfile.write(json_object) - - -if __name__ == "__main__": - main() From de363703351bcf851edcb7839a4d4634e73cbf85 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Mon, 3 Jul 2023 20:48:04 +0200 Subject: [PATCH 04/17] adding test for somatic_variant_checking --- .../summarize-vcf.py | 3 +- ...test_workflows_somatic_variant_checking.py | 187 ++++++++++++++++++ 2 files changed, 188 insertions(+), 2 deletions(-) create mode 100644 tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py index 09b8f9bfa..051c41921 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py @@ -1,7 +1,6 @@ # import packages - -import json # for writing out the json file from cyvcf2 import VCF # for reading vcf file +import json # for writing out the json file import gzip import tabix import argparse diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py new file mode 100644 index 000000000..378a2fdfe --- /dev/null +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- +"""Tests for the somatic_variant_calling workflow module code""" + +import textwrap + +import pytest +import ruamel.yaml as ruamel_yaml + +from snappy_pipeline.workflows.somatic_variant_checking import ( + SomaticVariantQCWorkflow, +) + +from .common import get_expected_log_files_dict, get_expected_output_json_files_dict +from .conftest import patch_module_fs + + +@pytest.fixture(scope="module") # otherwise: performance issues +def minimal_config(): + """Return YAML parsing result for configuration""" + yaml = ruamel_yaml.YAML() + return yaml.load( + textwrap.dedent( + r""" + static_data_config: + reference: + path: /path/to/ref.fa + cosmic: + path: /path/to/cosmic.vcf.gz + dbsnp: + path: /path/to/dbsnp.vcf.gz + + step_config: + ngs_mapping: + tools: + dna: ['bwa'] + compute_coverage_bed: true + path_target_regions: /path/to/regions.bed + bwa: + path_index: /path/to/bwa/index.fa + + somatic_variant_calling: + tools: + - mutect2 + - scalpel + scalpel: + path_target_regions: /path/to/target/regions.bed + + somatic_variant_checking: + path_somatic_variant_calling: ../somatic_variant_calling + tools_ngs_mapping: [] + tools_somatic_variant_calling: [] + target_regions: /path/to/regions.bed + + data_sets: + first_batch: + file: sheet.tsv + search_patterns: + - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} + search_paths: ['/path'] + type: matched_cancer + naming_scheme: only_secondary_id + """ + ).lstrip() + ) + + +@pytest.fixture +def somatic_variant_checking_workflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + work_dir, + config_paths, + cancer_sheet_fake_fs, + mocker, +): + """Return SomaticVariantCheckingWorkflow object pre-configured with cancer sheet""" + # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we + # can obtain paths from the function as if we really had a NGSMappingPipelineStep there + dummy_workflow.globals = { + "ngs_mapping": lambda x: "NGS_MAPPING/" + x, + "somatic_variant_calling": lambda x: "SOMATIC_VARIANT_CALLING/" + x, + } + # Construct the workflow object + return SomaticVariantQCWorkflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + config_paths, + work_dir, + ) + +# Tests for SomaticVariantCheckingStepPart ----------------------------------------------------- + + +def test_somatic_variant_checking_step_part_get_input_files(somatic_variant_checking_workflow): + """Test SomaticVariantCheckingStepPart.get_input_files()""" + base_out = ( + "SOMATIC_VARIANT_CALLING/output/{mapper}.{var_caller}.{tumor_library}/out/" + "{mapper}.{var_caller}.{tumor_library}" + ) + expected = { + "full_vcf": base_out + ".full.vcf.gz", + "full_vcf_tbi": base_out + ".full.vcf.gz.tbi", + "passed_vcf": base_out + ".vcf.gz", + "passed_vcf_tbi": base_out + ".vcf.gz.tbi", + } + actual = somatic_variant_checking_workflow.get_input_files("SVQC_step", "run") + assert actual == expected + + +def test_somatic_variant_checking_step_part_get_output_files(somatic_variant_checking_workflow): + """Tests SomaticVariantCheckingStepPart.get_output_files()""" + base_out = ( + "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/out/" + "{mapper}.{var_caller}.variantsqc.{tumor_library}" + ) + expected = get_expected_output_json_files_dict(base_out=base_out) + actual = somatic_variant_checking_workflow.get_output_files("SVQC_step", "run") + assert actual == expected + + +def test_somatic_variant_checking_step_part_get_log_files(somatic_variant_checking_workflow): + """Tests SomaticVariantCheckingStepPart.get_log_files()""" + base_out = ( + "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/" + "{mapper}.{var_caller}.variantsqc.{tumor_library}" + ) + expected = get_expected_log_files_dict(base_out=base_out) + actual = somatic_variant_checking_workflow.get_log_file("SVQC_step", "run") + assert actual == expected + + +def test_somatic_variant_checking_step_part_get_resource_usage( + somatic_variant_checking_workflow, +): + """Tests SomaticVariantCheckingStepPart.get_resource_usage()""" + # Define expected + expected_dict = {"threads": 2, "time": "1:00:00", "memory": "4096M"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_variant_checking_workflow.get_resource("SVQC_step", "run", resource) + assert actual == expected, msg_error + + +# Tests for SomaticVariantCheckingWorkflow ------------------------------------------------------- + + +def test_somatic_variant_checking_workflow(somatic_variant_checking_workflow): + """Test simple functionality of the workflow""" + # Check created sub steps + expected = ["SVQC_step","link_out"] + actual = list(sorted(somatic_variant_checking_workflow.sub_steps.keys())) + assert actual == expected + + # Check result file construction + tpl = ( + "output/{mapper}.{var_caller}.variantsqc.P00{i}-T{t}-DNA1-WGS1/{dir_}/" + "{mapper}.{var_caller}.variantsqc.P00{i}-T{t}-DNA1-WGS1.{ext}" + ) + expected = [ + tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext, dir_="out") + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ("json", "json.md5") + for mapper in ("bwa",) + for var_caller in ("mutect2", "scalpel") + ] + expected += [ + tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext, dir_="log") + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ( + "conda_info.txt", + "conda_list.txt", + "log", + "conda_info.txt.md5", + "conda_list.txt.md5", + "log.md5", + ) + for mapper in ("bwa",) + for var_caller in ("mutect2", "scalpel") + ] + expected = list(sorted(expected)) + actual = list(sorted(somatic_variant_checking_workflow.get_result_files())) + assert expected == actual From 09d808877a18b8a8c566cc1708aed10cc8611aaf Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Mon, 3 Jul 2023 20:56:24 +0200 Subject: [PATCH 05/17] format checking --- .../test_workflows_somatic_variant_checking.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py index 378a2fdfe..a1c539bd6 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py @@ -46,9 +46,9 @@ def minimal_config(): path_target_regions: /path/to/target/regions.bed somatic_variant_checking: - path_somatic_variant_calling: ../somatic_variant_calling - tools_ngs_mapping: [] - tools_somatic_variant_calling: [] + path_somatic_variant_calling: ../somatic_variant_calling + tools_ngs_mapping: [] + tools_somatic_variant_calling: [] target_regions: /path/to/regions.bed data_sets: @@ -92,6 +92,7 @@ def somatic_variant_checking_workflow( work_dir, ) + # Tests for SomaticVariantCheckingStepPart ----------------------------------------------------- @@ -131,7 +132,7 @@ def test_somatic_variant_checking_step_part_get_log_files(somatic_variant_checki expected = get_expected_log_files_dict(base_out=base_out) actual = somatic_variant_checking_workflow.get_log_file("SVQC_step", "run") assert actual == expected - + def test_somatic_variant_checking_step_part_get_resource_usage( somatic_variant_checking_workflow, @@ -152,7 +153,7 @@ def test_somatic_variant_checking_step_part_get_resource_usage( def test_somatic_variant_checking_workflow(somatic_variant_checking_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["SVQC_step","link_out"] + expected = ["SVQC_step", "link_out"] actual = list(sorted(somatic_variant_checking_workflow.sub_steps.keys())) assert actual == expected From 73416b04584ec2cdd15f20287742ff7ed5ef7a3d Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Mon, 3 Jul 2023 21:02:16 +0200 Subject: [PATCH 06/17] fixing format --- .../wrappers/somatic_variants_checking/summarize-vcf.py | 7 ++++--- .../wrappers/somatic_variants_checking/wrapper.py | 1 + .../workflows/test_workflows_somatic_variant_checking.py | 4 +--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py index 051c41921..38d634d1a 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py @@ -1,9 +1,10 @@ # import packages -from cyvcf2 import VCF # for reading vcf file -import json # for writing out the json file +import argparse import gzip +import json # for writing out the json file + +from cyvcf2 import VCF # for reading vcf file import tabix -import argparse parser = argparse.ArgumentParser(description="Gathering information of variants") parser.add_argument( diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py index c7b0239cc..13295af50 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py @@ -3,6 +3,7 @@ somatic variant calling step """ import os + from snakemake.shell import shell __author__ = "Pham Gia Cuong" diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py index a1c539bd6..2882fbd2b 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py @@ -6,9 +6,7 @@ import pytest import ruamel.yaml as ruamel_yaml -from snappy_pipeline.workflows.somatic_variant_checking import ( - SomaticVariantQCWorkflow, -) +from snappy_pipeline.workflows.somatic_variant_checking import SomaticVariantQCWorkflow from .common import get_expected_log_files_dict, get_expected_output_json_files_dict from .conftest import patch_module_fs From c30f22e4affa1c4f34ea8d7c76cbcf38d654615c Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Sat, 8 Jul 2023 23:37:10 +0200 Subject: [PATCH 07/17] Finalizing somatic variant checking --- .../somatic_variant_checking/__init__.py | 14 +- .../workflows/somatic_variant_qc/Snakefile | 64 ----- .../workflows/somatic_variant_qc/__init__.py | 223 ------------------ .../tumor_mutational_burden/__init__.py | 6 +- .../wrappers/bcftools/TMB/wrapper.py | 24 +- .../environment.yaml | 4 +- .../summarize-vcf.py | 125 +++++----- .../somatic_variants_checking/wrapper.py | 1 + 8 files changed, 90 insertions(+), 371 deletions(-) delete mode 100644 snappy_pipeline/workflows/somatic_variant_qc/Snakefile delete mode 100644 snappy_pipeline/workflows/somatic_variant_qc/__init__.py diff --git a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py index 5c1367333..88254465f 100644 --- a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py @@ -27,10 +27,11 @@ tools_ngs_mapping: [] # default to those configured for ngs_mapping tools_somatic_variant_calling: [] # default to those configured for somatic_variant_calling target_regions: # REQUIRED - padding: 0 #Used for count the number of variants outside of exom + padding - ignore_regions: [] - minimal_support_read: 1 - limited_support_read: 5 + padding: 0 # Used for count the number of variants outside of exom + padding + AF_ID: 'AF' # REQUIRED ID of allele frequency field used in vcf file + ignore_regions: "" # hard mapping regions + minimal_support_read: 1 # threshold for defining a variant that has minimal support reads + limited_support_read: 5 # threshold for defining a variant that has limited support reads """ @@ -189,10 +190,7 @@ def _yield_result_files_matched(self, tpl, **kwargs): """ for sheet in filter(is_not_background, self.shortcut_sheets): for sample_pair in sheet.all_sample_pairs: - if ( - not sample_pair.tumor_sample.dna_ngs_library - or not sample_pair.normal_sample.dna_ngs_library - ): + if not sample_pair.tumor_sample.dna_ngs_library: msg = ( "INFO: sample pair for cancer bio sample {} has is missing primary" "normal or primary cancer NGS library" diff --git a/snappy_pipeline/workflows/somatic_variant_qc/Snakefile b/snappy_pipeline/workflows/somatic_variant_qc/Snakefile deleted file mode 100644 index 2ac7ab432..000000000 --- a/snappy_pipeline/workflows/somatic_variant_qc/Snakefile +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- -"""CUBI Pipeline tumor mutational burden step Snakefile""" - -import os - -from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.somatic_variant_qc import ( - SomaticVariantQCWorkflow, -) - -__author__ = "Gia Cuong Pham" -__email__ = "pham.gia-cuong@bih-charite.de" - - -# Configuration =============================================================== - - -configfile: "config.yaml" - - -# Expand "$ref" JSON pointers in configuration (also works for YAML) -config, lookup_paths, config_paths = expand_ref("config.yaml", config) - -# WorkflowImpl Object Setup =================================================== -wf = SomaticVariantQCWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) - - -localrules: - # Linking files from work/ to output/ should be done locally - somatic_variant_qc_link_out_run, - - -rule all: - input: - wf.get_result_files(), - - -# Generic linking out --------------------------------------------------------- - - -rule somatic_variant_qc_link_out_run: - input: - wf.get_input_files("link_out", "run"), - output: - wf.get_output_files("link_out", "run"), - run: - shell(wf.get_shell_cmd("link_out", "run", wildcards)) - - -rule somatic_variant_qc: - input: - **wf.get_input_files("SVQC_step", "run"), - output: - **wf.get_output_files("SVQC_step", "run"), - threads: wf.get_resource("SVQC_step", "run", "threads") - resources: - time=wf.get_resource("SVQC_step", "run", "time"), - memory=wf.get_resource("SVQC_step", "run", "memory"), - partition=wf.get_resource("SVQC_step", "run", "partition"), - tmpdir=wf.get_resource("SVQC_step", "run", "tmpdir"), - log: - **wf.get_log_file("SVQC_step", "run"), - wrapper: - wf.wrapper_path("somatic_variants_qc") diff --git a/snappy_pipeline/workflows/somatic_variant_qc/__init__.py b/snappy_pipeline/workflows/somatic_variant_qc/__init__.py deleted file mode 100644 index 5ce9c4349..000000000 --- a/snappy_pipeline/workflows/somatic_variant_qc/__init__.py +++ /dev/null @@ -1,223 +0,0 @@ -from collections import OrderedDict -import os -import sys - -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background -from snakemake.io import expand - -from snappy_pipeline.base import UnsupportedActionException -from snappy_pipeline.utils import dictify, listify -from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart -from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage -from snappy_pipeline.workflows.somatic_variant_calling import ( - SOMATIC_VARIANT_CALLERS_MATCHED, - SomaticVariantCallingWorkflow, -) - -#: Extensions of files to create as main payload -EXT_VALUES = (".json", ".json.md5") - -#: Names of the files to create for the extension -EXT_NAMES = ("json", "json_md5") - -#: Default configuration for the tmb calculation step -DEFAULT_CONFIG = r""" -step_config: - somatic_variant_qc: - path_somatic_variant_calling: ../somatic_variant_calling # REQUIRED - tools_ngs_mapping: [] # default to those configured for ngs_mapping - tools_somatic_variant_calling: [] # default to those configured for somatic_variant_calling - target_regions: # REQUIRED - padding: 0 #Used for count the number of variants outside of exom + padding - repeated_regions: [] #need to confirm -""" - - -class SomaticVariantQCStepPart(BaseStepPart): - """""" - - name = "SVQC_step" - - actions = ("run",) - - def __init__(self, parent): - super().__init__(parent) - self.tumor_ngs_library_to_sample_pair = OrderedDict() - for sheet in self.parent.shortcut_sheets: - # update function of OrderedDict - self.tumor_ngs_library_to_sample_pair.update( - sheet.all_sample_pairs_by_tumor_dna_ngs_library - ) - # Build mapping from donor name to donor. - self.donors = OrderedDict() - for sheet in self.parent.shortcut_sheets: - for donor in sheet.donors: - self.donors[donor.name] = donor - - @dictify - def get_input_files(self, action): - self._validate_action(action) - tpl = ( - "output/{mapper}.{var_caller}.{tumor_library}/out/" - "{mapper}.{var_caller}.{tumor_library}" - ) - key_ext = { - "full_vcf": ".full.vcf.gz", - "full_vcf_tbi": ".full.vcf.gz.tbi", - "passed_vcf": ".vcf.gz", - "passed_vcf_tbi": ".vcf.gz.tbi", - } - variant_calling = self.parent.sub_workflows["somatic_variant_calling"] - for key, ext in key_ext.items(): - yield key, variant_calling(tpl + ext) - - @dictify - def get_output_files(self, action): - # Validate action - self._validate_action(action) - prefix = ( - "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/out/" - "{mapper}.{var_caller}.variantsqc.{tumor_library}" - ) - key_ext = {"json": ".json"} - for key, ext in key_ext.items(): - yield key, prefix + ext - yield key + "_md5", prefix + ext + ".md5" - - @dictify - def _get_log_file(self, action): - assert action == "run" - prefix = ( - "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/" - "{mapper}.{var_caller}.variantsqc.{tumor_library}" - ) - - key_ext = ( - ("log", ".log"), - ("conda_info", ".conda_info.txt"), - ("conda_list", ".conda_list.txt"), - ) - for key, ext in key_ext: - yield key, prefix + ext - - def get_resource_usage(self, action): - """Get Resource Usage - :param action: Action (i.e., step) in the workflow, example: 'run'. - :type action: str - :return: Returns ResourceUsage for step. - :raises UnsupportedActionException: if action not in class defined list of valid actions. - """ - if action not in self.actions: - actions_str = ", ".join(self.actions) - error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" - raise UnsupportedActionException(error_message) - mem_mb = 4 * 1024 # 4GB - return ResourceUsage( - threads=4, - time="1:00:00", # 1 hour - memory=f"{mem_mb}M", - ) - - -class SomaticVariantQCWorkflow(BaseStep): - """Perform gathering variant information""" - - name = "somaticvariantqc" - sheet_shortcut_class = CancerCaseSheet - sheet_shortcut_kwargs = { - "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) - } - - @classmethod - def default_config_yaml(cls): - """Return default config YAML, to be overwritten by project-specific one.""" - return DEFAULT_CONFIG - - def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): - super().__init__( - workflow, - config, - config_lookup_paths, - config_paths, - workdir, - (SomaticVariantCallingWorkflow, NgsMappingWorkflow), - ) - # Register sub step classes so the sub steps are available - self.register_sub_step_classes((SomaticVariantQCStepPart, LinkOutStepPart)) - # Register sub workflows - self.register_sub_workflow( - "somatic_variant_calling", - self.w_config["step_config"]["somatic_variant_qc"]["path_somatic_variant_calling"], - ) - # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"]: - self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"] = self.w_config[ - "step_config" - ]["ngs_mapping"]["tools"]["dna"] - if not self.w_config["step_config"]["somatic_variant_qc"]["tools_somatic_variant_calling"]: - self.w_config["step_config"]["somatic_variant_qc"][ - "tools_somatic_variant_calling" - ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"] - - @listify - def get_result_files(self): - callers = set( - self.w_config["step_config"]["somatic_variant_qc"]["tools_somatic_variant_calling"] - ) - name_pattern = "{mapper}.{caller}.variantsqc.{tumor_library.name}" - yield from self._yield_result_files_matched( - os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"], - caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), - ext=EXT_VALUES, - ) - yield from self._yield_result_files_matched( - os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["somatic_variant_qc"]["tools_ngs_mapping"], - caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), - ext=( - ".log", - ".log.md5", - ".conda_info.txt", - ".conda_info.txt.md5", - ".conda_list.txt", - ".conda_list.txt.md5", - ), - ) - - def _yield_result_files_matched(self, tpl, **kwargs): - """Build output paths from path template and extension list. - - This function returns the results from the matched somatic variant callers such as - Mutect. - """ - for sheet in filter(is_not_background, self.shortcut_sheets): - for sample_pair in sheet.all_sample_pairs: - if ( - not sample_pair.tumor_sample.dna_ngs_library - or not sample_pair.normal_sample.dna_ngs_library - ): - msg = ( - "INFO: sample pair for cancer bio sample {} has is missing primary" - "normal or primary cancer NGS library" - ) - print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) - continue - yield from expand( - tpl, - tumor_library=[sample_pair.tumor_sample.dna_ngs_library], - **kwargs, - ) - - def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "somatic_variant_qc", "path_somatic_variant_calling"), - "Path to variant calling not configured but required for somatic variant qc", - ) - - self.ensure_w_config( - ("step_config", "somatic_variant_qc", "target_regions"), - "Path to target_regions file (bed format)" - "not configured but required for somatic variant qc", - ) diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py index c7176d8fa..5b0d893c8 100644 --- a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py +++ b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py @@ -7,8 +7,8 @@ from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify -from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart -from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage +from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart,ResourceUsage +from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from snappy_pipeline.workflows.somatic_variant_calling import ( SOMATIC_VARIANT_CALLERS_MATCHED, SomaticVariantCallingWorkflow, @@ -134,7 +134,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (SomaticVariantCallingWorkflow, NgsMappingWorkflow), + (SomaticVariantCallingWorkflow, NgsMappingWorkflow), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((TumorMutationalBurdenCalculationStepPart, LinkOutStepPart)) diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py index 1408cff2a..70213e6a7 100644 --- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py @@ -39,18 +39,18 @@ TMB=`echo "1000000*($number_variants/$total_exom_length)" | bc -l ` cat << EOF > {snakemake.output.json} -{ -"Library_name": {snakemake.wildcards.tumor_library}, -"VCF_file": $name_vcf, -"VCF_md5": $vcf_md5, -"BED_file": $bed_file_name, -"BED_md5": $bed_md5, -"TMB": $TMB, -"Number_variants": $number_variants, -"Number_snvs": $number_snvs, -"Number_indels": $number_indels, -"Total_regions_length": $total_exom_length -} +{{ + "Library_name": "{snakemake.wildcards.tumor_library}", + "VCF_file": "$name_vcf", + "VCF_md5": "$vcf_md5", + "BED_file": "$bed_file_name", + "BED_md5": "$bed_md5", + "TMB": "$TMB", + "Number_variants": "$number_variants", + "Number_snvs": "$number_snvs", + "Number_indels": "$number_indels", + "Total_regions_length": "$total_exom_length" +}} EOF pushd $(dirname {snakemake.output.json}) md5sum $(basename {snakemake.output.json}) > $(basename {snakemake.output.json_md5}) diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml b/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml index beb5b1f3f..80b4fa59b 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml +++ b/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml @@ -3,6 +3,4 @@ channels: - bioconda dependencies: - cyvcf2 - - pip=20.0.2 - - pip: - - pytabix + - pytabix diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py index 38d634d1a..cc150a2d8 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py @@ -6,49 +6,11 @@ from cyvcf2 import VCF # for reading vcf file import tabix -parser = argparse.ArgumentParser(description="Gathering information of variants") -parser.add_argument( - "--rawvcf", - help="raw vcf file", -) -parser.add_argument( - "--filtered-vcf", - help="filtered vcf file", -) -parser.add_argument( - "--exom-bedfile", - help="exom regions bed file", -) -parser.add_argument("--padding", help="it's padding", default=0) -parser.add_argument( - "--ignore-regions", - help="bed file contains repeated or difficult to map regions", -) -parser.add_argument( - "-m", - "--minimal", - type=int, - default=1, - help="threshold for defining a variant that has minimal support reads", -) -parser.add_argument( - "-l", - "--limited", - type=int, - default=5, - help="threshold for defining a variant that has limited support reads", -) -parser.add_argument( - "--output", - help="json file contains information about vcf file", -) -args = parser.parse_args() - def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0): try: tmp = [] - records = bed_intervals.query(v_chrom, v_start - padding, v_end + padding) + records = bed_intervals.query(v_chrom, v_start - int(padding), v_end + int(padding)) for record in records: tmp.append(record) if len(tmp) >= 1: @@ -81,7 +43,7 @@ def get_variant_type(ref, alt): def check_sp_read(variant, minimal, limited): - dp = variant.INFO["DP"] + dp = variant.format("AD")[1][0] if dp <= minimal: return "minimal" elif (dp > minimal) and (dp <= limited): @@ -118,6 +80,7 @@ def process_vcf_file( hard_file="", filter_file=False, padding=0, + id_af="AF", ): infor = { "total_v_count": 0, @@ -145,10 +108,10 @@ def process_vcf_file( raise ValueError( "Multiallelic variants are not supported" ) # checking for multiallelic - infor["vaf"].append(float(variant.format("AF")[1][0])) + infor["vaf"].append(float(variant.format(id_af)[1][0])) # Counting variants in or out of exom and variants with different support reads if check_variant_in_bed( - variant.CHROM, variant.start, variant.end, bed_file, padding=0 + variant.CHROM, variant.start, variant.end, bed_file, padding ): infor["v_inside_exom"] += 1 infor[str(check_sp_read(variant, minimal, limited)) + "_rp_snvs_exom"] += 1 @@ -168,7 +131,7 @@ def process_vcf_file( if variant.CHROM in hard_contigs: if check_variant_in_bed( - variant.CHROM, variant.start, variant.end, hard_file, padding=0 + variant.CHROM, variant.start, variant.end, hard_file, padding ): infor["v_in_hard_regions"] += 1 return infor @@ -179,6 +142,47 @@ def process_vcf_file( return total_v_count +parser = argparse.ArgumentParser(description="Gathering information of variants") +parser.add_argument( + "--rawvcf", + help="vcf file containing all called somatic variants, before filtration", + required=True, +) +parser.add_argument( + "--filtered-vcf", help="vcf file containing somatic variants passing all filters", required=True +) +parser.add_argument("--exom-bedfile", help="exom regions bed file", required=True) +parser.add_argument("--padding", type=int, help="Size of padding around exom regions", default=0) +parser.add_argument( + "--ignore-regions", + help="bed file contains repeated or difficult to map regions", +) +parser.add_argument( + "--AF", + default="AF", + help="ID of allele frequency in the vcf file", +) +parser.add_argument( + "-m", + "--minimal", + type=int, + default=1, + help="threshold for defining a variant that has minimal support reads", +) +parser.add_argument( + "-l", + "--limited", + type=int, + default=5, + help="threshold for defining a variant that has limited support reads", +) +parser.add_argument( + "--output", + help="json file contains information about vcf file, if not given, the tool will print it out", +) +args = parser.parse_args() + + def main(): classes = ["T>G", "T>C", "T>A", "C>G", "C>T", "C>A"] # reading input @@ -204,6 +208,7 @@ def main(): hard_intervals, filter_file=True, padding=args.padding, + id_af=args.AF, ) else: infor = process_vcf_file( @@ -216,23 +221,24 @@ def main(): "", filter_file=True, padding=args.padding, + id_af=args.AF, ) summary = { "sample": sample, - "number_full_variants": full_v_count, - "number_passed_variants": infor["total_v_count"], - "number_snvs_inside_exom": infor["v_inside_exom"], - "number_snvs_outside_exom": infor["v_outside_exom"], - "number_snvs_in_hard_regions": infor["v_in_hard_regions"], - "number_snps": infor["n_snps"], + "total_variants_number": full_v_count, + "number_variants_passing_filters": infor["total_v_count"], + "number_variants_in_enrichment_regions": infor["v_inside_exom"], + "number_variants_outside_enrichment_regions": infor["v_outside_exom"], + "number_variants_in_masked_regions": infor["v_in_hard_regions"], + "number_snvs": infor["n_snps"], "number_indels": infor["n_indels"], - "minimal_rp_snvs_exom": infor["minimal_rp_snvs_exom"], - "minimal_rp_snvs_nexom": infor["minimal_rp_snvs_nexom"], - "limited_rp_snvs_exom": infor["limited_rp_snvs_exom"], - "limited_rp_snvs_nexom": infor["limited_rp_snvs_nexom"], - "strong_rp_snvs_exom": infor["strong_rp_snvs_exom"], - "strong_rp_snvs_nexom": infor["strong_rp_snvs_nexom"], + "number_variants_in_enriched_with_minimal_support": infor["minimal_rp_snvs_exom"], + "number_variants_outside_enriched_with_minimal_support": infor["minimal_rp_snvs_nexom"], + "number_variants_in_enriched_with_limited_support": infor["limited_rp_snvs_exom"], + "number_variants_outside_enriched_with_limited_support": infor["limited_rp_snvs_nexom"], + "number_variants_in_enriched_with_strong_support": infor["strong_rp_snvs_exom"], + "number_variants_outside_enriched_with_strong_support": infor["strong_rp_snvs_nexom"], classes[0]: infor["mt_classes"][0], classes[1]: infor["mt_classes"][1], classes[2]: infor["mt_classes"][2], @@ -245,9 +251,12 @@ def main(): if not args.ignore_regions: del summary["number_snvs_in_hard_regions"] json_object = json.dumps(summary) - - with open(args.output, "w") as outfile: - outfile.write(json_object) + if args.output: + with open(args.output, "w") as outfile: + outfile.write(json_object) + outfile.close() + else: + print(json_object) if __name__ == "__main__": diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py index 13295af50..72c686cbf 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py @@ -29,6 +29,7 @@ --minimal {snakemake.config[step_config][somatic_variant_checking][minimal_support_read]} \ --limited {snakemake.config[step_config][somatic_variant_checking][limited_support_read]} \ --ignore-regions {snakemake.config[step_config][somatic_variant_checking][ignore_regions]} \ + --AF {snakemake.config[step_config][somatic_variant_checking][AF_ID]} \ --output {snakemake.output.json} pushd $(dirname {snakemake.output.json}) From 956904f15c127b8913c6cf15495207c5bcfbf482 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Sun, 9 Jul 2023 00:09:44 +0200 Subject: [PATCH 08/17] finalize somatic variants checking --- .../workflows/somatic_variant_checking/__init__.py | 9 +++------ .../wrappers/somatic_variants_checking/summarize-vcf.py | 4 +++- .../wrappers/somatic_variants_checking/wrapper.py | 6 +++--- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py index 88254465f..369d52a28 100644 --- a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py @@ -76,8 +76,8 @@ def get_output_files(self, action): yield key + "_md5", prefix + ext + ".md5" @dictify - def _get_log_file(self, action): - assert action == "run" + def _get_log_file(self, action): + self._validate_action(action=action) prefix = ( "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/" "{mapper}.{var_caller}.variantsqc.{tumor_library}" @@ -98,10 +98,7 @@ def get_resource_usage(self, action): :return: Returns ResourceUsage for step. :raises UnsupportedActionException: if action not in class defined list of valid actions. """ - if action not in self.actions: - actions_str = ", ".join(self.actions) - error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" - raise UnsupportedActionException(error_message) + self._validate_action(action=action) mem_mb = 4 * 1024 # 4GB return ResourceUsage( threads=2, diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py index cc150a2d8..dc0bdaf5f 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py @@ -128,7 +128,9 @@ def process_vcf_file( infor["n_indels"] += 1 infor["indels_length"].append(abs(len(variant.REF) - len(variant.ALT[0]))) # Gathering information of variants in comparison to hard mapped regions - + else: + infor["v_outside_exom"] += 1 + if variant.CHROM in hard_contigs: if check_variant_in_bed( variant.CHROM, variant.start, variant.end, hard_file, padding diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py index 72c686cbf..856a254ea 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py @@ -40,8 +40,8 @@ # Compute MD5 sums of logs shell( r""" -md5sum {snakemake.log.log} > {snakemake.log.log_md5} -md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5} +md5sum $(basename {snakemake.log.log}) > {snakemake.log.log_md5} +md5sum $(basename {snakemake.log.conda_list}) > {snakemake.log.conda_list_md5} +md5sum $(basename {snakemake.log.conda_info}) > {snakemake.log.conda_info_md5} """ ) From 4ce11597c8f849a0938c17bb3a09239ffded14a2 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Sun, 9 Jul 2023 00:33:59 +0200 Subject: [PATCH 09/17] Fix typo --- .../workflows/somatic_variant_checking/__init__.py | 3 +-- .../workflows/tumor_mutational_burden/__init__.py | 12 ++++-------- .../somatic_variants_checking/summarize-vcf.py | 1 - 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py index 369d52a28..ba2f6e703 100644 --- a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py @@ -4,7 +4,6 @@ from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand -from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage @@ -76,7 +75,7 @@ def get_output_files(self, action): yield key + "_md5", prefix + ext + ".md5" @dictify - def _get_log_file(self, action): + def _get_log_file(self, action): self._validate_action(action=action) prefix = ( "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/" diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py index 5b0d893c8..6d094f12a 100644 --- a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py +++ b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py @@ -5,9 +5,8 @@ from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand -from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify -from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart,ResourceUsage +from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart, ResourceUsage from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from snappy_pipeline.workflows.somatic_variant_calling import ( SOMATIC_VARIANT_CALLERS_MATCHED, @@ -80,7 +79,7 @@ def get_output_files(self, action): @dictify def _get_log_file(self, action): - assert action == "run" + self._validate_action(action) prefix = ( "work/{mapper}.{var_caller}.tmb.{tumor_library}/log/" "{mapper}.{var_caller}.tmb.{tumor_library}" @@ -101,10 +100,7 @@ def get_resource_usage(self, action): :return: Returns ResourceUsage for step. :raises UnsupportedActionException: if action not in class defined list of valid actions. """ - if action not in self.actions: - actions_str = ", ".join(self.actions) - error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" - raise UnsupportedActionException(error_message) + self._validate_action(action) mem_mb = 4 * 1024 # 4GB return ResourceUsage( threads=2, @@ -134,7 +130,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (SomaticVariantCallingWorkflow, NgsMappingWorkflow), + (SomaticVariantCallingWorkflow, NgsMappingWorkflow), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((TumorMutationalBurdenCalculationStepPart, LinkOutStepPart)) diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py index dc0bdaf5f..484c621a9 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py @@ -130,7 +130,6 @@ def process_vcf_file( # Gathering information of variants in comparison to hard mapped regions else: infor["v_outside_exom"] += 1 - if variant.CHROM in hard_contigs: if check_variant_in_bed( variant.CHROM, variant.start, variant.end, hard_file, padding From 1f5b0e0b4f6ddcb43a6137d515661eb84f4c2991 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Sun, 9 Jul 2023 00:37:19 +0200 Subject: [PATCH 10/17] fixing typo --- .../workflows/tumor_mutational_burden/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py index 6d094f12a..0384b8cdb 100644 --- a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py +++ b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py @@ -6,7 +6,12 @@ from snakemake.io import expand from snappy_pipeline.utils import dictify, listify -from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart, ResourceUsage +from snappy_pipeline.workflows.abstract import ( + BaseStep, + BaseStepPart, + LinkOutStepPart, + ResourceUsage, +) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from snappy_pipeline.workflows.somatic_variant_calling import ( SOMATIC_VARIANT_CALLERS_MATCHED, From e99ef3c32df71299055dfd55f9ca42762b9fbd8d Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Thu, 13 Jul 2023 16:31:45 +0200 Subject: [PATCH 11/17] fixing small bugs --- .../workflows/somatic_variant_checking/__init__.py | 2 +- snappy_wrappers/wrappers/bcftools/TMB/wrapper.py | 10 +++++----- snappy_wrappers/wrappers/mantis/run/wrapper.py | 2 +- .../somatic_variants_checking/summarize-vcf.py | 8 ++++---- .../wrappers/somatic_variants_checking/wrapper.py | 10 ++++++---- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py index ba2f6e703..d682274af 100644 --- a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py @@ -27,7 +27,7 @@ tools_somatic_variant_calling: [] # default to those configured for somatic_variant_calling target_regions: # REQUIRED padding: 0 # Used for count the number of variants outside of exom + padding - AF_ID: 'AF' # REQUIRED ID of allele frequency field used in vcf file + variant_allele_frequency_id: 'AF' # REQUIRED ID of allele frequency field used in vcf file ignore_regions: "" # hard mapping regions minimal_support_read: 1 # threshold for defining a variant that has minimal support reads limited_support_read: 5 # threshold for defining a variant that has limited support reads diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py index 70213e6a7..04f48f398 100644 --- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py @@ -45,11 +45,11 @@ "VCF_md5": "$vcf_md5", "BED_file": "$bed_file_name", "BED_md5": "$bed_md5", - "TMB": "$TMB", - "Number_variants": "$number_variants", - "Number_snvs": "$number_snvs", - "Number_indels": "$number_indels", - "Total_regions_length": "$total_exom_length" + "TMB": $TMB, + "Number_variants": $number_variants, + "Number_snvs": $number_snvs, + "Number_indels": $number_indels, + "Total_regions_length": $total_exom_length }} EOF pushd $(dirname {snakemake.output.json}) diff --git a/snappy_wrappers/wrappers/mantis/run/wrapper.py b/snappy_wrappers/wrappers/mantis/run/wrapper.py index 82fa7b8b4..3f2bf4e74 100644 --- a/snappy_wrappers/wrappers/mantis/run/wrapper.py +++ b/snappy_wrappers/wrappers/mantis/run/wrapper.py @@ -33,7 +33,7 @@ # but should also work for reasonable deep WGS according to them. # https://github.com/OSU-SRLab/MANTIS/issues/25 -python /fast/groups/cubi/projects/biotools/Mantis/MANTIS/mantis.py \ +python /fast/groups/cubi/work/projects/biotools/Mantis/MANTIS/mantis.py \ -t {snakemake.input.tumor_bam} \ -n {snakemake.input.normal_bam} \ --genome {snakemake.config[static_data_config][reference][path]} \ diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py index 484c621a9..cc33c658f 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py @@ -43,7 +43,7 @@ def get_variant_type(ref, alt): def check_sp_read(variant, minimal, limited): - dp = variant.format("AD")[1][0] + dp = variant.format("AD")[1][1] if dp <= minimal: return "minimal" elif (dp > minimal) and (dp <= limited): @@ -159,7 +159,7 @@ def process_vcf_file( help="bed file contains repeated or difficult to map regions", ) parser.add_argument( - "--AF", + "--variant-allele-frequency-id", default="AF", help="ID of allele frequency in the vcf file", ) @@ -209,7 +209,7 @@ def main(): hard_intervals, filter_file=True, padding=args.padding, - id_af=args.AF, + id_af=args.variant_allele_frequency_id, ) else: infor = process_vcf_file( @@ -222,7 +222,7 @@ def main(): "", filter_file=True, padding=args.padding, - id_af=args.AF, + id_af=args.variant_allele_frequency_id, ) summary = { diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py index 856a254ea..6279dba4a 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py @@ -29,19 +29,21 @@ --minimal {snakemake.config[step_config][somatic_variant_checking][minimal_support_read]} \ --limited {snakemake.config[step_config][somatic_variant_checking][limited_support_read]} \ --ignore-regions {snakemake.config[step_config][somatic_variant_checking][ignore_regions]} \ - --AF {snakemake.config[step_config][somatic_variant_checking][AF_ID]} \ + --variant-allele-frequency-id {snakemake.config[step_config][somatic_variant_checking][variant_allele_frequency_id]} \ --output {snakemake.output.json} pushd $(dirname {snakemake.output.json}) md5sum $(basename {snakemake.output.json}) > $(basename {snakemake.output.json_md5}) +popd """ ) # Compute MD5 sums of logs shell( r""" -md5sum $(basename {snakemake.log.log}) > {snakemake.log.log_md5} -md5sum $(basename {snakemake.log.conda_list}) > {snakemake.log.conda_list_md5} -md5sum $(basename {snakemake.log.conda_info}) > {snakemake.log.conda_info_md5} +pushd $(dirname {snakemake.log.log}) +md5sum $(basename {snakemake.log.log}) > $(basename {snakemake.log.log_md5}) +md5sum $(basename {snakemake.log.conda_list}) > $(basename {snakemake.log.conda_list_md5}) +md5sum $(basename {snakemake.log.conda_info}) > $(basename {snakemake.log.conda_info_md5}) """ ) From 62b6e6f539765e5c7541763786360fe1628b1c77 Mon Sep 17 00:00:00 2001 From: ericblanc20 Date: Tue, 25 Jul 2023 14:51:37 +0200 Subject: [PATCH 12/17] fix: json format doesn't accept floats starting with . - [Unintuitive JSON parsing](https://nullprogram.com/blog/2019/12/28/) - [Dollar-parenthesis to be preferred to backticks for POSIX-compliance](https://stackoverflow.com/questions/9405478/command-substitution-backticks-or-dollar-sign-paren-enclosed) --- snappy_wrappers/wrappers/bcftools/TMB/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py index 04f48f398..7dc086f9e 100644 --- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py @@ -37,7 +37,7 @@ number_indels=$(bcftools view -R $bed_file -v indels --threads 2 -H {snakemake.input.vcf}| wc -l) number_variants=$(bcftools view -R $bed_file --threads 2 -H {snakemake.input.vcf}| wc -l) -TMB=`echo "1000000*($number_variants/$total_exom_length)" | bc -l ` +TMB=$(echo "1000000*($number_variants/$total_exom_length)" | bc -l | xargs printf "%.17g") cat << EOF > {snakemake.output.json} {{ "Library_name": "{snakemake.wildcards.tumor_library}", From 5d5246a62bc331cac2efc4ed88883aa6190f47b6 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Thu, 27 Jul 2023 14:22:55 +0200 Subject: [PATCH 13/17] Finalizing somatic variant checking --- .../wrappers/bcftools/TMB/wrapper.py | 2 +- .../summarize-vcf.py | 44 +++++++++++++------ .../somatic_variants_checking/wrapper.py | 1 + 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py index 04f48f398..7dc086f9e 100644 --- a/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/TMB/wrapper.py @@ -37,7 +37,7 @@ number_indels=$(bcftools view -R $bed_file -v indels --threads 2 -H {snakemake.input.vcf}| wc -l) number_variants=$(bcftools view -R $bed_file --threads 2 -H {snakemake.input.vcf}| wc -l) -TMB=`echo "1000000*($number_variants/$total_exom_length)" | bc -l ` +TMB=$(echo "1000000*($number_variants/$total_exom_length)" | bc -l | xargs printf "%.17g") cat << EOF > {snakemake.output.json} {{ "Library_name": "{snakemake.wildcards.tumor_library}", diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py index cc33c658f..7c57a6602 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py @@ -36,14 +36,17 @@ def get_contigs_from_bed_file(bedfile): def get_variant_type(ref, alt): - if (len(alt) == 1) and (len(ref) == len(alt)): - return "snp" - elif len(alt) != len(ref): - return "indels" + if len(ref) == len(alt): + if len(alt) == 1: + return "SNV" + else: + return "ONV" + elif len(alt) < len(ref): + return "indel" -def check_sp_read(variant, minimal, limited): - dp = variant.format("AD")[1][1] +def check_sp_read(variant, pos_sample, minimal, limited): + dp = variant.format("AD")[1][pos_sample] if dp <= minimal: return "minimal" elif (dp > minimal) and (dp <= limited): @@ -72,6 +75,7 @@ def assign_class_snvs(variant, mt_mat): def process_vcf_file( vcf_file, + pos_sample=1, contigs=[], hard_contigs=[], minimal=1, @@ -88,6 +92,7 @@ def process_vcf_file( "v_outside_exom": 0, "v_in_hard_regions": 0, "n_snps": 0, + "n_onvs": 0, "n_indels": 0, "minimal_rp_snvs_exom": 0, "minimal_rp_snvs_nexom": 0, @@ -114,19 +119,25 @@ def process_vcf_file( variant.CHROM, variant.start, variant.end, bed_file, padding ): infor["v_inside_exom"] += 1 - infor[str(check_sp_read(variant, minimal, limited)) + "_rp_snvs_exom"] += 1 + infor[ + str(check_sp_read(variant, pos_sample, minimal, limited)) + "_rp_snvs_exom" + ] += 1 else: infor["v_outside_exom"] += 1 - infor[str(check_sp_read(variant, minimal, limited)) + "_rp_snvs_nexom"] += 1 + infor[ + str(check_sp_read(variant, pos_sample, minimal, limited)) + "_rp_snvs_nexom" + ] += 1 # Need to check multi allelic. Users shouldn't input multi allelic vcf file. - if get_variant_type(variant.REF, variant.ALT[0]) == "snp": + if get_variant_type(variant.REF, variant.ALT[0]) == "SNV": infor["n_snps"] += 1 infor["mt_classes"] = assign_class_snvs(variant, infor["mt_classes"]) - else: + elif get_variant_type(variant.REF, variant.ALT[0]) == "indel": # More for indels infor["n_indels"] += 1 infor["indels_length"].append(abs(len(variant.REF) - len(variant.ALT[0]))) + elif get_variant_type(variant.REF, variant.ALT[0]) == "ONV": + infor["n_onvs"] += 1 # Gathering information of variants in comparison to hard mapped regions else: infor["v_outside_exom"] += 1 @@ -152,6 +163,10 @@ def process_vcf_file( parser.add_argument( "--filtered-vcf", help="vcf file containing somatic variants passing all filters", required=True ) +parser.add_argument( + "--sample", + help="Name of tumor sample", +) parser.add_argument("--exom-bedfile", help="exom regions bed file", required=True) parser.add_argument("--padding", type=int, help="Size of padding around exom regions", default=0) parser.add_argument( @@ -189,7 +204,7 @@ def main(): # reading input full_vcf = VCF(args.rawvcf) filter_vcf = VCF(args.filtered_vcf) - sample = filter_vcf.samples[1] + pos_sample = filter_vcf.samples.index(args.sample) ###################### bed_intervals = tabix.open(args.exom_bedfile) contigs = get_contigs_from_bed_file(args.exom_bedfile) @@ -201,6 +216,7 @@ def main(): hard_contigs = get_contigs_from_bed_file(path_hard_regions) infor = process_vcf_file( filter_vcf, + pos_sample, contigs, hard_contigs, args.minimal, @@ -214,6 +230,7 @@ def main(): else: infor = process_vcf_file( filter_vcf, + pos_sample, contigs, [], args.minimal, @@ -226,13 +243,14 @@ def main(): ) summary = { - "sample": sample, + "sample": args.sample, "total_variants_number": full_v_count, "number_variants_passing_filters": infor["total_v_count"], "number_variants_in_enrichment_regions": infor["v_inside_exom"], "number_variants_outside_enrichment_regions": infor["v_outside_exom"], "number_variants_in_masked_regions": infor["v_in_hard_regions"], "number_snvs": infor["n_snps"], + "number_onvs": infor["n_onvs"], "number_indels": infor["n_indels"], "number_variants_in_enriched_with_minimal_support": infor["minimal_rp_snvs_exom"], "number_variants_outside_enriched_with_minimal_support": infor["minimal_rp_snvs_nexom"], @@ -250,7 +268,7 @@ def main(): "VAF": infor["vaf"], } if not args.ignore_regions: - del summary["number_snvs_in_hard_regions"] + del summary["number_variants_in_masked_regions"] json_object = json.dumps(summary) if args.output: with open(args.output, "w") as outfile: diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py index 6279dba4a..909fe6ac1 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py @@ -24,6 +24,7 @@ python {base_dir}/summarize-vcf.py --rawvcf {snakemake.input.full_vcf} \ --filtered-vcf {snakemake.input.passed_vcf} \ + --sample {snakemake.wildcards.tumor_library} \ --exom-bedfile {snakemake.config[step_config][somatic_variant_checking][target_regions]} \ --padding {snakemake.config[step_config][somatic_variant_checking][padding]} \ --minimal {snakemake.config[step_config][somatic_variant_checking][minimal_support_read]} \ From 4750e75900eab7f066b5dba156138a57cc727253 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Thu, 27 Jul 2023 14:32:25 +0200 Subject: [PATCH 14/17] update mantis enviroment --- snappy_wrappers/wrappers/mantis/run/environment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snappy_wrappers/wrappers/mantis/run/environment.yaml b/snappy_wrappers/wrappers/mantis/run/environment.yaml index 8426cdf70..062fb89c3 100644 --- a/snappy_wrappers/wrappers/mantis/run/environment.yaml +++ b/snappy_wrappers/wrappers/mantis/run/environment.yaml @@ -3,5 +3,5 @@ channels: - anaconda - bioconda dependencies: -- numpy ==1.6.2 +- numpy ==1.7.2 - pysam ==0.8.3 From e9cb75b3cea118e761039cc394cb078b21b341a5 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Wed, 2 Aug 2023 12:02:57 +0200 Subject: [PATCH 15/17] finializing somatic_variant_checking --- .../summarize-vcf.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py index 7c57a6602..f3a257c51 100644 --- a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py +++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py @@ -36,13 +36,17 @@ def get_contigs_from_bed_file(bedfile): def get_variant_type(ref, alt): + ref = ref.replace("-", "") + alt = alt.replace("-", "") + variant_type = "unknown" if len(ref) == len(alt): - if len(alt) == 1: - return "SNV" + if len(ref) == 1: + variant_type = "snv" else: - return "ONV" - elif len(alt) < len(ref): - return "indel" + variant_type = "onv" + else: + variant_type = "indel" + return variant_type def check_sp_read(variant, pos_sample, minimal, limited): @@ -70,7 +74,6 @@ def assign_class_snvs(variant, mt_mat): mt_mat[4] += 1 elif temp in ["G>T", "C>A"]: mt_mat[5] += 1 - return mt_mat def process_vcf_file( @@ -129,14 +132,14 @@ def process_vcf_file( ] += 1 # Need to check multi allelic. Users shouldn't input multi allelic vcf file. - if get_variant_type(variant.REF, variant.ALT[0]) == "SNV": + if get_variant_type(variant.REF, variant.ALT[0]) == "snv": infor["n_snps"] += 1 - infor["mt_classes"] = assign_class_snvs(variant, infor["mt_classes"]) + assign_class_snvs(variant, infor["mt_classes"]) elif get_variant_type(variant.REF, variant.ALT[0]) == "indel": # More for indels infor["n_indels"] += 1 infor["indels_length"].append(abs(len(variant.REF) - len(variant.ALT[0]))) - elif get_variant_type(variant.REF, variant.ALT[0]) == "ONV": + elif get_variant_type(variant.REF, variant.ALT[0]) == "onv": infor["n_onvs"] += 1 # Gathering information of variants in comparison to hard mapped regions else: From e9dbc5e52111daba1a873e410d6b1af68e4bd174 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Thu, 24 Aug 2023 09:42:58 +0200 Subject: [PATCH 16/17] fixing vep environment --- snappy_wrappers/wrappers/vep/run/environment.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/snappy_wrappers/wrappers/vep/run/environment.yaml b/snappy_wrappers/wrappers/vep/run/environment.yaml index 48f171edf..eeb0710b4 100644 --- a/snappy_wrappers/wrappers/vep/run/environment.yaml +++ b/snappy_wrappers/wrappers/vep/run/environment.yaml @@ -1,8 +1,4 @@ channels: - - conda-forge - bioconda dependencies: - - python - ensembl-vep=102 - - htslib - From fdbdfff537b2d585d464219be4b7d8cf352f7ba1 Mon Sep 17 00:00:00 2001 From: giacuong171 Date: Mon, 20 Nov 2023 10:24:26 +0100 Subject: [PATCH 17/17] make isort happy --- snappy_pipeline/workflows/tumor_mutational_burden/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py index 6eb901d8d..e18d6cfd1 100644 --- a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py +++ b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py @@ -12,7 +12,6 @@ ANNOTATION_TOOLS, SomaticVariantAnnotationWorkflow, ) - from snappy_pipeline.workflows.somatic_variant_calling import ( SOMATIC_VARIANT_CALLERS_MATCHED, SomaticVariantCallingWorkflow,