diff --git a/snappy_pipeline/workflows/somatic_variant_checking/Snakefile b/snappy_pipeline/workflows/somatic_variant_checking/Snakefile new file mode 100644 index 000000000..b73514711 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_variant_checking/Snakefile @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- +"""CUBI Pipeline tumor mutational burden step Snakefile""" + +import os + +from snappy_pipeline import expand_ref +from snappy_pipeline.workflows.somatic_variant_checking import ( + SomaticVariantQCWorkflow, +) + +__author__ = "Gia Cuong Pham" +__email__ = "pham.gia-cuong@bih-charite.de" + + +# Configuration =============================================================== + + +configfile: "config.yaml" + + +# Expand "$ref" JSON pointers in configuration (also works for YAML) +config, lookup_paths, config_paths = expand_ref("config.yaml", config) + +# WorkflowImpl Object Setup =================================================== +wf = SomaticVariantQCWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) + + +localrules: + # Linking files from work/ to output/ should be done locally + somatic_variant_qc_link_out_run, + + +rule all: + input: + wf.get_result_files(), + + +# Generic linking out --------------------------------------------------------- + + +rule somatic_variant_qc_link_out_run: + input: + wf.get_input_files("link_out", "run"), + output: + wf.get_output_files("link_out", "run"), + run: + shell(wf.get_shell_cmd("link_out", "run", wildcards)) + + +rule somatic_variant_qc: + input: + **wf.get_input_files("SVQC_step", "run"), + output: + **wf.get_output_files("SVQC_step", "run"), + threads: wf.get_resource("SVQC_step", "run", "threads") + resources: + time=wf.get_resource("SVQC_step", "run", "time"), + memory=wf.get_resource("SVQC_step", "run", "memory"), + partition=wf.get_resource("SVQC_step", "run", "partition"), + tmpdir=wf.get_resource("SVQC_step", "run", "tmpdir"), + log: + **wf.get_log_file("SVQC_step", "run"), + wrapper: + wf.wrapper_path("somatic_variants_checking") diff --git a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py index aea5ebf23..d682274af 100644 --- a/snappy_pipeline/workflows/somatic_variant_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_checking/__init__.py @@ -1,56 +1,213 @@ -# -*- coding: utf-8 -*- -"""Implementation of the germline ``somatic_variant_checking`` step +import os +import sys -The ``somatic_variant_checking`` step takes as the input the results of the -``somatic_variant_annotation`` step. It then executes various tools computing statistics on the -result files and consistency checks with the pedigrees. +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background +from snakemake.io import expand -.. note:: +from snappy_pipeline.utils import dictify, listify +from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart +from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage +from snappy_pipeline.workflows.somatic_variant_calling import ( + SOMATIC_VARIANT_CALLERS_MATCHED, + SomaticVariantCallingWorkflow, +) - Status: not implemented yet +#: Extensions of files to create as main payload +EXT_VALUES = (".json", ".json.md5") -========== -Step Input -========== +#: Names of the files to create for the extension +EXT_NAMES = ("json", "json_md5") -The variant calling step uses Snakemake sub workflows for using the result of the -``somatic_variant_annotation`` step. - -=========== -Step Output -=========== - -.. note:: TODO - -==================== -Global Configuration -==================== - -.. note:: TODO - -===================== -Default Configuration -===================== - -The default configuration is as follows. - -.. include:: DEFAULT_CONFIG_somatic_variant_checking.rst - -======= -Reports -======= - -Currently, no reports are generated. -""" - -__author__ = "Manuel Holtgrewe " - -#: Available tools for checking variants -VARIANT_CHECKERS = ("bcftools_stats", "peddy") - -#: Default configuration for the somatic_gene_fusion_calling step +#: Default configuration for the tmb calculation step DEFAULT_CONFIG = r""" step_config: - somatic_variant_checking: - path_somatic_variant_calling: ../somatic_variant_calling # REQUIRED + somatic_variant_checking: + path_somatic_variant_calling: ../somatic_variant_calling # REQUIRED + tools_ngs_mapping: [] # default to those configured for ngs_mapping + tools_somatic_variant_calling: [] # default to those configured for somatic_variant_calling + target_regions: # REQUIRED + padding: 0 # Used for count the number of variants outside of exom + padding + variant_allele_frequency_id: 'AF' # REQUIRED ID of allele frequency field used in vcf file + ignore_regions: "" # hard mapping regions + minimal_support_read: 1 # threshold for defining a variant that has minimal support reads + limited_support_read: 5 # threshold for defining a variant that has limited support reads """ + + +class SomaticVariantQCStepPart(BaseStepPart): + """""" + + name = "SVQC_step" + + actions = ("run",) + + def __init__(self, parent): + super().__init__(parent) + + @dictify + def get_input_files(self, action): + self._validate_action(action) + tpl = ( + "output/{mapper}.{var_caller}.{tumor_library}/out/" + "{mapper}.{var_caller}.{tumor_library}" + ) + key_ext = { + "full_vcf": ".full.vcf.gz", + "full_vcf_tbi": ".full.vcf.gz.tbi", + "passed_vcf": ".vcf.gz", + "passed_vcf_tbi": ".vcf.gz.tbi", + } + variant_calling = self.parent.sub_workflows["somatic_variant_calling"] + for key, ext in key_ext.items(): + yield key, variant_calling(tpl + ext) + + @dictify + def get_output_files(self, action): + # Validate action + self._validate_action(action) + prefix = ( + "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/out/" + "{mapper}.{var_caller}.variantsqc.{tumor_library}" + ) + key_ext = {"json": ".json"} + for key, ext in key_ext.items(): + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + @dictify + def _get_log_file(self, action): + self._validate_action(action=action) + prefix = ( + "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/" + "{mapper}.{var_caller}.variantsqc.{tumor_library}" + ) + + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + + def get_resource_usage(self, action): + """Get Resource Usage + :param action: Action (i.e., step) in the workflow, example: 'run'. + :type action: str + :return: Returns ResourceUsage for step. + :raises UnsupportedActionException: if action not in class defined list of valid actions. + """ + self._validate_action(action=action) + mem_mb = 4 * 1024 # 4GB + return ResourceUsage( + threads=2, + time="1:00:00", # 1 hour + memory=f"{mem_mb}M", + ) + + +class SomaticVariantQCWorkflow(BaseStep): + """Perform gathering variant information""" + + name = "somaticvariantchecking" + sheet_shortcut_class = CancerCaseSheet + sheet_shortcut_kwargs = { + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) + } + + @classmethod + def default_config_yaml(cls): + """Return default config YAML, to be overwritten by project-specific one.""" + return DEFAULT_CONFIG + + def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + (SomaticVariantCallingWorkflow, NgsMappingWorkflow), + ) + # Register sub step classes so the sub steps are available + self.register_sub_step_classes((SomaticVariantQCStepPart, LinkOutStepPart)) + # Register sub workflows + self.register_sub_workflow( + "somatic_variant_calling", + self.w_config["step_config"]["somatic_variant_checking"][ + "path_somatic_variant_calling" + ], + ) + # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here + if not self.w_config["step_config"]["somatic_variant_checking"]["tools_ngs_mapping"]: + self.w_config["step_config"]["somatic_variant_checking"][ + "tools_ngs_mapping" + ] = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] + if not self.w_config["step_config"]["somatic_variant_checking"][ + "tools_somatic_variant_calling" + ]: + self.w_config["step_config"]["somatic_variant_checking"][ + "tools_somatic_variant_calling" + ] = self.w_config["step_config"]["somatic_variant_calling"]["tools"] + + @listify + def get_result_files(self): + callers = set( + self.w_config["step_config"]["somatic_variant_checking"][ + "tools_somatic_variant_calling" + ] + ) + name_pattern = "{mapper}.{caller}.variantsqc.{tumor_library.name}" + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["somatic_variant_checking"]["tools_ngs_mapping"], + caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + ext=EXT_VALUES, + ) + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["somatic_variant_checking"]["tools_ngs_mapping"], + caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + ext=( + ".log", + ".log.md5", + ".conda_info.txt", + ".conda_info.txt.md5", + ".conda_list.txt", + ".conda_list.txt.md5", + ), + ) + + def _yield_result_files_matched(self, tpl, **kwargs): + """Build output paths from path template and extension list. + + This function returns the results from the matched somatic variant callers such as + Mutect. + """ + for sheet in filter(is_not_background, self.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if not sample_pair.tumor_sample.dna_ngs_library: + msg = ( + "INFO: sample pair for cancer bio sample {} has is missing primary" + "normal or primary cancer NGS library" + ) + print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) + continue + yield from expand( + tpl, + tumor_library=[sample_pair.tumor_sample.dna_ngs_library], + **kwargs, + ) + + def check_config(self): + """Check that the path to the NGS mapping is present""" + self.ensure_w_config( + ("step_config", "somatic_variant_checking", "path_somatic_variant_calling"), + "Path to variant calling not configured but required for somatic variant qc", + ) + + self.ensure_w_config( + ("step_config", "somatic_variant_checking", "target_regions"), + "Path to target_regions file (bed format)" + "not configured but required for somatic variant qc", + ) diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile b/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile index 4ce18d68d..c748a6813 100644 --- a/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile +++ b/snappy_pipeline/workflows/tumor_mutational_burden/Snakefile @@ -8,7 +8,8 @@ from snappy_pipeline.workflows.tumor_mutational_burden import ( TumorMutationalBurdenCalculationWorkflow, ) -__author__ = "" +__author__ = "Gia Cuong Pham" +__email__ = "pham.gia-cuong@bih-charite.de" # Configuration =============================================================== diff --git a/snappy_wrappers/wrappers/mantis/run/environment.yaml b/snappy_wrappers/wrappers/mantis/run/environment.yaml index 8426cdf70..062fb89c3 100644 --- a/snappy_wrappers/wrappers/mantis/run/environment.yaml +++ b/snappy_wrappers/wrappers/mantis/run/environment.yaml @@ -3,5 +3,5 @@ channels: - anaconda - bioconda dependencies: -- numpy ==1.6.2 +- numpy ==1.7.2 - pysam ==0.8.3 diff --git a/snappy_wrappers/wrappers/mantis/run/wrapper.py b/snappy_wrappers/wrappers/mantis/run/wrapper.py index 82fa7b8b4..3f2bf4e74 100644 --- a/snappy_wrappers/wrappers/mantis/run/wrapper.py +++ b/snappy_wrappers/wrappers/mantis/run/wrapper.py @@ -33,7 +33,7 @@ # but should also work for reasonable deep WGS according to them. # https://github.com/OSU-SRLab/MANTIS/issues/25 -python /fast/groups/cubi/projects/biotools/Mantis/MANTIS/mantis.py \ +python /fast/groups/cubi/work/projects/biotools/Mantis/MANTIS/mantis.py \ -t {snakemake.input.tumor_bam} \ -n {snakemake.input.normal_bam} \ --genome {snakemake.config[static_data_config][reference][path]} \ diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml b/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml new file mode 100644 index 000000000..80b4fa59b --- /dev/null +++ b/snappy_wrappers/wrappers/somatic_variants_checking/environment.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - cyvcf2 + - pytabix diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py new file mode 100644 index 000000000..f3a257c51 --- /dev/null +++ b/snappy_wrappers/wrappers/somatic_variants_checking/summarize-vcf.py @@ -0,0 +1,285 @@ +# import packages +import argparse +import gzip +import json # for writing out the json file + +from cyvcf2 import VCF # for reading vcf file +import tabix + + +def check_variant_in_bed(v_chrom, v_start, v_end, bed_intervals, padding=0): + try: + tmp = [] + records = bed_intervals.query(v_chrom, v_start - int(padding), v_end + int(padding)) + for record in records: + tmp.append(record) + if len(tmp) >= 1: + return True + else: + return False + except OSError: + print("You should index the bed file with tabix first!") + + +def get_contigs_from_bed_file(bedfile): + contigs = [] + try: + with gzip.open(bedfile, "r") as intervals: + for interval in intervals: + line = interval.decode() + tmp = line.split("\t")[0] + if tmp not in contigs: + contigs.append(tmp) + return contigs + except OSError: + print("Error reading the GZIP file.") + + +def get_variant_type(ref, alt): + ref = ref.replace("-", "") + alt = alt.replace("-", "") + variant_type = "unknown" + if len(ref) == len(alt): + if len(ref) == 1: + variant_type = "snv" + else: + variant_type = "onv" + else: + variant_type = "indel" + return variant_type + + +def check_sp_read(variant, pos_sample, minimal, limited): + dp = variant.format("AD")[1][pos_sample] + if dp <= minimal: + return "minimal" + elif (dp > minimal) and (dp <= limited): + return "limited" + elif dp >= limited: + return "strong" + + +def assign_class_snvs(variant, mt_mat): + temp = str(variant.REF) + ">" + str(variant.ALT[0]) + # Following result from plot-vcfstats with bcftools + if temp in ["A>C", "T>G"]: + mt_mat[0] += 1 + elif temp in ["A>G", "T>C"]: + mt_mat[1] += 1 + elif temp in ["A>T", "T>A"]: + mt_mat[2] += 1 + elif temp in ["C>G", "G>C"]: + mt_mat[3] += 1 + elif temp in ["C>T", "G>A"]: + mt_mat[4] += 1 + elif temp in ["G>T", "C>A"]: + mt_mat[5] += 1 + + +def process_vcf_file( + vcf_file, + pos_sample=1, + contigs=[], + hard_contigs=[], + minimal=1, + limited=5, + bed_file="", + hard_file="", + filter_file=False, + padding=0, + id_af="AF", +): + infor = { + "total_v_count": 0, + "v_inside_exom": 0, + "v_outside_exom": 0, + "v_in_hard_regions": 0, + "n_snps": 0, + "n_onvs": 0, + "n_indels": 0, + "minimal_rp_snvs_exom": 0, + "minimal_rp_snvs_nexom": 0, + "limited_rp_snvs_exom": 0, + "limited_rp_snvs_nexom": 0, + "strong_rp_snvs_exom": 0, + "strong_rp_snvs_nexom": 0, + "indels_length": [], + "mt_classes": [0] * 6, + "vaf": [], + } + if filter_file: + for variant in vcf_file: + infor["total_v_count"] += 1 + # Gathering information of variants in comparison to interested regions + if variant.CHROM in contigs: + if len(variant.ALT) > 1: + raise ValueError( + "Multiallelic variants are not supported" + ) # checking for multiallelic + infor["vaf"].append(float(variant.format(id_af)[1][0])) + # Counting variants in or out of exom and variants with different support reads + if check_variant_in_bed( + variant.CHROM, variant.start, variant.end, bed_file, padding + ): + infor["v_inside_exom"] += 1 + infor[ + str(check_sp_read(variant, pos_sample, minimal, limited)) + "_rp_snvs_exom" + ] += 1 + else: + infor["v_outside_exom"] += 1 + infor[ + str(check_sp_read(variant, pos_sample, minimal, limited)) + "_rp_snvs_nexom" + ] += 1 + + # Need to check multi allelic. Users shouldn't input multi allelic vcf file. + if get_variant_type(variant.REF, variant.ALT[0]) == "snv": + infor["n_snps"] += 1 + assign_class_snvs(variant, infor["mt_classes"]) + elif get_variant_type(variant.REF, variant.ALT[0]) == "indel": + # More for indels + infor["n_indels"] += 1 + infor["indels_length"].append(abs(len(variant.REF) - len(variant.ALT[0]))) + elif get_variant_type(variant.REF, variant.ALT[0]) == "onv": + infor["n_onvs"] += 1 + # Gathering information of variants in comparison to hard mapped regions + else: + infor["v_outside_exom"] += 1 + if variant.CHROM in hard_contigs: + if check_variant_in_bed( + variant.CHROM, variant.start, variant.end, hard_file, padding + ): + infor["v_in_hard_regions"] += 1 + return infor + else: + total_v_count = 0 + for variant in vcf_file: + total_v_count += 1 + return total_v_count + + +parser = argparse.ArgumentParser(description="Gathering information of variants") +parser.add_argument( + "--rawvcf", + help="vcf file containing all called somatic variants, before filtration", + required=True, +) +parser.add_argument( + "--filtered-vcf", help="vcf file containing somatic variants passing all filters", required=True +) +parser.add_argument( + "--sample", + help="Name of tumor sample", +) +parser.add_argument("--exom-bedfile", help="exom regions bed file", required=True) +parser.add_argument("--padding", type=int, help="Size of padding around exom regions", default=0) +parser.add_argument( + "--ignore-regions", + help="bed file contains repeated or difficult to map regions", +) +parser.add_argument( + "--variant-allele-frequency-id", + default="AF", + help="ID of allele frequency in the vcf file", +) +parser.add_argument( + "-m", + "--minimal", + type=int, + default=1, + help="threshold for defining a variant that has minimal support reads", +) +parser.add_argument( + "-l", + "--limited", + type=int, + default=5, + help="threshold for defining a variant that has limited support reads", +) +parser.add_argument( + "--output", + help="json file contains information about vcf file, if not given, the tool will print it out", +) +args = parser.parse_args() + + +def main(): + classes = ["T>G", "T>C", "T>A", "C>G", "C>T", "C>A"] + # reading input + full_vcf = VCF(args.rawvcf) + filter_vcf = VCF(args.filtered_vcf) + pos_sample = filter_vcf.samples.index(args.sample) + ###################### + bed_intervals = tabix.open(args.exom_bedfile) + contigs = get_contigs_from_bed_file(args.exom_bedfile) + full_v_count = process_vcf_file(full_vcf) + # read bed file + if args.ignore_regions: + path_hard_regions = args.ignore_regions + hard_intervals = tabix.open(path_hard_regions) + hard_contigs = get_contigs_from_bed_file(path_hard_regions) + infor = process_vcf_file( + filter_vcf, + pos_sample, + contigs, + hard_contigs, + args.minimal, + args.limited, + bed_intervals, + hard_intervals, + filter_file=True, + padding=args.padding, + id_af=args.variant_allele_frequency_id, + ) + else: + infor = process_vcf_file( + filter_vcf, + pos_sample, + contigs, + [], + args.minimal, + args.limited, + bed_intervals, + "", + filter_file=True, + padding=args.padding, + id_af=args.variant_allele_frequency_id, + ) + + summary = { + "sample": args.sample, + "total_variants_number": full_v_count, + "number_variants_passing_filters": infor["total_v_count"], + "number_variants_in_enrichment_regions": infor["v_inside_exom"], + "number_variants_outside_enrichment_regions": infor["v_outside_exom"], + "number_variants_in_masked_regions": infor["v_in_hard_regions"], + "number_snvs": infor["n_snps"], + "number_onvs": infor["n_onvs"], + "number_indels": infor["n_indels"], + "number_variants_in_enriched_with_minimal_support": infor["minimal_rp_snvs_exom"], + "number_variants_outside_enriched_with_minimal_support": infor["minimal_rp_snvs_nexom"], + "number_variants_in_enriched_with_limited_support": infor["limited_rp_snvs_exom"], + "number_variants_outside_enriched_with_limited_support": infor["limited_rp_snvs_nexom"], + "number_variants_in_enriched_with_strong_support": infor["strong_rp_snvs_exom"], + "number_variants_outside_enriched_with_strong_support": infor["strong_rp_snvs_nexom"], + classes[0]: infor["mt_classes"][0], + classes[1]: infor["mt_classes"][1], + classes[2]: infor["mt_classes"][2], + classes[3]: infor["mt_classes"][3], + classes[4]: infor["mt_classes"][4], + classes[5]: infor["mt_classes"][5], + "indels_length": infor["indels_length"], + "VAF": infor["vaf"], + } + if not args.ignore_regions: + del summary["number_variants_in_masked_regions"] + json_object = json.dumps(summary) + if args.output: + with open(args.output, "w") as outfile: + outfile.write(json_object) + outfile.close() + else: + print(json_object) + + +if __name__ == "__main__": + main() diff --git a/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py new file mode 100644 index 000000000..909fe6ac1 --- /dev/null +++ b/snappy_wrappers/wrappers/somatic_variants_checking/wrapper.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +"""Wrapper for summarize information of vcf file after +somatic variant calling step +""" +import os + +from snakemake.shell import shell + +__author__ = "Pham Gia Cuong" +__email__ = "pham.gia-cuong@bih-charite.de" + +base_dir = os.path.dirname(os.path.realpath(__file__)) +shell( + r""" +# ----------------------------------------------------------------------------- +# Redirect stderr to log file by default and enable printing executed commands +exec 2> >(tee -a "{snakemake.log.log}") +set -x +# ----------------------------------------------------------------------------- + +# Write out information about conda installation +conda list > {snakemake.log.conda_list} +conda info > {snakemake.log.conda_info} + +python {base_dir}/summarize-vcf.py --rawvcf {snakemake.input.full_vcf} \ + --filtered-vcf {snakemake.input.passed_vcf} \ + --sample {snakemake.wildcards.tumor_library} \ + --exom-bedfile {snakemake.config[step_config][somatic_variant_checking][target_regions]} \ + --padding {snakemake.config[step_config][somatic_variant_checking][padding]} \ + --minimal {snakemake.config[step_config][somatic_variant_checking][minimal_support_read]} \ + --limited {snakemake.config[step_config][somatic_variant_checking][limited_support_read]} \ + --ignore-regions {snakemake.config[step_config][somatic_variant_checking][ignore_regions]} \ + --variant-allele-frequency-id {snakemake.config[step_config][somatic_variant_checking][variant_allele_frequency_id]} \ + --output {snakemake.output.json} + +pushd $(dirname {snakemake.output.json}) +md5sum $(basename {snakemake.output.json}) > $(basename {snakemake.output.json_md5}) +popd + """ +) + +# Compute MD5 sums of logs +shell( + r""" +pushd $(dirname {snakemake.log.log}) +md5sum $(basename {snakemake.log.log}) > $(basename {snakemake.log.log_md5}) +md5sum $(basename {snakemake.log.conda_list}) > $(basename {snakemake.log.conda_list_md5}) +md5sum $(basename {snakemake.log.conda_info}) > $(basename {snakemake.log.conda_info_md5}) +""" +) diff --git a/snappy_wrappers/wrappers/vep/run/environment.yaml b/snappy_wrappers/wrappers/vep/run/environment.yaml index 711e58eec..eeb0710b4 100644 --- a/snappy_wrappers/wrappers/vep/run/environment.yaml +++ b/snappy_wrappers/wrappers/vep/run/environment.yaml @@ -2,4 +2,3 @@ channels: - bioconda dependencies: - ensembl-vep=102 - diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py new file mode 100644 index 000000000..2882fbd2b --- /dev/null +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_checking.py @@ -0,0 +1,186 @@ +# -*- coding: utf-8 -*- +"""Tests for the somatic_variant_calling workflow module code""" + +import textwrap + +import pytest +import ruamel.yaml as ruamel_yaml + +from snappy_pipeline.workflows.somatic_variant_checking import SomaticVariantQCWorkflow + +from .common import get_expected_log_files_dict, get_expected_output_json_files_dict +from .conftest import patch_module_fs + + +@pytest.fixture(scope="module") # otherwise: performance issues +def minimal_config(): + """Return YAML parsing result for configuration""" + yaml = ruamel_yaml.YAML() + return yaml.load( + textwrap.dedent( + r""" + static_data_config: + reference: + path: /path/to/ref.fa + cosmic: + path: /path/to/cosmic.vcf.gz + dbsnp: + path: /path/to/dbsnp.vcf.gz + + step_config: + ngs_mapping: + tools: + dna: ['bwa'] + compute_coverage_bed: true + path_target_regions: /path/to/regions.bed + bwa: + path_index: /path/to/bwa/index.fa + + somatic_variant_calling: + tools: + - mutect2 + - scalpel + scalpel: + path_target_regions: /path/to/target/regions.bed + + somatic_variant_checking: + path_somatic_variant_calling: ../somatic_variant_calling + tools_ngs_mapping: [] + tools_somatic_variant_calling: [] + target_regions: /path/to/regions.bed + + data_sets: + first_batch: + file: sheet.tsv + search_patterns: + - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} + search_paths: ['/path'] + type: matched_cancer + naming_scheme: only_secondary_id + """ + ).lstrip() + ) + + +@pytest.fixture +def somatic_variant_checking_workflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + work_dir, + config_paths, + cancer_sheet_fake_fs, + mocker, +): + """Return SomaticVariantCheckingWorkflow object pre-configured with cancer sheet""" + # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we + # can obtain paths from the function as if we really had a NGSMappingPipelineStep there + dummy_workflow.globals = { + "ngs_mapping": lambda x: "NGS_MAPPING/" + x, + "somatic_variant_calling": lambda x: "SOMATIC_VARIANT_CALLING/" + x, + } + # Construct the workflow object + return SomaticVariantQCWorkflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + config_paths, + work_dir, + ) + + +# Tests for SomaticVariantCheckingStepPart ----------------------------------------------------- + + +def test_somatic_variant_checking_step_part_get_input_files(somatic_variant_checking_workflow): + """Test SomaticVariantCheckingStepPart.get_input_files()""" + base_out = ( + "SOMATIC_VARIANT_CALLING/output/{mapper}.{var_caller}.{tumor_library}/out/" + "{mapper}.{var_caller}.{tumor_library}" + ) + expected = { + "full_vcf": base_out + ".full.vcf.gz", + "full_vcf_tbi": base_out + ".full.vcf.gz.tbi", + "passed_vcf": base_out + ".vcf.gz", + "passed_vcf_tbi": base_out + ".vcf.gz.tbi", + } + actual = somatic_variant_checking_workflow.get_input_files("SVQC_step", "run") + assert actual == expected + + +def test_somatic_variant_checking_step_part_get_output_files(somatic_variant_checking_workflow): + """Tests SomaticVariantCheckingStepPart.get_output_files()""" + base_out = ( + "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/out/" + "{mapper}.{var_caller}.variantsqc.{tumor_library}" + ) + expected = get_expected_output_json_files_dict(base_out=base_out) + actual = somatic_variant_checking_workflow.get_output_files("SVQC_step", "run") + assert actual == expected + + +def test_somatic_variant_checking_step_part_get_log_files(somatic_variant_checking_workflow): + """Tests SomaticVariantCheckingStepPart.get_log_files()""" + base_out = ( + "work/{mapper}.{var_caller}.variantsqc.{tumor_library}/log/" + "{mapper}.{var_caller}.variantsqc.{tumor_library}" + ) + expected = get_expected_log_files_dict(base_out=base_out) + actual = somatic_variant_checking_workflow.get_log_file("SVQC_step", "run") + assert actual == expected + + +def test_somatic_variant_checking_step_part_get_resource_usage( + somatic_variant_checking_workflow, +): + """Tests SomaticVariantCheckingStepPart.get_resource_usage()""" + # Define expected + expected_dict = {"threads": 2, "time": "1:00:00", "memory": "4096M"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_variant_checking_workflow.get_resource("SVQC_step", "run", resource) + assert actual == expected, msg_error + + +# Tests for SomaticVariantCheckingWorkflow ------------------------------------------------------- + + +def test_somatic_variant_checking_workflow(somatic_variant_checking_workflow): + """Test simple functionality of the workflow""" + # Check created sub steps + expected = ["SVQC_step", "link_out"] + actual = list(sorted(somatic_variant_checking_workflow.sub_steps.keys())) + assert actual == expected + + # Check result file construction + tpl = ( + "output/{mapper}.{var_caller}.variantsqc.P00{i}-T{t}-DNA1-WGS1/{dir_}/" + "{mapper}.{var_caller}.variantsqc.P00{i}-T{t}-DNA1-WGS1.{ext}" + ) + expected = [ + tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext, dir_="out") + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ("json", "json.md5") + for mapper in ("bwa",) + for var_caller in ("mutect2", "scalpel") + ] + expected += [ + tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext, dir_="log") + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ( + "conda_info.txt", + "conda_list.txt", + "log", + "conda_info.txt.md5", + "conda_list.txt.md5", + "log.md5", + ) + for mapper in ("bwa",) + for var_caller in ("mutect2", "scalpel") + ] + expected = list(sorted(expected)) + actual = list(sorted(somatic_variant_checking_workflow.get_result_files())) + assert expected == actual